Ray Tune is throwing the error "module 'pickle' has no attribute 'PickleBuffer'" when attempting a hyperparameter search


I am more or less following this example to integrate the Ray Tune hyperparameter search library with the Hugging Face Transformers library, using my own dataset.

Here is my script:

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.examples.pbt_transformers.utils import download_data, \
    build_compute_metrics_fn
from ray.tune.schedulers import PopulationBasedTraining
from transformers import glue_tasks_num_labels, AutoConfig, \
    AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from transformers import ElectraForSequenceClassification

# The tokenizer has to exist before get_model is called, since the model's
# embedding matrix is resized to match its vocabulary.
tokenizer = AutoTokenizer.from_pretrained('google/electra-small-discriminator',
                                          additional_special_tokens=['[CHARACTER]'])

def get_model():
    model = ElectraForSequenceClassification.from_pretrained(
        'google/electra-small-discriminator', num_labels=2)
    model.resize_token_embeddings(len(tokenizer))
    return model

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    "electra_hp_tune",
    report_to = "wandb",
    learning_rate=2e-5,  # config
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    num_train_epochs=2,  # config
    per_device_train_batch_size=16,  # config
    per_device_eval_batch_size=16,  # config
    warmup_steps=0,
    weight_decay=0.1,  # config
    logging_dir="./logs",
)

trainer = Trainer(
    model_init=get_model,
    args=training_args,
    # chunked_encoded_dataset is my own pre-tokenized dataset
    # (a DatasetDict with 'train' and 'validation' splits)
    train_dataset=chunked_encoded_dataset['train'],
    eval_dataset=chunked_encoded_dataset['validation'],
    compute_metrics=compute_metrics
)

tune_config = {
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "num_train_epochs": tune.choice([2, 3, 4, 5])
}

scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="eval_acc",
    mode="max",
    perturbation_interval=1,
    hyperparam_mutations={
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 2.5e-5),
        "per_device_train_batch_size": [16, 32, 64],
    })

reporter = CLIReporter(
    parameter_columns={
        "weight_decay": "w_decay",
        "learning_rate": "lr",
        "per_device_train_batch_size": "train_bs/gpu",
        "num_train_epochs": "num_epochs"
    },
    metric_columns=[
        "eval_f1", "eval_loss", "epoch", "training_iteration"
    ])

from ray.tune.integration.wandb import WandbLogger
trainer.hyperparameter_search(
    hp_space=lambda _: tune_config,
    backend="ray",
    n_trials=10,
    scheduler=scheduler,
    keep_checkpoints_num=1,
    checkpoint_score_attr="training_iteration",
    progress_reporter=reporter,
    name="tune_transformer_gr")

The error is raised by the last call, trainer.hyperparameter_search. The error message is:

AttributeError: module 'pickle' has no attribute 'PickleBuffer'

And here is the full stack trace:


AttributeError                            Traceback (most recent call last)

<ipython-input> in <module>()
      8     checkpoint_score_attr="training_iteration",
      9     progress_reporter=reporter,
---> 10     name="tune_transformer_gr")

14 frames

/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in hyperparameter_search(self, hp_space, compute_objective, n_trials, direction, backend, hp_name, **kwargs)
   1666
   1667         run_hp_search = run_hp_search_optuna if backend == HPSearchBackend.OPTUNA else run_hp_search_ray
-> 1668         best_run = run_hp_search(self, n_trials, direction, **kwargs)
   1669
   1670         self.hp_search_backend = None

/usr/local/lib/python3.7/dist-packages/transformers/integrations.py in run_hp_search_ray(trainer, n_trials, direction, **kwargs)
    231
    232     analysis = ray.tune.run(
--> 233         ray.tune.with_parameters(_objective, local_trainer=trainer),
    234         config=trainer.hp_space(None),
    235         num_samples=n_trials,

/usr/local/lib/python3.7/dist-packages/ray/tune/utils/trainable.py in with_parameters(trainable, **kwargs)
    294     prefix = f"{str(trainable)}_"
    295     for k, v in kwargs.items():
--> 296         parameter_registry.put(prefix + k, v)
    297
    298     trainable_name = getattr(trainable, "__name__", "tune_with_parameters")

/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in put(self, k, v)
    160         self.to_flush[k] = v
    161         if ray.is_initialized():
--> 162             self.flush()
    163
    164     def get(self, k):

/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in flush(self)
    169     def flush(self):
    170         for k, v in self.to_flush.items():
--> 171             self.references[k] = ray.put(v)
    172         self.to_flush.clear()
    173

/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py in wrapper(*args, **kwargs)
     45         if client_mode_should_convert():
     46             return getattr(ray, func.__name__)(*args, **kwargs)
---> 47         return func(*args, **kwargs)
     48
     49     return wrapper

/usr/local/lib/python3.7/dist-packages/ray/worker.py in put(value)
   1512     with profiling.profile("ray.put"):
   1513         try:
-> 1514             object_ref = worker.put_object(value)
   1515         except ObjectStoreFullError:
   1516             logger.info(

/usr/local/lib/python3.7/dist-packages/ray/worker.py in put_object(self, value, object_ref)
    259                 "inserting with an ObjectRef")
    260
--> 261         serialized_value = self.get_serialization_context().serialize(value)
    262         # This must be the first place that we construct this python
    263         # ObjectRef because an entry with 0 local references is created when

/usr/local/lib/python3.7/dist-packages/ray/serialization.py in serialize(self, value)
    322             return RawSerializedObject(value)
    323         else:
--> 324             return self._serialize_to_msgpack(value)

/usr/local/lib/python3.7/dist-packages/ray/serialization.py in _serialize_to_msgpack(self, value)
    302             metadata = ray_constants.OBJECT_METADATA_TYPE_PYTHON
    303             pickle5_serialized_object = \
--> 304                 self._serialize_to_pickle5(metadata, python_objects)
    305         else:
    306             pickle5_serialized_object = None

/usr/local/lib/python3.7/dist-packages/ray/serialization.py in _serialize_to_pickle5(self, metadata, value)
    262         except Exception as e:
    263             self.get_and_clear_contained_object_refs()
--> 264             raise e
    265         finally:
    266             self.set_out_of_band_serialization()

/usr/local/lib/python3.7/dist-packages/ray/serialization.py in _serialize_to_pickle5(self, metadata, value)
    259             self.set_in_band_serialization()
    260             inband = pickle.dumps(
--> 261                 value, protocol=5, buffer_callback=writer.buffer_callback)
    262         except Exception as e:
    263             self.get_and_clear_contained_object_refs()

/usr/local/lib/python3.7/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
     71         file, protocol=protocol, buffer_callback=buffer_callback
     72     )
---> 73     cp.dump(obj)
     74     return file.getvalue()
     75

/usr/local/lib/python3.7/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dump(self, obj)
    578     def dump(self, obj):
    579         try:
--> 580             return Pickler.dump(self, obj)
    581         except RuntimeError as e:
    582             if "recursion" in e.args[0]:

/usr/local/lib/python3.7/dist-packages/pyarrow/io.pxi in pyarrow.lib.Buffer.__reduce_ex__()

AttributeError: module 'pickle' has no attribute 'PickleBuffer'

My environment setup:

  • Google Colab
  • Platform: Linux-5.4.109+-x86_64-with-Ubuntu-18.04-bionic
  • Python version: 3.7.10
  • Transformers version: 4.6.1
  • Ray version: 1.3.0

What I have tried:

  • Updating pickle
  • Installing and importing pickle5 as pickle
  • Making sure that I did not have a Python file named 'pickle' in my working directory

Where is this bug coming from and how can I resolve it?

There are 3 answers

Answered by Bram Vanroy

Not a "real" solution but at least a workaround. For me this issue was occurring on Python 3.7. Switching to Python 3.8 solved the issue.

Answered by OlegK

I also encountered this error on Google Colab while trying a Ray Tune hyperparameter search with Hugging Face Transformers.

This helped me:

!pip install pickle5

Then

import pickle5 as pickle

After the first run you will get the pickle warning telling you to restart the notebook, together with the same error. After a second "Restart and run all", the Ray Tune hyperparameter search begins.
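A quick way to confirm that the backport is the module actually in use after the restart (a minimal sketch; it assumes only the pickle5 package installed above):

import pickle5 as pickle

# The pickle5 backport provides PickleBuffer on Python versions before 3.8,
# which is the attribute Ray's protocol-5 serialization path needs.
print(hasattr(pickle, "PickleBuffer"))  # should print True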

Answered by Samuel Håkansson

I had the same error when trying to use pickle.dump(); for me, downgrading pickle5 from version 0.0.11 to 0.0.10 fixed it.
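For reference, pinning the version looks like this (a sketch, assuming pip; prefix with "!" in a Colab cell as in the answer above):

pip install pickle5==0.0.10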