PyTorch and Transformers training fails on CUDA devices

So I have a mix of PyTorch and Transformers code that loads my custom dataset, processes it, downloads a TinyLlama model, and then fine-tunes that model on the processed dataset.

It works fine on CPU, but when I set the device to "cuda" it fails with the following:

Training started...
  0%|                                                                                                 | 0/10 [00:00<?, ?it/s]Traceback (most recent call last):
  File "/storage/Code/DataAssistant/main.py", line 368, in <module>
    main()
  File "/storage/Code/DataAssistant/main.py", line 364, in main
    train()
  File "/storage/Code/DataAssistant/main.py", line 252, in train
    trainer.train()
  File "/storage/Programs/PythonVenvs/transformers-rocm/lib/python3.11/site-packages/transformers/trainer.py", line 1555, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/storage/Programs/PythonVenvs/transformers-rocm/lib/python3.11/site-packages/transformers/trainer.py", line 1838, in _inner_training_loop
    for step, inputs in enumerate(epoch_iterator):
  File "/storage/Programs/PythonVenvs/transformers-rocm/lib/python3.11/site-packages/accelerate/data_loader.py", line 451, in __iter__
    current_batch = next(dataloader_iter)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "/storage/Programs/PythonVenvs/transformers-rocm/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
    data = self._next_data()
           ^^^^^^^^^^^^^^^^^
  File "/storage/Programs/PythonVenvs/transformers-rocm/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 673, in _next_data
    index = self._next_index()  # may raise StopIteration
            ^^^^^^^^^^^^^^^^^^
  File "/storage/Programs/PythonVenvs/transformers-rocm/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 620, in _next_index
    return next(self._sampler_iter)  # may raise StopIteration
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/storage/Programs/PythonVenvs/transformers-rocm/lib/python3.11/site-packages/torch/utils/data/sampler.py", line 282, in __iter__
    for idx in self.sampler:
  File "/storage/Programs/PythonVenvs/transformers-rocm/lib/python3.11/site-packages/torch/utils/data/sampler.py", line 164, in __iter__
    yield from torch.randperm(n, generator=generator).tolist()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/storage/Programs/PythonVenvs/transformers-rocm/lib/python3.11/site-packages/torch/utils/_device.py", line 77, in __torch_function__
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Expected a 'cuda' device type for generator but found 'cpu'
  0%| 
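
As far as I can tell from the traceback, the crash happens inside the DataLoader's RandomSampler: it builds a plain (CPU) torch.Generator and then calls torch.randperm with it, and my torch.set_default_device("cuda") call (see the code below) routes that randperm to the GPU. A minimal sketch that I believe reproduces the same error, assuming that really is the mechanism:

import torch

torch.set_default_device("cuda")   # same global override as in my script
gen = torch.Generator()            # stays on the CPU, like the sampler's generator
gen.manual_seed(0)
torch.randperm(10, generator=gen)  # RuntimeError: Expected a 'cuda' device type for generator but found 'cpu'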

Here is a shortened version of my code:

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, Seq2SeqTrainer, Seq2SeqTrainingArguments, PreTrainedTokenizer
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence
from torch.nn.utils.rnn import pad_sequence
from datasets import Dataset
import torch

IGNORE_INDEX = -100  # label value ignored by the loss; used to pad/mask labels below

device_name = "cpu"
if torch.cuda.is_available():
    device_name = "cuda:0"
    torch.cuda.set_device(0)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.set_default_dtype(torch.float32)
torch.set_default_device(device_name)

model = AutoModelForCausalLM.from_pretrained(
        path,
        torch_dtype=torch.float32,
        device_map="auto"
        )

tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        padding_side="right"
        )
tokenizer.pad_token = "[PAD]"

def preprocess_prompt(data):
    inputs = [process_input(c,i) for c,i in zip(data['context'],data['input'])]
    outputs = [process_output(a) for a in data["output"]]
    return dict(
        input=inputs,
        output=outputs
    )

dataset = Dataset.from_json("data/training_data.json")
dataset = dataset.map(preprocess_prompt, batched=True)
dataset = dataset.with_format("torch", device=torch.device(device_name))

@dataclass
class DataCollatorForCausalLM(object):
    tokenizer: PreTrainedTokenizer
    source_max_len: int
    target_max_len: int

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        # Extract elements
        sources = [f"{self.tokenizer.bos_token}{example['input']}" for example in instances]
        targets = [f"{example['output']}{self.tokenizer.eos_token}" for example in instances]
        # Tokenize
        tokenized_sources_with_prompt = self.tokenizer(
            sources,
            max_length=self.source_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        tokenized_targets = self.tokenizer(
            targets,
            max_length=self.target_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        # Build the input and labels for causal LM
        input_ids = []
        labels = []
        for tokenized_source, tokenized_target in zip(
            tokenized_sources_with_prompt['input_ids'],
            tokenized_targets['input_ids']
            ):
            # Concatenate prompt and target for the model input, and mask the
            # prompt tokens out of the labels so the loss only covers the target.
            input_ids.append(torch.tensor(tokenized_source + tokenized_target))
            labels.append(torch.tensor([IGNORE_INDEX] * len(tokenized_source) + tokenized_target))
        # Apply padding
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        data_dict = {
            'input_ids': input_ids,
            'attention_mask':input_ids.ne(self.tokenizer.pad_token_id),
        }
        if labels is not None:
            data_dict['labels'] = labels
        return data_dict

data_collator = DataCollatorForCausalLM(
        tokenizer=tokenizer,
        source_max_len=1024,
        target_max_len=256
)

training_args = Seq2SeqTrainingArguments(
        output_dir=model_dir,
        optim="adamw_torch",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        save_strategy='steps',
        save_steps=250,
        save_total_limit=40,
        lr_scheduler_type='constant',
        remove_unused_columns=False,
        max_grad_norm=0.3,
        max_steps=10,
        num_train_epochs=3,
        learning_rate=2e-5,
        do_train=True
)

trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=dataset,
        eval_dataset=dataset,
        data_collator=data_collator
)

# Exception raised here:
trainer.train()

The DataCollator is a pre-existing class that I stole from TinyLlama's finetune.py.
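
In case it helps, this is roughly what the collator hands to the trainer for a toy batch (a sketch with made-up example data, using the tokenizer defined above):

batch = data_collator([
    {"input": "What is 2+2?", "output": "4"},
    {"input": "Name a colour.", "output": "Blue"},
])
print(batch["input_ids"].shape)       # (2, padded_length)
print(batch["attention_mask"].shape)  # same shape, True where not padding
print(batch["labels"].shape)          # prompt positions are IGNORE_INDEX (-100)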

Any ideas?

There are 0 answers