Is it possible to fine tune the model nllb200_1.3B in Google Colab?
I'm trying to use Lora and Peft to tune an English to Spanish translation model
My code is the following:
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
raw_datasets = load_dataset('csv', data_files='Dataset.csv', delimiter=',')
model_checkpoint = "facebook/nllb-200-distilled-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, src_lang="en", tgt_lang="es")
def preprocess_function(examples):
inputs = [ex for ex in examples["sourceString"]]
targets = [ex for ex in examples["targetString"]]
model_inputs = tokenizer(inputs, max_length=128, truncation=True)
with tokenizer.as_target_tokenizer():
labels = tokenizer(targets, max_length=128, truncation=True)
model_inputs["labels"] = labels["input_ids"]
model_inputs["decoder_input_ids"] = labels["input_ids"]
return model_inputs
Tokenize Data
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
Fine Tune Model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
Fine Tune Model
Load model with quantization
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=False,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
model_checkpoint,
load_in_4bit=True,
quantization_config=bnb_config,
torch_dtype=torch.bfloat16,
device_map="auto"
)
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
lora_alpha=16,
lora_dropout=0.1,
r=64,
bias="none",
task_type="CAUSAL_LM",
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(model, peft_config)
args = Seq2SeqTrainingArguments(
f"Model-{model_checkpoint}",
#evaluation_strategy = "steps",
learning_rate=2e-5,
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
weight_decay=0.01,
save_total_limit=3,
num_train_epochs=15,
predict_with_generate=True,
logging_steps=200,#50,
warmup_steps=500,#100,
fp16 = True,
label_smoothing_factor = 0.1,
logging_first_step = True
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
model,
args,
train_dataset=tokenized_datasets["train"],
#eval_dataset=tokenized_datasets["test"],
data_collator=data_collator,
tokenizer=tokenizer
)
trainer.train()
and I get the error
ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds
I assume it is a problem with quantization, because I have tried this code with other models and it does not give me this problem.