Goal: Fine-tune T5 on the SQuAD dataset for extractive question answering
Question: If my tokenized dataset contains input_ids, attention_mask, target_ids, and target_attention_mask, why is the trainer complaining about needing decoder_input_ids?
Error: ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds
Helper Functions
def subset_dataset(dataset, train_size=32, validation_size=16):
    # Extract the specified number of examples for training and validation
    train_data = dataset["train"].shuffle(seed=42).select(range(train_size))
    validation_data = dataset["validation"].shuffle(seed=42).select(range(validation_size))
    # Create a new DatasetDict with the subsetted data
    subsetted_dataset = DatasetDict({"train": train_data, "validation": validation_data})
    return subsetted_dataset
def t5_preprocess(entry):
    """
    Process input for T5 question and answer format
    """
    context = entry['context']
    question = entry['question']
    answer = entry['answers']['text'][0]
    answer_start = entry['answers']['answer_start'][0]
    answer_end = answer_start + len(answer)
    input_text = f"question: {question} context: {context}"
    target_text = answer
    entry = {'input_text': input_text,
             'target_text': target_text,
             # 'question': question,
             # 'context': context,
             # 'answers': answer,
             # 'answer_start': answer_start,
             # 'answer_end': answer_end
             }
    return entry
def convert_to_features(entry, tokenizer):
    """
    Convert entry to features
    """
    input_encodings = tokenizer.encode_plus(entry['input_text'], pad_to_max_length=True, max_length=512)
    target_encodings = tokenizer.encode_plus(entry['target_text'], pad_to_max_length=True, max_length=512)
    entry = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }
    return entry
Full Code
# imports
import datetime

import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)

# define model checkpoint
model_checkpoint = 't5-small'
# instantiate tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
# instantiate model
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# load data
datasets = load_dataset('squad') #87,599 samples
datasets = subset_dataset(datasets) # 48 examples
# tokenize input
tokenized_datasets = datasets.map(lambda examples: t5_preprocess(examples), remove_columns=datasets["train"].column_names) # prep data
tokenized_datasets = tokenized_datasets.map(lambda examples: convert_to_features(examples, tokenizer), remove_columns=tokenized_datasets["train"].column_names)
# define save directory
model_checkpoint_save_directory = './model_checkpoints'
# define model name
model_name = model_checkpoint.split("/")[-1] # model name for pushing to hugging face
# define checkpoint save location
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
model_checkpoint_file_path = model_checkpoint_save_directory + "/" + model_name + "/"+ timestamp + "/" + "epochs" + "/"+ '{epoch:02d}-{val_loss:.2f}.hdf5' # callback knows to autopopulate these parameters
# define Training Arguments
batch_size = 16
learning_rate = 2e-5 # step size the optimizer uses when updating the weights
weight_decay = 0.01
num_epochs = 4 # full passes over the training set; more epochs can improve fit but risk overfitting
total_train_steps = (len(tokenized_datasets["train"]) // batch_size) * num_epochs
# define the data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    max_length=512,
)
# select device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# move model to device
model.to(device)
# define optimizer (note: it is never passed to the Trainer below, so the Trainer creates its own AdamW internally)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# define training args
training_args = TrainingArguments(
    output_dir=model_checkpoint_file_path,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay
)
# define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
Tokenized Dataset
DatasetDict({
train: Dataset({
features: ['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask'],
num_rows: 32
})
validation: Dataset({
features: ['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask'],
num_rows: 16
})
})
As far as I can see, you're using the target_ids and target_attention_mask keys, but they should be renamed to labels and decoder_attention_mask. The keys you are using are not recognized by Hugging Face, so the Trainer drops them before the batch ever reaches the model. Hugging Face looks for the labels argument and produces the decoder_input_ids itself by shifting the labels to the right.
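As a minimal sketch of that rename (assuming the same tokenizer and 512-token padding as in your question; padding='max_length' with truncation=True is the non-deprecated replacement for pad_to_max_length), convert_to_features would become:

def convert_to_features(entry, tokenizer):
    """
    Convert entry to features using the key names the Trainer and T5 expect
    """
    input_encodings = tokenizer(entry['input_text'], padding='max_length', truncation=True, max_length=512)
    target_encodings = tokenizer(entry['target_text'], padding='max_length', truncation=True, max_length=512)
    entry = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        # T5ForConditionalGeneration derives decoder_input_ids from labels by shifting them one position to the right
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }
    return entry

Because you already pass model=model to DataCollatorForSeq2Seq, the collator will also call model.prepare_decoder_input_ids_from_labels on each batch once a labels key exists, so the ValueError should disappear. One thing to double-check: pad tokens baked into labels by padding='max_length' are not replaced with -100 and therefore still count toward the loss; letting the collator pad the targets dynamically is the usual way around that.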