Fine-Tuning a T5 Model on SQuAD


Goal: Fine-tune T5 on the SQuAD dataset for extractive question answering

Question: If my tokenized dataset contains input_ids, attention_mask, target_ids, and target_attention_mask, why is the trainer complaining about needing decoder_input_ids?

Error: ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

Helper Functions

def subset_dataset(dataset, train_size=32, validation_size=16):
    # Extract the specified number of examples for training and validation
    train_data = dataset["train"].shuffle(seed=42).select([i for i in range(train_size)])
    validation_data = dataset["validation"].shuffle(seed=42).select([i for i in range(validation_size)])

    # Create a new DatasetDict with the subsetted data
    subsetted_dataset = DatasetDict({"train": train_data, "validation": validation_data})

    return subsetted_dataset

def t5_preprocess(entry):
    """
    Process input for T5 question and answer format
    """
    context = entry['context']
    question = entry['question']
    answer = entry['answers']['text'][0]
    answer_start =  entry['answers']['answer_start'][0]
    answer_end = answer_start + len(answer)
    
    input_text = f"question: {question} context: {context}"
    target_text = answer
    
    entry = {'input_text':input_text,
             'target_text': target_text, 
            #  'question': question, 
            #  'context': context, 
            #  'answers': answer, 
            #  'answer_start':answer_start, 
            #  'answer_end':answer_end
    }
    return entry

def convert_to_features(entry, tokenizer):
    """""
    Convert entry to features
    """

    input_encodings = tokenizer.encode_plus(entry['input_text'], pad_to_max_length=True, max_length=512)
    target_encodings = tokenizer.encode_plus(entry['target_text'], pad_to_max_length=True, max_length=512)

    entry = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }

    return entry
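
For reference, here is a minimal sketch of what these two helpers produce on a hypothetical SQuAD-style record (the sample text and answer_start value below are made up for illustration; it assumes the helpers above are defined and the t5-small tokenizer is available):

from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')

# hypothetical SQuAD-style record
sample = {
    'context': 'The Eiffel Tower is located in Paris.',
    'question': 'Where is the Eiffel Tower located?',
    'answers': {'text': ['Paris'], 'answer_start': [31]},
}

prepped = t5_preprocess(sample)
print(prepped['input_text'])   # question: Where is the Eiffel Tower located? context: The Eiffel Tower is located in Paris.
print(prepped['target_text'])  # Paris

features = convert_to_features(prepped, tokenizer)
print(list(features.keys()))   # ['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask']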

Full Code

#define model checkpoint
model_checkpoint = 't5-small'

# instantiate tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

# instantiate model
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

# load data
datasets = load_dataset('squad') #87,599 samples
datasets = subset_dataset(datasets) # 48 examples

# tokenize input
tokenized_datasets = datasets.map(t5_preprocess, remove_columns=datasets["train"].column_names) # prep data
tokenized_datasets = tokenized_datasets.map(lambda examples: convert_to_features(examples, tokenizer), remove_columns=tokenized_datasets["train"].column_names)

# define save directory
model_checkpoint_save_directory = './model_checkpoints'

# define model name
model_name = model_checkpoint.split("/")[-1] # model name for pushing to hugging face

# define checkpoint save location
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
model_checkpoint_file_path = model_checkpoint_save_directory + "/" + model_name + "/"+ timestamp + "/" + "epochs" + "/"+  '{epoch:02d}-{val_loss:.2f}.hdf5' # callback knows to autopopulate these parameters

# define Training Arguments
batch_size = 16
learning_rate = 2e-5 # step size the optimizer takes at each update while moving toward a minimum of the loss function
weight_decay = 0.01
num_epochs = 4 # number of full passes over the training set; more epochs can improve performance, but too many risks overfitting to the training data
total_train_steps = (len(tokenized_datasets["train"]) // batch_size) * num_epochs

# define the data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    max_length=512,
)

# select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# move to gpu
model.to(device)

# define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# define training args
training_args = TrainingArguments(
    output_dir=model_checkpoint_file_path,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay
)

# define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Tokenized Dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask'],
        num_rows: 16
    })
})

1 Answer

Answer from inverted_index:

From what you've shown, you're using the keys target_ids and target_attention_mask, but they should be renamed to labels and decoder_attention_mask. The keys you are currently using are not recognized by Hugging Face.

Hugging Face looks for the labels argument and produces decoder_input_ids automatically by shifting the labels to the right, as shown in the sketch after the fixed function below.

def convert_to_features(entry, tokenizer):

    ...

    entry = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return entry
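
Once the targets are provided as labels, the model can derive decoder_input_ids on its own. A quick way to see the right-shift (a sketch, assuming a transformers version where T5ForConditionalGeneration exposes prepare_decoder_input_ids_from_labels):

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# tokenized target, shape (1, sequence_length)
labels = torch.tensor([tokenizer('Paris').input_ids])

# the model shifts the labels one position to the right and prepends the decoder start token
decoder_input_ids = model.prepare_decoder_input_ids_from_labels(labels=labels)
print(labels)
print(decoder_input_ids)

DataCollatorForSeq2Seq does the same thing for you during training when it is constructed with model=model and the batch contains labels, and the T5 forward pass will also shift labels itself, which is why renaming the keys is enough to resolve the error.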