Fine-tuning with Hugging Face Trainer when adding a layer on an Electra model

I'm trying to fine-tune my own model with the Hugging Face Trainer. Everything worked fine when training ElectraForQuestionAnswering on its own, but when I add an extra layer on top of the model and run the same process, I get the error shown below. Here is my model and training code:
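
For reference, the setup that trained without problems is assumed (not shown in the question) to have been the stock QA model passed straight to Trainer, roughly like this sketch:

from transformers import ElectraForQuestionAnswering

# Sketch of the assumed working baseline: the stock QA head computes the span
# loss itself whenever start_positions/end_positions are passed in, and Trainer
# uses that returned loss directly.
baseline_model = ElectraForQuestionAnswering.from_pretrained("google/electra-small-discriminator")
# trainer = Trainer(model=baseline_model, args=training_args, ...)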

from transformers import ElectraForQuestionAnswering
from transformers.modeling_outputs import QuestionAnsweringModelOutput
from torch import nn

class Jelectra(nn.Module):
    def __init__(self):
        super().__init__()
        # wrap the pretrained QA model and add a softmax over its logits
        self.model = ElectraForQuestionAnswering.from_pretrained("google/electra-small-discriminator")
        self.sm = nn.Softmax(dim=1)

    def forward(self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,):
        
        outputs = self.model(input_ids, token_type_ids, attention_mask, start_positions, end_positions)
        output_start = self.sm(outputs[0])
        output_end = self.sm(outputs[1])
        return QuestionAnsweringModelOutput(start_logits=output_start, end_logits=output_end)
    
model = Jelectra()
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./fine_tuned_electra",
    evaluation_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=2,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

The error is...

RuntimeError                              Traceback (most recent call last)
Input In [12], in <module>
      3 training_args = TrainingArguments(
      4     output_dir="./fine_tuned_electra",
      5     evaluation_strategy="epoch",
   (...)
     12     eval_accumulation_steps=1,
     13 )
     15 trainer = Trainer(
     16     model=model,
     17     args=training_args,
   (...)
     21     data_collator=data_collator,
     22 )
---> 24 trainer.train()

File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/trainer.py:1365, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1363         tr_loss_step = self.training_step(model, inputs)
   1364 else:
-> 1365     tr_loss_step = self.training_step(model, inputs)
   1367 if (
   1368     args.logging_nan_inf_filter
   1369     and not is_torch_tpu_available()
   1370     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   1371 ):
   1372     # if loss is nan or inf simply add the average of previous logged losses
   1373     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/trainer.py:1940, in Trainer.training_step(self, model, inputs)
   1937     return loss_mb.reduce_mean().detach().to(self.args.device)
   1939 with self.autocast_smart_context_manager():
-> 1940     loss = self.compute_loss(model, inputs)
   1942 if self.args.n_gpu > 1:
   1943     loss = loss.mean()  # mean() to average on multi-gpu parallel training

File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/trainer.py:1972, in Trainer.compute_loss(self, model, inputs, return_outputs)
   1970 else:
   1971     labels = None
-> 1972 outputs = model(**inputs)
   1973 # Save past state if it exists
   1974 # TODO: this needs to be fixed and made cleaner later.
   1975 if self.args.past_index >= 0:

File ~/anaconda3/envs/domain/lib/python3.8/site-packages/torch/nn/modules/module.py:727, in Module._call_impl(self, *input, **kwargs)
    725     result = self._slow_forward(*input, **kwargs)
    726 else:
--> 727     result = self.forward(*input, **kwargs)
    728 for hook in itertools.chain(
    729         _global_forward_hooks.values(),
    730         self._forward_hooks.values()):
    731     hook_result = hook(self, input, result)

Input In [11], in Jelectra.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, start_positions, end_positions, output_attentions, output_hidden_states, return_dict)
      9 def forward(self,
     10     input_ids=None,
     11     attention_mask=None,
   (...)
     19     output_hidden_states=None,
     20     return_dict=None,):
---> 22     outputs = self.model(input_ids, token_type_ids, attention_mask, start_positions, end_positions)
     23     output_start = self.sm(outputs[0])
     24     output_end = self.sm(outputs[1])

File ~/anaconda3/envs/domain/lib/python3.8/site-packages/torch/nn/modules/module.py:727, in Module._call_impl(self, *input, **kwargs)
    725     result = self._slow_forward(*input, **kwargs)
    726 else:
--> 727     result = self.forward(*input, **kwargs)
    728 for hook in itertools.chain(
    729         _global_forward_hooks.values(),
    730         self._forward_hooks.values()):
    731     hook_result = hook(self, input, result)

File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/models/electra/modeling_electra.py:1377, in ElectraForQuestionAnswering.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, start_positions, end_positions, output_attentions, output_hidden_states, return_dict)
   1365 r"""
   1366 start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
   1367     Labels for position (index) of the start of the labelled span for computing the token classification loss.
   (...)
   1373     are not taken into account for computing the loss.
   1374 """
   1375 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1377 discriminator_hidden_states = self.electra(
   1378     input_ids,
   1379     attention_mask=attention_mask,
   1380     token_type_ids=token_type_ids,
   1381     position_ids=position_ids,
   1382     head_mask=head_mask,
   1383     inputs_embeds=inputs_embeds,
   1384     output_attentions=output_attentions,
   1385     output_hidden_states=output_hidden_states,
   1386 )
   1388 sequence_output = discriminator_hidden_states[0]
   1390 logits = self.qa_outputs(sequence_output)

File ~/anaconda3/envs/domain/lib/python3.8/site-packages/torch/nn/modules/module.py:727, in Module._call_impl(self, *input, **kwargs)
    725     result = self._slow_forward(*input, **kwargs)
    726 else:
--> 727     result = self.forward(*input, **kwargs)
    728 for hook in itertools.chain(
    729         _global_forward_hooks.values(),
    730         self._forward_hooks.values()):
    731     hook_result = hook(self, input, result)

File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/models/electra/modeling_electra.py:905, in ElectraModel.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
    901     encoder_extended_attention_mask = None
    903 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
--> 905 hidden_states = self.embeddings(
    906     input_ids=input_ids,
    907     position_ids=position_ids,
    908     token_type_ids=token_type_ids,
    909     inputs_embeds=inputs_embeds,
    910     past_key_values_length=past_key_values_length,
    911 )
    913 if hasattr(self, "embeddings_project"):
    914     hidden_states = self.embeddings_project(hidden_states)

File ~/anaconda3/envs/domain/lib/python3.8/site-packages/torch/nn/modules/module.py:727, in Module._call_impl(self, *input, **kwargs)
    725     result = self._slow_forward(*input, **kwargs)
    726 else:
--> 727     result = self.forward(*input, **kwargs)
    728 for hook in itertools.chain(
    729         _global_forward_hooks.values(),
    730         self._forward_hooks.values()):
    731     hook_result = hook(self, input, result)

File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/models/electra/modeling_electra.py:212, in ElectraEmbeddings.forward(self, input_ids, token_type_ids, position_ids, inputs_embeds, past_key_values_length)
    210 if self.position_embedding_type == "absolute":
    211     position_embeddings = self.position_embeddings(position_ids)
--> 212     embeddings += position_embeddings
    213 embeddings = self.LayerNorm(embeddings)
    214 embeddings = self.dropout(embeddings)

RuntimeError: The size of tensor a (512) must match the size of tensor b (12) at non-singleton dimension 1

How can I solve this? I'm using SQuAD data.
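
For context, tokenized_squad, tokenizer, and data_collator are not shown above; a minimal sketch of the standard SQuAD preprocessing they are assumed to come from (the 512 max length and the google/electra-small-discriminator tokenizer here are assumptions) would look roughly like this:

# Assumed preprocessing (not shown in the original question): standard SQuAD
# tokenization that maps character answer spans to token start/end positions.
from datasets import load_dataset
from transformers import AutoTokenizer, default_data_collator

squad = load_dataset("squad")
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")

def preprocess(examples):
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=512,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )
    offset_mapping = inputs.pop("offset_mapping")
    start_positions, end_positions = [], []
    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        # locate the context tokens (sequence id 1)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            # answer was truncated away: label the span as (0, 0)
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_squad = squad.map(preprocess, batched=True, remove_columns=squad["train"].column_names)
data_collator = default_data_collator

Each batch then contains input_ids, token_type_ids, attention_mask, start_positions, and end_positions, which are the keyword arguments the Trainer passes into forward().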
