I'm trying to fine-tune my own model with the Hugging Face Trainer. Training ElectraForQuestionAnswering by itself worked without any problem, but when I add an extra layer on top of the model and run the same training process, I get the error below.
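For reference, the plain setup without the extra layer, which trained fine, was essentially the stock model passed straight to the Trainer (a rough sketch, not my exact script):

from transformers import ElectraForQuestionAnswering

model = ElectraForQuestionAnswering.from_pretrained("google/electra-small-discriminator")
# same TrainingArguments / Trainer setup as further down, just with this model instead of the wrapper

The wrapped version with the extra softmax layer looks like this: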
from transformers import ElectraForQuestionAnswering
from transformers.modeling_outputs import QuestionAnsweringModelOutput
from torch import nn

class Jelectra(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = ElectraForQuestionAnswering.from_pretrained("google/electra-small-discriminator")
        self.sm = nn.Softmax(dim=1)

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                start_positions=None,
                end_positions=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None,):
        outputs = self.model(input_ids, token_type_ids, attention_mask, start_positions, end_positions)
        output_start = self.sm(outputs[0])
        output_end = self.sm(outputs[1])
        return QuestionAnsweringModelOutput(start_logits=output_start, end_logits=output_end)
model = Jelectra()
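The tokenized_squad, tokenizer, and data_collator used below come from standard SQuAD preprocessing, roughly like this (a sketch along the lines of the Hugging Face question-answering example; the helper name and max_length=512 are my choices, and my real code may differ in small details):

from datasets import load_dataset
from transformers import AutoTokenizer, DefaultDataCollator

squad = load_dataset("squad")
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")

def preprocess(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,            # matches the 512 in the traceback
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
    )
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions, end_positions = [], []
    for i, offsets in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        # find the token span that belongs to the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1
        # answers that were truncated away are labelled (0, 0)
        if offsets[context_start][0] > end_char or offsets[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_squad = squad.map(preprocess, batched=True, remove_columns=squad["train"].column_names)
data_collator = DefaultDataCollator()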
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./fine_tuned_electra",
    evaluation_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=2,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
The error is...
RuntimeError Traceback (most recent call last)
Input In [12], in <module>
3 training_args = TrainingArguments(
4 output_dir="./fine_tuned_electra",
5 evaluation_strategy="epoch",
(...)
12 eval_accumulation_steps=1,
13 )
15 trainer = Trainer(
16 model=model,
17 args=training_args,
(...)
21 data_collator=data_collator,
22 )
---> 24 trainer.train()
File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/trainer.py:1365, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1363 tr_loss_step = self.training_step(model, inputs)
1364 else:
-> 1365 tr_loss_step = self.training_step(model, inputs)
1367 if (
1368 args.logging_nan_inf_filter
1369 and not is_torch_tpu_available()
1370 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1371 ):
1372 # if loss is nan or inf simply add the average of previous logged losses
1373 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/trainer.py:1940, in Trainer.training_step(self, model, inputs)
1937 return loss_mb.reduce_mean().detach().to(self.args.device)
1939 with self.autocast_smart_context_manager():
-> 1940 loss = self.compute_loss(model, inputs)
1942 if self.args.n_gpu > 1:
1943 loss = loss.mean() # mean() to average on multi-gpu parallel training
File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/trainer.py:1972, in Trainer.compute_loss(self, model, inputs, return_outputs)
1970 else:
1971 labels = None
-> 1972 outputs = model(**inputs)
1973 # Save past state if it exists
1974 # TODO: this needs to be fixed and made cleaner later.
1975 if self.args.past_index >= 0:
File ~/anaconda3/envs/domain/lib/python3.8/site-packages/torch/nn/modules/module.py:727, in Module._call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
730 self._forward_hooks.values()):
731 hook_result = hook(self, input, result)
Input In [11], in Jelectra.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, start_positions, end_positions, output_attentions, output_hidden_states, return_dict)
9 def forward(self,
10 input_ids=None,
11 attention_mask=None,
(...)
19 output_hidden_states=None,
20 return_dict=None,):
---> 22 outputs = self.model(input_ids, token_type_ids, attention_mask, start_positions, end_positions)
23 output_start = self.sm(outputs[0])
24 output_end = self.sm(outputs[1])
File ~/anaconda3/envs/domain/lib/python3.8/site-packages/torch/nn/modules/module.py:727, in Module._call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
730 self._forward_hooks.values()):
731 hook_result = hook(self, input, result)
File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/models/electra/modeling_electra.py:1377, in ElectraForQuestionAnswering.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, start_positions, end_positions, output_attentions, output_hidden_states, return_dict)
1365 r"""
1366 start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1367 Labels for position (index) of the start of the labelled span for computing the token classification loss.
(...)
1373 are not taken into account for computing the loss.
1374 """
1375 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1377 discriminator_hidden_states = self.electra(
1378 input_ids,
1379 attention_mask=attention_mask,
1380 token_type_ids=token_type_ids,
1381 position_ids=position_ids,
1382 head_mask=head_mask,
1383 inputs_embeds=inputs_embeds,
1384 output_attentions=output_attentions,
1385 output_hidden_states=output_hidden_states,
1386 )
1388 sequence_output = discriminator_hidden_states[0]
1390 logits = self.qa_outputs(sequence_output)
File ~/anaconda3/envs/domain/lib/python3.8/site-packages/torch/nn/modules/module.py:727, in Module._call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
730 self._forward_hooks.values()):
731 hook_result = hook(self, input, result)
File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/models/electra/modeling_electra.py:905, in ElectraModel.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
901 encoder_extended_attention_mask = None
903 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
--> 905 hidden_states = self.embeddings(
906 input_ids=input_ids,
907 position_ids=position_ids,
908 token_type_ids=token_type_ids,
909 inputs_embeds=inputs_embeds,
910 past_key_values_length=past_key_values_length,
911 )
913 if hasattr(self, "embeddings_project"):
914 hidden_states = self.embeddings_project(hidden_states)
File ~/anaconda3/envs/domain/lib/python3.8/site-packages/torch/nn/modules/module.py:727, in Module._call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
730 self._forward_hooks.values()):
731 hook_result = hook(self, input, result)
File ~/anaconda3/envs/domain/lib/python3.8/site-packages/transformers/models/electra/modeling_electra.py:212, in ElectraEmbeddings.forward(self, input_ids, token_type_ids, position_ids, inputs_embeds, past_key_values_length)
210 if self.position_embedding_type == "absolute":
211 position_embeddings = self.position_embeddings(position_ids)
--> 212 embeddings += position_embeddings
213 embeddings = self.LayerNorm(embeddings)
214 embeddings = self.dropout(embeddings)
RuntimeError: The size of tensor a (512) must match the size of tensor b (12) at non-singleton dimension 1
How can I solve this? I'm fine-tuning on SQuAD data; the 512 looks like my padded sequence length and the 12 looks like my per-device batch size.
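For what it's worth, I can trigger what looks like the same mismatch without the Trainer by calling the wrapper directly on a dummy batch (random tensors with plausible shapes, not my real SQuAD features), so the Trainer itself doesn't seem to be the cause:

import torch

dummy = {
    "input_ids": torch.randint(0, 30522, (2, 512)),          # 30522 = ELECTRA/BERT vocab size
    "attention_mask": torch.ones(2, 512, dtype=torch.long),
    "token_type_ids": torch.zeros(2, 512, dtype=torch.long),
    "start_positions": torch.tensor([10, 20]),
    "end_positions": torch.tensor([15, 25]),
}
model(**dummy)
# RuntimeError: The size of tensor a (512) must match the size of tensor b (2)
# at non-singleton dimension 1  (2 instead of 12 here, since the dummy batch size is 2)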