I am trying to do multilabel classification using the model emilyalsentzer/Bio_ClinicalBERT on medical text data. I have attached a snippet of the data: there are quite a few labels, and each label has some associated text. Each text is a paragraph relating to its label. (Image: snippet of my label and text columns.)
When I train the model, I just get this error. (Image: the error that I'm facing.)
I will provide my code below (my dataframe with 2 columns (label and text) is called data_c). Where am I going wrong and how can I fix this?
I tried changing the training arguments, but that wasn't successful.
My code:
###################################################################
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
###################################################################
# Load the tokenizer for the same checkpoint name the model is loaded from.
checkpoint_name = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of *examples* with the module-level tokenizer.

    The tokenizer is called with ``truncation=True``, ``padding="max_length"``
    and ``max_length=512``; its return value is passed back unchanged.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)
###################################################################
# A sequence-classification head computes its loss against integer class ids
# in [0, num_labels); raw label values (e.g. strings) cannot be turned into
# label tensors. Encode the label column before splitting so both splits
# share one mapping. `assign` returns a new frame, so `data_c` is untouched.
# Keep `label_encoder` around: `label_encoder.inverse_transform(ids)` maps
# predicted ids back to the original label values.
label_encoder = LabelEncoder()
encoded_data = data_c.assign(label=label_encoder.fit_transform(data_c["label"]))

# 70/30 train/test split, then wrap each split as a `datasets.Dataset`.
doc_train, doc_test = train_test_split(encoded_data, test_size=0.3)
dataset_train = Dataset.from_pandas(doc_train)
dataset_test = Dataset.from_pandas(doc_test)

# Tokenize both splits in batches.
tokenized_dataset_train = dataset_train.map(preprocess_function, batched=True)
tokenized_dataset_test = dataset_test.map(preprocess_function, batched=True)
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
###################################################################
# Size the classification head by the number of distinct values in the
# label column of the full dataframe.
num_labels = len(data_c["label"].unique())
disease = AutoModelForSequenceClassification.from_pretrained(
    'emilyalsentzer/Bio_ClinicalBERT',
    num_labels=num_labels,
)
# Collator that pads each batch using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
###################################################################
# Hyper-parameters for the fine-tuning run; checkpoints go to ./results and
# reporting integrations are switched off via report_to="none".
training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-6,
    weight_decay=0.01,
)
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
def compute_metrics(pred):
    """Compute micro-averaged classification metrics for one evaluation pass.

    Args:
        pred: Evaluation output exposing ``predictions`` (per-class scores;
            the argmax over the last axis is taken as the predicted class)
            and ``label_ids`` (the reference class ids).

    Returns:
        A dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # NOTE: the object handed to compute_metrics has no `loss` attribute, so
    # reading `pred.loss` raised AttributeError during evaluation; the value
    # was never used, so it is simply not read any more.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='micro'
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
# Wire the model, training arguments, tokenized splits, collator and metric
# callback into a Trainer.
trainer = Trainer(
    model=disease,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    compute_metrics=compute_metrics,
)
###################################################################
# Start fine-tuning.
trainer.train()