This is the code for the resume dataset, which was in DataFrame format. I tried to convert it to the JSON format required by Simple Transformers.
I have 5 sets of questions that repeat for each row in the DataFrame, changing the company name, post name and the answers.
counter just gives a unique id to each question. The DataFrame has 22,000 rows. I am not sure why I keep getting this error: either the one I have pasted below, or a "bool object is not callable" error.
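For reference, my understanding of the format Simple Transformers expects for question answering is a plain Python list of dicts, one per context, shaped like the placeholder below (the text and offsets here are made up, not my real data):

# Placeholder showing the shape of the training data I am aiming for:
# a plain Python list of dicts, each with a "context" string and a "qas" list.
expected_format = [
    {
        "context": "Example Corp is hiring a Data Analyst.",
        "qas": [
            {
                "id": "0",
                "is_impossible": False,
                "question": "What is job provided by Example Corp company?",
                "answers": [
                    {"text": "Data Analyst", "answer_start": 25},
                ],
            },
        ],
    },
]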
for index, value in df_1000_final.items():
    counter = range_till_1000[low:high]  # Slicing the id's
    # Taking company name and post name as per each row
    company_name = df_1000_final[index][index_df.loc[index,"company_index"]:index_df.loc[index,"job_role_index"]-4]
    post_name = df_1000_final[index][index_df.loc[index,"title_index"]:index_df.loc[index,"company_index"]-4]
    pay_rate = df_1000_final[index][index_df.loc[index,"payrate_index"]:]
    skills_required = df_1000_final[index][index_df.loc[index,"skills_index"]:index_df.loc[index,"payrate_index"]-3]
    experience_required = df_1000_final[index][index_df.loc[index,"experience_index"]:index_df.loc[index,"skills_index"]-3]
    role = df_1000_final[index][index_df.loc[index,"job_role_index"]:index_df.loc[index,"job_location_index"]-3]
    json_objects.append({
        "context": value,
        "qas": [
            {
                "id": str(counter[0]),
                "is_impossible": False,
                "question": f"What is job provided by {company_name} company?",
                "answers": [
                    {
                        "text": post_name,
                        "answer_start": index_df.loc[index,'title_index'],
                    }
                ],
            },
            {
                "id": str(counter[1]),
                "is_impossible": False,
                "question": f"What is salary provided by {company_name} company for {post_name} post?",
                "answers": [
                    {
                        "text": pay_rate,
                        "answer_start": index_df.loc[index,'payrate_index'],
                    }
                ],
            },
            {
                "id": str(counter[2]),
                "is_impossible": False,
                "question": f"What are skills required for {post_name} post in {company_name} company?",
                "answers": [
                    {
                        "text": skills_required,
                        "answer_start": index_df.loc[index,'skills_index'],
                    }
                ],
            },
            {
                "id": str(counter[3]),
                "is_impossible": False,
                "question": f"What is experience required for {post_name} post in {company_name} company?",
                "answers": [
                    {
                        "text": experience_required,
                        "answer_start": index_df.loc[index,'experience_index'],
                    }
                ],
            },
            {
                "id": str(counter[4]),
                "is_impossible": False,
                "question": f"What is role for {post_name} post in {company_name} company?",
                "answers": [
                    {
                        "text": role,
                        "answer_start": index_df.loc[index,'job_role_index'],
                    }
                ],
            },
        ],
    })
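Before wiring this into training, a quick structural check like the one below is what I would expect to pass (I am assuming the json_objects list built above is exactly what gets passed to train_model as train_data):

# Quick structural check on the list built in the loop above.
# Assumption: this json_objects list is what later gets passed to train_model as train_data.
print(type(json_objects))      # should be <class 'list'>
print(type(json_objects[0]))   # should be <class 'dict'>
print(json_objects[0].keys())  # should be dict_keys(['context', 'qas'])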
# Configure the model
model_args = QuestionAnsweringArgs()
model_args.train_batch_size = 16
model_args.evaluate_during_training = True
model = QuestionAnsweringModel(
    "roberta", "roberta-base", args=model_args
)
# Train the model
model.train_model(train_data, eval_data=False, use_cuda=True)
This is the error:
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 6d205e22-5a78-4562-ba07-f26af7fbf7fc)')' thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
WARNING:huggingface_hub.utils._http:'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 6d205e22-5a78-4562-ba07-f26af7fbf7fc)')' thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/config.json
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
convert squad examples to features: 100%|██████████| 5000/5000 [01:00<00:00, 82.60it/s]
add example index and unique id: 100%|██████████| 5000/5000 [00:00<00:00, 611182.93it/s]
Epoch 1 of 1: 0% 0/1 [00:56<?, ?it/s]
Epochs 0/1. Running Loss: 0.0076: 100% 322/322 [00:51<00:00, 7.43it/s]
/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py:136: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-45-03fadae3f855> in <cell line: 11>()
9
10 # Train the model
---> 11 model.train_model(train_data, eval_data=False,use_cuda=True)
5 frames
/usr/local/lib/python3.10/dist-packages/simpletransformers/question_answering/question_answering_utils.py in get_examples(examples_to_process, is_training, version_2_with_negative)
130 def get_examples(examples_to_process, is_training=True, version_2_with_negative=True):
131 if not isinstance(examples_to_process, list):
--> 132 raise TypeError("Input should be a list of examples.")
133
134 def is_whitespace(c):
TypeError: Input should be a list of examples.
I have modified the JSON structure as much as I can, but it is still throwing the error.