BigQuery Python: google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: Schema mismatch: referenced variable 'ro_sub_ros.$is_not_null' has array levels of 1, while the corresponding field path to Parquet column has 0 repeated fields.
My original data looks like this:
testData = {
"ro_user_email": "[email protected]",
"ro_account_id": "23402042",
"ro_sub_account_id": "34020334",
"ro_name": "Test RO",
"ro_number": "1304340",
"ro_currency": {"label":"USD","value":"USD"},
"ro_dates": {"from":now,"to":now},
"ro_status": "draft",
"ro_operation_timestamp": pd.Timestamp(now),
"ro_billing_cycle": {"label":"Fortnightly","value":"Fortnightly"},
"ro_sub_ros": [
{
"sub_ro_id": "2323",
"valid":False,
"sub_ro_name": "Testing",
"sub_ro_dates":{"from":now,"to":now},
"sub_ro_budget": 1203302.22,
"sub_ro_revenue_price":1202302.22,
"sub_ro_revenue_selected": {"label":"Fortnightly","value":"Fortnightly"},
"sub_ro_revenue_model_selected": {"label":"Fortnightly","value":"Fortnightly"},
"sub_ro_campaigns_selected": [{"label":"Fortnightly","value":"Fortnightly"}],
"sub_ro_ios_selected": [{"label":"Fortnightly","value":"Fortnightly"}],
"sub_ro_client_id": [{"label":"Fortnightly","value":"Fortnightly"}],
"sub_ro_ids_selected": [{"label":"Fortnightly","value":"Fortnightly"}],
"sub_ro_pixels_selected": [{"label":"Fortnightly","value":"Fortnightly"}],
"kpi_1_metric_selected": {"label":"Fortnightly","value":"Fortnightly"},
"attribution_model_selected": {"label":"Fortnightly","value":"Fortnightly"},
"kpi_window_selected": {"label":"Fortnightly","value":"Fortnightly"},
"deepMetrics_selected": {"label":"Fortnightly","value":"Fortnightly"},
"sub_ro_kpi_goal":"ROI"
}
],
}
And here's how I created my BQ Schema:
schema = [
bigquery.SchemaField("ro_user_email", "STRING", mode="REQUIRED"),
bigquery.SchemaField("ro_account_id", "STRING", mode="REQUIRED"),
bigquery.SchemaField("ro_sub_account_id", "STRING", mode="REQUIRED"),
bigquery.SchemaField("ro_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("ro_number", "STRING", mode="REQUIRED"),
bigquery.SchemaField("ro_currency",
"STRUCT",
mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
),
bigquery.SchemaField("ro_dates",
"STRUCT",
mode="REQUIRED",
fields=[
bigquery.SchemaField("from", "DATE", mode="REQUIRED"),
bigquery.SchemaField("to", "DATE", mode="REQUIRED"),
]
),
bigquery.SchemaField("ro_status","STRING", mode="REQUIRED"),
bigquery.SchemaField("ro_operation_timestamp","TIMESTAMP", mode="REQUIRED"),
bigquery.SchemaField("ro_billing_cycle",
"STRUCT",
mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
),
bigquery.SchemaField(
"ro_sub_ros",
"RECORD",
mode="REPEATED",
fields=[
bigquery.SchemaField("sub_ro_id", "STRING", mode="REQUIRED"),
bigquery.SchemaField("valid", "BOOL", mode="REQUIRED"),
bigquery.SchemaField("sub_ro_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("sub_ro_dates", "STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("from", "DATE", mode="REQUIRED"),
bigquery.SchemaField("to", "DATE", mode="REQUIRED"),
]
),
bigquery.SchemaField("sub_ro_budget", "FLOAT", mode="REQUIRED"),
bigquery.SchemaField("sub_ro_revenue_price", "FLOAT", mode="REQUIRED"),
bigquery.SchemaField("sub_ro_revenue_selected",
"STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
),
bigquery.SchemaField("sub_ro_revenue_model_selected",
"STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
),
bigquery.SchemaField("sub_ro_campaigns_selected","RECORD",
mode="REPEATED",
fields=[
bigquery.SchemaField("model_list",
"STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
)
]),
bigquery.SchemaField("sub_ro_ios_selected","RECORD",
mode="REPEATED",
fields=[
bigquery.SchemaField("model_list",
"STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
)
]),
bigquery.SchemaField("sub_ro_client_id","RECORD",
mode="REPEATED",
fields=[
bigquery.SchemaField("model_list",
"STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
)
]),
#
bigquery.SchemaField("sub_ro_ids_selected","RECORD",
mode="REPEATED",
fields=[
bigquery.SchemaField("model_list",
"STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
)
]),
bigquery.SchemaField("sub_ro_pixels_selected","RECORD",
mode="REPEATED",
fields=[
bigquery.SchemaField("model_list",
"STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
)
]),
bigquery.SchemaField("kpi_1_metric_selected",
"STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
),
bigquery.SchemaField("attribution_model_selected",
"STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
),
bigquery.SchemaField("kpi_window_selected",
"STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
),
bigquery.SchemaField("deepMetrics_selected",
"STRUCT", mode="REQUIRED",
fields=[
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
]
),
bigquery.SchemaField("sub_ro_kpi_goal", "STRING", mode="REQUIRED"),
],
)
]
When I try to upload this data using bigquery client library
I get this error:
job_config = bigquery.LoadJobConfig(schema=schema)
return bq.client.load_table_from_dataframe(
df, tablename, job_config=job_config
).result()
throws:
google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: Schema mismatch: referenced variable 'ro_sub_ros.$is_not_null' has array levels of 1, while the corresponding field path to Parquet column has 0 repeated
fields.
Not sure what's going wrong here, In case my schema is too big and bulky to analyze, can someone show a minimal example of uploading a REPEATED RECORD
in google bigquery using client library and pandas data frame?
you can consider validating these options.
Validate the BigQuery schema is correct, this is an example, using repeating records. You can see official documentation.
Validate the records syntax is correct. Here is an example with values of the schema.
Consider using the “autodetect schema” in your python code. Similar to this example. You can see more documentation.
You can validate the JSON format in this page.