I am trying to convert a large JSON file (~100 GB) into CSV using the ijson library in Python. This is my code:
import ijson
import csv
# --- Configuration -----------------------------------------------------------
# NOTE(review): both paths are placeholders — replace with the real locations.
input_file_path = 'path_to_json_file' #json file is in dump
output_file_path = 'path_to_the_csv_file.csv'
# List the fieldnames you want to include in the CSV file
# NOTE(review): the trailing `...` (Ellipsis) is a placeholder element, not a
# valid column name — replace it with the remaining real column names.
desired_fieldnames = [
"col_1","col_2","col_3",...
]
# Buffer to store rows before writing to CSV
# NOTE(review): 1,000,000 dicts held in memory before each flush — presumably
# sized for throughput; confirm the machine has RAM for this many row dicts.
buffer_size = 1000000
rows_buffer = []
def write_buffer(writer, buffer):
    """Write every buffered row dict to the CSV writer.

    Args:
        writer: a ``csv.DictWriter`` (any object with ``writerows``).
        buffer: iterable of row dicts keyed by the writer's fieldnames.
    """
    # writerows() batches the whole buffer in one C-accelerated call instead
    # of a Python-level loop of writerow() calls.
    writer.writerows(buffer)
# Stream the JSON dump and convert matching objects to CSV rows.
with open(input_file_path, 'rb') as input_file, \
     open(output_file_path, 'w', newline='', encoding='utf-8') as output_file:
    # Lazily yields each object at 'rows.item.doc' without loading the
    # ~100 GB file into memory.
    objects = ijson.items(input_file, 'rows.item.doc')
    writer = csv.DictWriter(output_file, fieldnames=desired_fieldnames)
    writer.writeheader()
    try:
        for item in objects:
            # Create a new dictionary with only the desired fields;
            # missing keys become empty CSV cells.
            filtered_item = {field: item.get(field, '') for field in desired_fieldnames}
            rows_buffer.append(filtered_item)
            if len(rows_buffer) >= buffer_size:
                write_buffer(writer, rows_buffer)
                rows_buffer = []
    finally:
        # Flush whatever is buffered even if parsing aborts mid-file.
        # The pasted traceback shows ijson raising IncompleteJSONError on a
        # trailing comma (`, ]}`) in the dump — without this finally, up to
        # buffer_size already-converted rows would be silently discarded.
        # The malformed JSON itself must be fixed in the dump (or the dump
        # re-exported); ijson is strict and cannot skip the bad token.
        if rows_buffer:
            write_buffer(writer, rows_buffer)
And it results in this error:
Traceback (most recent call last):
File "path_to_python_program", line 64, in <module>
for item in objects:
ijson.common.IncompleteJSONError: parse error: unallowed token at this point in JSON text
~version":"sahdhdhash=="}}, ]}
(right here) ------^