Extracting table data from an image-embedded PDF file [enter image description here]
I want to improve the accuracy of the extracted data. Please suggest how to get the table data out of an image-embedded PDF.
from pdf2image import convert_from_path
import pytesseract
import csv
from pytesseract import Output
def words_to_rows(data, gap_ratio=1.5, min_conf=0.0):
    """Rebuild table rows from Tesseract word boxes.

    The original script wrote each page's whole OCR text into a single CSV
    cell, which throws away the table layout. This helper uses the word
    geometry instead: words whose vertical centres are close together form
    one row, and within a row a wide horizontal gap starts a new cell.

    Args:
        data: Mapping with parallel sequences under the keys ``text``,
            ``conf``, ``left``, ``top``, ``width`` and ``height`` (the
            layout produced by ``pytesseract.image_to_data`` with
            ``Output.DICT``).
        gap_ratio: A horizontal gap wider than ``gap_ratio`` times the
            median word height is treated as a column boundary. This is a
            heuristic; tune it for tightly or loosely spaced tables.
        min_conf: Words with a confidence below this value are dropped.
            The default also drops the ``-1`` entries Tesseract emits for
            non-word layout elements.

    Returns:
        A list of rows, top to bottom; each row is a list of cell strings,
        left to right. Rows may have different numbers of cells.
    """
    words = []
    for text, conf, left, top, width, height in zip(
        data['text'], data['conf'], data['left'],
        data['top'], data['width'], data['height'],
    ):
        text = str(text).strip()
        # float() because some pytesseract versions report conf as a string.
        if not text or float(conf) < min_conf:
            continue
        words.append((top + height / 2, left, left + width, height, text))
    if not words:
        return []

    # Median word height approximates the font size and gives a
    # resolution-independent unit for both thresholds below.
    heights = sorted(word[3] for word in words)
    scale = max(1, heights[len(heights) // 2])

    # Walk the words top to bottom; a jump of more than half a text height
    # between consecutive centres starts a new row. Comparing with the
    # previous word (not the row's first word) tolerates slight page skew.
    words.sort()
    lines = [[words[0]]]
    for word in words[1:]:
        if word[0] - lines[-1][-1][0] > 0.5 * scale:
            lines.append([word])
        else:
            lines[-1].append(word)

    rows = []
    for line in lines:
        line.sort(key=lambda word: word[1])
        cells = [[line[0][4]]]
        prev_right = line[0][2]
        for _, left, right, _, text in line[1:]:
            if left - prev_right > gap_ratio * scale:
                cells.append([text])
            else:
                cells[-1].append(text)
            prev_right = right
        rows.append([' '.join(cell) for cell in cells])
    return rows


def main(pdf_path='input.pdf', csv_file_path='output.csv', dpi=300,
         ocr_config='--psm 6'):
    """OCR every page of an image-only PDF and write the table rows to CSV.

    Args:
        pdf_path: PDF to read.
        csv_file_path: CSV file to create (overwritten if it exists).
        dpi: Rendering resolution. pdf2image defaults to 200; Tesseract's
            guidance is roughly 300 DPI or more for reliable recognition.
        ocr_config: Extra Tesseract options. ``--psm 6`` (treat the page as
            one uniform block of text) is commonly a better fit for tables
            than the default automatic segmentation; try other modes if
            results are poor.
    """
    # Convert PDF to images
    images = convert_from_path(pdf_path, dpi=dpi)

    # Create a CSV file
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)

        # Iterate over each image
        for i, image in enumerate(images):
            # Save the image (kept so the rendered pages can be inspected)
            image_path = f'page_{i}.png'
            image.save(image_path, 'PNG')

            # Ask Tesseract for per-word boxes rather than one text blob so
            # the table layout can be reconstructed.
            data = pytesseract.image_to_data(
                image_path, config=ocr_config, output_type=Output.DICT
            )
            rows = words_to_rows(data)

            # Print the extracted rows
            for row in rows:
                print(row)

            # Write one CSV row per table row
            csv_writer.writerows(rows)

    print(f'CSV file created at: {csv_file_path}')


if __name__ == '__main__':
    main()