Extract Table Data from Image-embedded PDF file

92 views Asked by At

I am trying to extract table data from an image-embedded PDF file. [image: enter image description here]

I want to improve the accuracy of the extracted data. Please suggest how to get the table data out of an image-embedded PDF.

from pdf2image import convert_from_path
import pytesseract
import csv
from pytesseract import Output

# OCR every page of 'input.pdf' and write the recognised table rows to
# 'output.csv' (one CSV row per OCR text line, one cell per column).

# Render each PDF page to a PIL image. 300 DPI (pdf2image defaults to 200) is
# the resolution Tesseract's documentation recommends for reliable OCR.
images = convert_from_path('input.pdf', dpi=300)

# Tesseract options aimed at tabular pages:
#   --psm 6                      treat the page as a single uniform block of
#                                text, which tends to keep a table row on one
#                                output line
#   preserve_interword_spaces=1  keep the wide gaps between columns instead of
#                                collapsing them to a single space
ocr_config = '--psm 6 -c preserve_interword_spaces=1'

# Create a CSV file
csv_file_path = 'output.csv'
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Iterate over each page image
    for i, image in enumerate(images):
        # Keep a PNG copy of the page so OCR results can be checked by eye.
        image_path = f'page_{i}.png'
        image.save(image_path, 'PNG')

        # OCR the in-memory image directly; re-reading the PNG from disk is
        # unnecessary because pytesseract accepts PIL images.
        text = pytesseract.image_to_string(
            image,
            config=ocr_config,
            output_type=Output.STRING,
        )

        # Print the extracted text
        print(text)

        # Write one CSV row per OCR line rather than dumping the whole page
        # into a single cell. Columns are split on gaps of two or more spaces
        # so that single spaces inside a cell ("New York") stay intact.
        # NOTE: this is a heuristic; empty cells cannot be recovered from
        # plain text and will shift later columns to the left.
        for line in text.splitlines():
            cells = [cell.strip() for cell in line.split('  ') if cell.strip()]
            if cells:
                csv_writer.writerow(cells)

print(f'CSV file created at: {csv_file_path}')
0

There are 0 answers