Using Textract, how do you extract tables from a pdf file and output it into a csv file via .py script?

3.8k views Asked by At

I want to use textract (via aws cli) to extract tables from a pdf file (located in an s3 location) and export it into a csv file. I have tried writing a .py script but am struggling to read from the file. Any suggestions for writing the .py script is welcome.

This is my current script. I run into the error: File "extract-table.py", line 63, in get_table_csv_results bash: File: command not found blocks=response['Blocks'] KeyError: 'Blocks'

import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint





def get_rows_columns_map(table_result, blocks_map):
rows = {}
for relationship in table_result['Relationships']:
    if relationship['Type'] == 'CHILD':
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            if cell['BlockType'] == 'CELL':
                row_index = cell['RowIndex']
                col_index = cell['ColumnIndex']
                if row_index not in rows:
                    # create new row
                    rows[row_index] = {}
                    
                # get the text value
                rows[row_index][col_index] = get_text(cell, blocks_map)
return rows
def get_text(result, blocks_map):
text = ''
if 'Relationships' in result:
    for relationship in result['Relationships']:
        if relationship['Type'] == 'CHILD':
            for child_id in relationship['Ids']:
                word = blocks_map[child_id]
                if word['BlockType'] == 'WORD':
                    text += word['Text'] + ' '
                if word['BlockType'] == 'SELECTION_ELEMENT':
                    if word['SelectionStatus'] =='SELECTED':
                        text +=  'X '    
def get_table_csv_results(file_name):



with open(file_name, 'rb') as file:
    img_test = file.read()
    bytes_test = bytearray(img_test)
    print('Image loaded', file_name)

# process using image bytes
# get the results
client = boto3.client('textract')

#Response
response = client.start_document_text_detection(
DocumentLocation={
    'S3Object': {
        'Bucket': s3BucketName,
        'Name': documentName
    }
})

# Get the text blocks
blocks=response['Blocks']
pprint(blocks)

blocks_map = {}
table_blocks = []
for block in blocks:
    blocks_map[block['Id']] = block
    if block['BlockType'] == "TABLE":
        table_blocks.append(block)

if len(table_blocks) <= 0:
    return "<b> NO Table FOUND </b>"

csv = ''
for index, table in enumerate(table_blocks):
    csv += generate_table_csv(table, blocks_map, index +1)
    csv += '\n\n'

return csv
def generate_table_csv(table_result, blocks_map, table_index):
rows = get_rows_columns_map(table_result, blocks_map)



table_id = 'Table_' + str(table_index)

# get cells.
csv = 'Table: {0}\n\n'.format(table_id)

for row_index, cols in rows.items():
    
    for col_index, text in cols.items():
        csv += '{}'.format(text) + ","
    csv += '\n'
    
csv += '\n\n\n'
return csv
def main(file_name):
table_csv = get_table_csv_results(file_name)

output_file = 'output.csv'

# replace content
with open(output_file, "wt") as fout:
    fout.write(table_csv)

# show the results
print('CSV OUTPUT FILE: ', output_file)


# Document
s3BucketName = "chrisyou.sagemi.com"
documentName = "DETAIL.pdf"
if __name__ == "__main__":
file_name = sys.argv[1]
main(file_name)
2

There are 2 answers

0
Kipkirui On

I had to make slight changes to @Thomas answer by importing profile `

extractor = Textractor(profile_name="default") right after importing Textractor as shown below to avoid getting this error -> NameError: name 'textractor' is not defined.

    from textractor import Textractor
 extractor = Textractor(profile_name="default")

def extract_tables(s3_file_path, output_directory, s3_output_path):
   document = extractor.start_document_analysis(s3_file_path, textractor.data.constants.TextractFeatures.TABLES, s3_output_path)
    for j, page in enumerate(document.pages):
        for i, table in enumerate(document.tables):
            with open(output_directory+f'/output_p{j}_t{i}.csv', 'w') as csv_file:
                csv_file.write(table.to_csv())
    return document
    
document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')

Hope it helps someone out there.

0
Thomas On

There is a much simpler way using the Amazon Textractor Textractor library. pip install amazon-textract-textractor

This will create a csv per table in your pdf document. e.g output_p0_t0.csv

from textractor import Textractor

def extract_tables(s3_file_path, output_directory, s3_output_path):
    extractor = Textractor(profile_name="default")
    document = extractor.start_document_analysis(s3_file_path, textractor.data.constants.TextractFeatures.TABLES, s3_output_path)
    for j, page in enumerate(document.pages):
        for i, table in enumerate(document.tables):
            with open(output_directory+f'/output_p{j}_t{i}.csv', 'w') as csv_file:
                csv_file.write(table.to_csv())
    return document
    
document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')