I am trying to extract text from a two-column PDF document using this code:

#!/usr/bin/python
# -*- coding: utf-8 -*-

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextContainer

# Print the text of every line found in every page of 'page1.pdf'.
#
# The file is opened in a context manager so the handle is always closed,
# even if parsing fails. The parser keeps a reference to `fp`, so all page
# processing stays inside the `with` block.
with open('page1.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The aggregator collects the layout objects of the most recently
    # processed page; they are retrieved with get_result().
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        # Walk the layout inside the page loop so that every page is
        # printed, not just the last one processed.
        for elem in layout:
            # A page layout also holds non-text elements (rectangles,
            # curves, images, ...) that cannot be iterated for lines or
            # have no get_text(); skip them instead of crashing.
            if not isinstance(elem, LTTextContainer):
                continue
            for line in elem:
                print(line.get_text())

When a line ends with a hyphen ('-'), the hyphen doesn't appear in the output. Here is a sample document: "page1.pdf"

0 Answers