Pdf to text conversion in python gives misses some values from pdf

242 views Asked by At

I am trying to convert a pdf file into text format in python. I managed to do it but in some of the cases the values are not in correct position. like if a value in middle of a paragraph goes in the end or no where in the converted text. Does any one encountered this? is this something to do with type of pdf i am dealing with. Any help or information will be appreciated. For better understanding i am adding images from pdf and my output.

pdf screenshot

output screenshot

We can see the value 600,95 is missing in text output. Here is the code i have used

---- code ----

 from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTTextBox, LTTextLine
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
    from pdfminer.pdfparser import PDFParser
    from PyPDF2 import PdfFileReader
    from urllib.request import urlopen
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from io import StringIO, BytesIO
    def readPDF(pdfFile):
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
            
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos=set()
            for page in PDFPage.get_pages(pdfFile, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
                interpreter.process_page(page)
        
            device.close()
            textstr = retstr.getvalue()
            retstr.close()
            return textstr
        
        if __name__ == "__main__":
            scrape = open("file location", 'rb')
            pdfFile = BytesIO(scrape.read())
            outputString = readPDF(pdfFile)
            print(outputString)
            pdfFile.close() 
1

There are 1 answers

1
darkness On

you don't really need that much imports as a fairly recent lib called pyPDF2 and as what I experience with it, it doesn't skip any bits. don't forget to install the library first pip install PyPDF2

import PyPDF2

# open the desired pdf
pdfFile=open('1.pdf','rb')

#read it and get the pages number
pdfreader=PyPDF2.PdfFileReader(pdfFile)
x=pdfreader.numPages
pageobj=pdfreader.getPage(x-1)

# extract the pdf to text
text=pageobj.extractText()

# save the output to a .txt
file1=open("1.txt","a")
file1.writelines(text)
file1.close()