How to read table from this particular PDF - nothing works: tabula.io, pdfplumber, camelot

127 views Asked by At

I am trying to read a table using Python from this PDF file Tedenski-jedilnik-od-5.pdf

But nothing seems to work for me, I have tried tabula.io, camelot (this doesn't even run due to some libraries incompatibilities), pdfplumber. But the output is not really usable to me.

What else should I try? How could I create a pandas dataframe out of this table in this PDF?

Thanks.

2

There are 2 answers

0
Py_Waterloo On

I had some code and modified it for your purpose. It outputs a table with the data from your example pdf. Hope this is already helpful. Just modify the self.path = '/the/dir/of/the/pdf' e.g. '/home/.../Downloads/.

import numpy as np
import pdfplumber
import ImageMagic
import ghostscript
import pandas as pd
from tabulate import tabulate
import re
import datetime

class Table_PDF_extractor:
    """
    Extract info from  PDF into dataframe.
    """

    def __init__(self, pdf_file):
        self.pdf_file = pdf_file
        self.path = '/home/.../Downloads/'
        self.pdf = pdfplumber.open(self.path + self.pdf_file)
        self.pgNbr = len(self.pdf.pages)

    def __str__(self):
        state1 = f"The pdf name is {self.pdf_file} and the reporting period is {self.extractInfo()}"
        state2 = f"The pdf has {self.pgNbr} page(s)."
        return state1 + "\n" + state2

    def getPos(self, pg=0):
        page = self.pdf.pages[pg]
        rows = page.extract_text_lines()
        for row in rows:
            # print(row['text'])
            if row["text"].startswith("PONEDELJEK"):
                top = row["top"]
            if row["text"].startswith("* Za otroke"):
                bottom = row["bottom"]
        return top, bottom

    def extractInfo(self):
        page = self.pdf.pages[0]
        rows = page.extract_text_lines()
        for row in rows:
            if row["text"].startswith("TEDENSKI JEDILNIK OD"):
                period = row["text"]
        pattern = re.compile(r"(\d{1}. \d{2}. \d{4})|(\d{2}. \d{2}. \d{4})")
        lst = list()
        # find all matches to groups
        for match in pattern.finditer(period):
            res = [i for i in match.group(1,2) if i is not None]
            lst.append(res[0])
        dates = [datetime.datetime.strptime(a_date, "%d. %m. %Y") for a_date in lst]
        return (
            str(dates[0].strftime("%Y-%m-%d"))
            + " - "
            + str(dates[1].strftime("%Y-%m-%d"))
        )


    def doIt_2ndPage(self, pg, top, bottom):
        page = self.pdf.pages[pg]
        table_settings = {
            "vertical_strategy": "explicit",
            "horizontal_strategy": "explicit",
            "explicit_vertical_lines": [24, 120, 230, 335, 480, 570],
            "explicit_horizontal_lines": [top - 3, bottom - 11],
        }
        # page.to_image(resolution=300).reset().debug_tablefinder(table_settings).show()
        bounding_box = (24, top -  3 , 570, bottom - 11)
        page.crop(bounding_box).to_image(resolution=150).outline_words().show()

        start = min([x["x0"] for x in page.crop(bounding_box).horizontal_edges])
        end = max([x["x1"] for x in page.crop(bounding_box).horizontal_edges])

        table_settings = {
            "vertical_strategy": "explicit",
            "horizontal_strategy": "lines",
            "snap_y_tolerance":3,
            "explicit_vertical_lines": page.curves + page.edges + [start, end],
        }
        table = page.crop(bounding_box).extract_table(table_settings)
        table_out = [row for row in table if "".join([str(i) for i in row]) != ""]
        df =pd.DataFrame(table_out,columns=['0','1','2','3','4','5','6','7'])
        dfn = df[['0','1','2','3','5','6']]
        dfn = dfn.astype('string')
        # replace field that's entirely space (or empty) with NaN
        dfn.replace(r'^\s*$', np.nan, regex=True, inplace=True)
        dfn.dropna(subset='0', inplace=True)
        dfn = dfn[['0','1','2','5','6']]
        return dfn

    def putItTogether(self):
        return self.doIt_2ndPage(0, self.getPos()[0], self.getPos()[1])

out = Table_PDF_extractor("Tedenski-jedilnik-od-5.pdf").putItTogether()
print(tabulate(out.iloc[:,0:7],headers=out.columns, tablefmt="grid"))
0
Py_Waterloo On

How many different layouts are there? Here an adapted version of the code for this type of pdf layout. Hope it will work for you.

import numpy as np
import pdfplumber
import ImageMagic
import ghostscript
import pandas as pd
from tabulate import tabulate
import re
import datetime

class Table_PDF_extractor:
    """
    Extract info from  PDF into dataframe.
    """

    def __init__(self, pdf_file):
        # instance variables
        self.pdf_file = pdf_file
        self.path = '/home/.../Downloads/'
        self.pdf = pdfplumber.open(self.path + self.pdf_file)
        self.pgNbr = len(self.pdf.pages)

    def __str__(self):
        state1 = f"The pdf name is {self.pdf_file} and the reporting period is {self.extractInfo()}"
        state2 = f"The pdf has {self.pgNbr} page(s)."
        return state1 + "\n" + state2

    def getPos(self, pg=0):
        page = self.pdf.pages[pg]
        rows = page.extract_text_lines()
        for row in rows:
            # print(row['text'])
            if "PONEDELJEK" in row["text"]:
                top_1 = row["top"]
                x0_1 = row["x0"]
            if row["text"].startswith("* Za otroke") | row["text"].startswith("Legenda"):
                bottom = row["bottom"]
                
        vert_words = page.extract_words()
        for word in vert_words:
            # print(word)
            if "TOREK" in word["text"]:
                x0_2 = word["x0"]
            if "SREDA" in word["text"]:
                x0_3 = word["x0"]
            if "ČETRTEK" in word["text"]:
                top_2 = word["top"]
        return (top_1, top_2, bottom, x0_1, x0_2, x0_3)

    def extractInfo(self):
        listOfWeekDays = self.doIt_2ndPage(0)[1] 
        weekDay = [_.split(',', 1)[1] for _ in listOfWeekDays[:5]]
        pattern = re.compile(r"(\d{1}. \d{2}. \d{4})|(\d{2}. \d{2}. \d{4})")
        lst = []
        for date in weekDay:
            lst.append(re.search(pattern,date).group())
        dates = [datetime.datetime.strptime(a_date, "%d. %m. %Y") for a_date in lst]
        return (
            str(min(dates).strftime("%Y-%m-%d"))
            + " - "
            + str(max(dates).strftime("%Y-%m-%d")))


    def doIt_2ndPage(self, pg=0):
        top_1, top_2, bottom, x0_1, x0_2, x0_3 = self.getPos(0) 
        page = self.pdf.pages[pg]
        table_settings = {
            "vertical_strategy": "explicit",
            "horizontal_strategy": "explicit",
            "explicit_vertical_lines": [x0_1, x0_2, x0_3, page.width-25],
            "explicit_horizontal_lines": [top_1 - 3, top_2, bottom - 11],}
        bounding_box = (x0_1, top_1 -  3 , page.width-25, bottom - 11)
        table = page.crop(bounding_box).extract_table(table_settings)
        lst = []
        lst_days = []
        for row in table:
            for i in row:
                lst.append([i])
                lst_days.append(i.splitlines()[0])

         
        df = pd.DataFrame(lst, columns=['A']) 
        return df, lst_days

    def putItTogether(self):
        return self.doIt_2ndPage(0)[0] 

out = Table_PDF_extractor("KOKRICA.pdf").putItTogether()
print(tabulate(out, tablefmt="grid"))
out2 = Table_PDF_extractor("KOKRICA.pdf")
print(out2)