I have a PDF file containing tabular data on every page, but some rows of a table continue into the table on the next page, so the data does not appear correctly after extraction. Here is my PDF file. I want the data formatted so that each row appears intact and its contents do not spill into other rows.
Here is how the data currently appears in my DataFrame. The code I have written to produce it is as follows:
import pandas as pd
import tabula
import os
import numpy as np
import re
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
os.environ['JAVA_HOME'] = 'C:/Program Files/Java/jdk-21'
file_path = 'pdf path'
# Get the number of pages in the PDF
total_pages = tabula.read_pdf(file_path, pages="all", silent=True)
# Initialize an empty DataFrame to store the combined results
combined_df = pd.DataFrame()
# Iterate through each page
for i in range(1, len(total_pages) + 1):
try:
# Read tables from the current page with header starting at row 1
table = tabula.read_pdf(file_path, pages=i, pandas_options={'header': 1}, silent=True)
# Assuming the table[0] contains the DataFrame from the current page
df = table[0]
# Rename 'Unnamed: 0' column to 'Sl.No.'
if 'Unnamed: 0' in df.columns:
df.rename(columns={'Unnamed: 0': 'Sl.No.'}, inplace=True)
df.drop([0, 1], inplace=True)
# Check if 'Unnamed: 1' and 'Institute' columns exist together
if 'Unnamed: 1' in df.columns and 'Institute' in df.columns:
# Rename 'Unnamed: 1' to 'Institute' and drop 'Institute' column
df.drop(columns=['Institute'], inplace=True)
df.rename(columns={'Unnamed: 1': 'Institute'}, inplace=True)
# Group by 'Sl.No.' and aggregate to combine the rows
out = (
df.groupby(df['Sl.No.'].bfill())
.agg(lambda x: ' '.join(x.dropna().astype(str)))
)
# Concatenate the result to the combined DataFrame
combined_df = pd.concat([combined_df, out])
except Exception as e:
print(f"Error processing page {i}: {e}")
# Display the combined DataFrame
combined_df.reset_index(drop=True,inplace=True)