I'm trying to write a script that extracts some data from a PDF file ([example](https://drive.google.com/file/d/1kvhLPgdHHvluTtdVUIQ1GTDjiUFQDgSA/view?usp=drive_link)).
The data I need to extract is:
"SERVICIOS AGROPECUARIOS CUYO SA", "HUMBERTO PRIMO N° 251/0 Piso: 10 Depto: 1 CP: (5800) - RIO CUARTO - CORDOBA", "7.564,40", "30/06/2022", "04/2012", "10/2021", "30711982333" and "2001792"
I've tried these two approaches, but neither of them is working.
import fitz # PyMuPDF
from docx import Document
import os
# Both names hold the directory the script was launched from.
# NOTE(review): ruta_pdf is a directory here, yet it is the value handed to
# extraer_datos_pdf, which opens it as a document -- presumably it should be
# the path of a PDF file; confirm.
ruta_pdf = current_directory = os.getcwd()
def extraer_datos_pdf(ruta_pdf):
    """Extract the company name, address and document number from a PDF.

    Every text span of every page is inspected. A span that starts with
    ``EMPRESA`` or ``DOMICILIO`` supplies the text after its first ``:``;
    a span that starts with ``ACTA N`` supplies the text after its first
    ``°``. When a label occurs more than once, the last occurrence wins.

    Args:
        ruta_pdf: Path of the document to open with PyMuPDF (``fitz``).

    Returns:
        A ``(nombre, domicilio, numero_documento)`` tuple of strings. A
        field keeps its placeholder value when no span carries both its
        label and the expected separator.
    """
    nombre = "Nombre no encontrado"
    domicilio = "Domicilio no encontrado"
    numero_documento = "123"

    # The context manager closes the document even if parsing raises.
    with fitz.open(ruta_pdf) as documento:
        for pagina in documento:
            for bloque in pagina.get_text("dict")["blocks"]:
                # Only blocks of type 0 are read; they are the ones whose
                # "lines"/"spans" entries are accessed below.
                if bloque["type"] != 0 or bloque["bbox"] is None:
                    continue
                for linea in bloque["lines"]:
                    for tramo in linea["spans"]:
                        texto = tramo["text"].strip()
                        if texto.startswith("EMPRESA"):
                            nombre = _valor_tras(texto, ":", nombre)
                        elif texto.startswith("DOMICILIO"):
                            domicilio = _valor_tras(texto, ":", domicilio)
                        elif texto.startswith("ACTA N"):
                            numero_documento = _valor_tras(
                                texto, "°", numero_documento
                            )

    return nombre, domicilio, numero_documento


def _valor_tras(texto, separador, por_defecto):
    """Return the stripped text after the first *separador* in *texto*.

    *por_defecto* is returned unchanged when *separador* does not occur, so
    a label span without its separator no longer raises ``IndexError``.
    """
    _, encontrado, resto = texto.partition(separador)
    return resto.strip() if encontrado else por_defecto
import fitz # PyMuPDF
from docx import Document
import os
import camelot
import tabula
# Directory the script was launched from; every path below derives from it.
current_directory = os.getcwd()
# NOTE(review): ruta_pdf is a directory here, yet it is the value handed to
# extraer_datos_pdf, which reads it as a PDF -- presumably it should be the
# path of a PDF file; confirm.
directorio = ruta_pdf = current_directory
# Word template located next to the script's working directory.
template_path = os.path.join(directorio, "modelo.docx")
def extraer_datos_pdf(ruta_pdf):
    """Read the company name, address and document number from PDF tables.

    The tables are obtained with ``camelot.read_pdf`` using the ``stream``
    flavor. Each cell of each table row is checked, in this order, for the
    labels ``EMPRESA``, ``DOMICILIO`` and ``ACTA N°``; on a match the value
    is taken from the next cell of the same row. When a label occurs more
    than once, the last occurrence wins.

    Args:
        ruta_pdf: Path passed to ``camelot.read_pdf``.

    Returns:
        A ``(nombre, domicilio, numero_documento)`` tuple. A field keeps its
        placeholder when its label is never seen, and becomes
        ``"Valor no encontrado"`` when the label sits in the last cell of
        its row.
    """
    tablas = camelot.read_pdf(ruta_pdf, flavor='stream')

    sin_valor = "Valor no encontrado"
    nombre = "Nombre no encontrado"
    domicilio = "Domicilio no encontrado"
    numero_documento = "123"

    # Nothing detected: hand back the placeholders untouched.
    if not tablas:
        return nombre, domicilio, numero_documento

    for tabla in tablas:
        for _, fila in tabla.df.iterrows():
            ancho = len(fila)
            for posicion, celda in enumerate(fila):
                siguiente = posicion + 1
                # NOTE(review): fila[siguiente] is a lookup on the row by
                # key; this presumably relies on the DataFrame having
                # integer column labels 0..n-1 -- confirm.
                if "EMPRESA" in celda:
                    nombre = fila[siguiente] if siguiente < ancho else sin_valor
                elif "DOMICILIO" in celda:
                    domicilio = fila[siguiente] if siguiente < ancho else sin_valor
                elif "ACTA N°" in celda:
                    numero_documento = (
                        fila[siguiente] if siguiente < ancho else sin_valor
                    )

    return nombre, domicilio, numero_documento