Code not detecting numbers in a word doc when written over a line

38 views Asked by At

I'm a chemistry teacher trying to write a script that reads an Excel file containing my students' numbers and then looks for these numbers in each student's report (Word, Excel or PDF format). The script then names a copy of each report file according to the student numbers found in it.

The code works well; I used ChatGPT to write it since my own programming knowledge is very limited. The only issue is that the code can't extract numbers from a Word document when the digits are written on top of a line. I don't mean underlined text, but digits really sitting on a line (see the provided picture). Here are the code and the picture:

import os
import re
import shutil
import pandas as pd
from docx import Document
import fitz  # PyMuPDF library

# Function to extract numbers from text
def extract_numbers(text):
    """Return every run of consecutive digits found in ``text``.

    The runs are returned as strings, in order of appearance. A text
    without any digit yields an empty list.
    """
    digit_runs = [match.group(0) for match in re.finditer(r'\d+', text)]
    return digit_runs

# Function to find valid numbers in a given text
def find_valid_numbers(text, valid_numbers):
    """Return the digit runs of ``text`` that also appear in ``valid_numbers``.

    Matches are kept in order of appearance and duplicates are preserved;
    the result is empty when no digit run of ``text`` is a valid number.
    """
    matches = []
    for candidate in extract_numbers(text):
        if candidate in valid_numbers:
            matches.append(candidate)
    return matches

# Input and output folders
input_folder = 'keeping this private :)'  # Change this to your input folder path
output_folder = 'also keeping this private :)'  # Change this to your output folder path
valid_numbers_file = 'liste_etudiant.xlsx'  # Excel file containing the list of valid numbers

# Read the entire Excel file into a DataFrame (header=None: the first row is data too)
valid_numbers_df = pd.read_excel(valid_numbers_file, header=None)

# Flatten the DataFrame into a list of student numbers stored as strings.
# Empty cells are skipped, and whole-number floats are written without the
# trailing ".0": pandas stores a column that contains blanks as floats, so a
# plain str() would give '1234567.0', which never equals the digit run
# '1234567' extracted from a report.
valid_numbers_list = [
    str(int(value)) if isinstance(value, float) and value.is_integer() else str(value).strip()
    for value in valid_numbers_df.values.flatten()
    if not pd.isna(value)
]

# Function to extract text from a PDF file using PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined together. If opening or reading the
        file fails, the error is printed and whatever text was collected
        before the failure is returned (an empty string if nothing was read).
    """
    page_texts = []
    try:
        # The context manager closes the document even when a page fails to
        # parse, so file handles are not leaked over a large batch.
        with fitz.open(pdf_path) as pdf_document:
            for page in pdf_document:
                page_texts.append(page.get_text())
    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole run.
        print(f"Error extracting text from {pdf_path}: {str(e)}")
    return "".join(page_texts)

# XML namespace prefix of WordprocessingML elements (<w:p>, <w:t>, <w:tab>, ...)
_W_NS = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'


def _extract_text_from_docx(docx_path):
    """Return the text of every paragraph found anywhere in a Word document body.

    ``Document.paragraphs`` only lists the top-level paragraphs, so text typed
    in a table cell, a text box or a content control (common ways of putting
    digits "on a line" in a form) is not seen. Walking the XML tree reaches
    all of them.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        One line of text per paragraph, joined with newlines.
    """
    doc = Document(docx_path)
    separator_tags = {_W_NS + 'tab', _W_NS + 'br', _W_NS + 'cr'}
    lines = []
    for paragraph in doc.element.body.iter(_W_NS + 'p'):
        pieces = []
        for node in paragraph.iter():
            if node.tag == _W_NS + 't':
                # Runs are joined without a separator: Word may split a
                # single number across several runs.
                pieces.append(node.text or '')
            elif node.tag in separator_tags:
                # Tabs and line breaks separate words, keep digits apart
                pieces.append(' ')
            elif node.tag == _W_NS + 'p' and node is not paragraph:
                # Paragraph nested in a text box: do not glue it to the outer text
                pieces.append('\n')
        lines.append(''.join(pieces))
    return '\n'.join(lines)


# Iterate through the files in the input folder
for filename in os.listdir(input_folder):
    file_path = os.path.join(input_folder, filename)
    # Compare extensions case-insensitively so 'REPORT.PDF' is not skipped
    lowered_name = filename.lower()

    try:
        if lowered_name.endswith('.docx'):
            # Read and process Word documents (body, tables and text boxes)
            doc_text = _extract_text_from_docx(file_path)
            valid_numbers_found = set(find_valid_numbers(doc_text, valid_numbers_list))
        elif lowered_name.endswith('.xlsx'):
            # Read and process Excel documents
            df = pd.read_excel(file_path, header=None)
            excel_values = df.values.flatten().astype(str).tolist()
            valid_numbers_found = set(find_valid_numbers(' '.join(excel_values), valid_numbers_list))
        elif lowered_name.endswith('.pdf'):
            # Read and process PDF documents
            pdf_text = extract_text_from_pdf(file_path)
            valid_numbers_found = set(find_valid_numbers(pdf_text, valid_numbers_list))
        else:
            # Skip unsupported file types
            print(f"Skipping: {filename} (Unsupported file type)")
            continue

        if valid_numbers_found:
            # Construct the new filename using the found numbers separated by a hyphen.
            # Sorting makes the name reproducible: a set has no stable order between runs.
            new_filename = '-'.join(sorted(valid_numbers_found)) + '_Rapport' + os.path.splitext(filename)[1]

            # Copy the file to the output folder with the new filename
            shutil.copy(file_path, os.path.join(output_folder, new_filename))

            print(f"Processed: {filename} -> {new_filename}")
        else:
            print(f"Skipping: {filename} (Could not find valid numbers in the document)")
    except Exception as e:
        # Report the failure and carry on with the remaining files
        print(f"Error processing {filename}: {str(e)}")

print("Processing complete.")

provided picture

I tried using ChatGPT for some troubleshooting, but none of the suggested solutions worked. Like I said, I can read and understand most simple code, but I'm not equipped to solve this problem.

Thank you for your help!

0

There are 0 answers