Problems with the order in which PDF files are created

15 views Asked by At

I created an app that generates keyword-searchable PDF files with Tesseract, but in the resulting PDF a page ends up out of order every few dozen pages. How can I get the PDF file to be created in the order in which the image files were read?

I used pool.map to preserve the order, but it did not help.

from multiprocessing import Pool, cpu_count
import pytesseract
import PyPDF2
import io
from datetime import datetime
import os
from PySide6.QtWidgets import QApplication, QFileDialog
import sys

# Point pytesseract at the Tesseract executable. The path is hard-coded to a
# Windows install location; adjust it if Tesseract is installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def process_image(args):
    """Run OCR on one image and return the result as searchable-PDF data.

    Args:
        args: An ``(index, file_path)`` tuple. A single tuple parameter is
            used so the function can be handed directly to ``Pool.map``.

    Returns:
        An ``(index, file_path, page)`` tuple. ``page`` is whatever
        ``pytesseract.image_to_pdf_or_hocr`` returned for the image, or
        ``None`` if OCR raised an exception.
    """
    index, file_path = args
    try:
        page = pytesseract.image_to_pdf_or_hocr(
            file_path, extension='pdf', lang='eng+kor'
        )
    except Exception as exc:
        # Best-effort: one unreadable image must not abort the whole batch,
        # but the failure has to be visible, otherwise the page silently
        # disappears from the final PDF.
        print(f"\nOCR failed for {file_path}: {exc}", file=sys.stderr)
        return (index, file_path, None)
    return (index, file_path, page)

def create_searchable_pdf_and_doc(files):
    """OCR the given images in parallel and merge them into one PDF.

    Every path in ``files`` is processed by ``process_image`` in a worker
    pool, and the resulting pages are appended to the output in the same
    order as ``files``. The merged document is written to the current
    working directory as ``<parent directory of files[0]>_<YYYYMMDD>.pdf``.

    Args:
        files: Sequence of image file paths, in the desired page order.

    Raises:
        ValueError: If ``files`` is empty.
    """
    # Fail early: the output name is derived from files[0], and there is no
    # point in starting worker processes for nothing.
    if not files:
        raise ValueError("files must contain at least one image path")

    pdf_writer = PyPDF2.PdfWriter()
    total_files = len(files)

    # Never start more workers than there are images to process.
    with Pool(processes=min(cpu_count(), total_files)) as pool:
        # Pool.map returns results in the order of its input, regardless of
        # which worker finishes first, so no re-sorting is needed.
        results = pool.map(process_image, list(enumerate(files)))

    skipped = []
    for position, (_, file_path, page) in enumerate(results, start=1):
        # Progress indicator, rewritten in place on a single line.
        print(f"\rProcessing: {position}/{total_files}", end="")
        sys.stdout.flush()

        if not page:
            # OCR produced nothing for this image; remember it so the gap in
            # the output can be reported instead of going unnoticed.
            skipped.append(file_path)
            continue

        pdf = PyPDF2.PdfReader(io.BytesIO(page))
        for pdf_page in pdf.pages:
            pdf_writer.add_page(pdf_page)

    print("\nAll files have been processed. Compiling into PDF...")

    if skipped:
        print(
            f"Warning: {len(skipped)} file(s) produced no OCR output "
            "and are missing from the PDF:",
            file=sys.stderr,
        )
        for file_path in skipped:
            print(f"  {file_path}", file=sys.stderr)

    # Output name: parent directory of the first image plus today's date.
    today_date = datetime.now().strftime("%Y%m%d")
    directory_name = os.path.basename(os.path.dirname(files[0]))
    final_pdf_name = f"{directory_name}_{today_date}.pdf"

    with open(final_pdf_name, "wb") as f_out:
        pdf_writer.write(f_out)
    print(f"PDF file created: {final_pdf_name}")

def select_files():
    """Show a file dialog and return the image paths the user picked.

    Returns:
        The list reported by ``QFileDialog.selectedFiles()`` when the dialog
        is accepted, otherwise an empty list.
    """
    # Keep a reference to the application object while the dialog is shown.
    qt_app = QApplication(sys.argv)

    picker = QFileDialog()
    picker.setFileMode(QFileDialog.ExistingFiles)
    picker.setNameFilter("Images (*.png *.xpm *.jpg *.jpeg *.bmp *.gif)")

    accepted = picker.exec()
    # NOTE(review): the paths are returned in whatever order the dialog
    # reports them; nothing here sorts them - confirm that this matches the
    # intended page order.
    return picker.selectedFiles() if accepted else []

if __name__ == "__main__":
    # Script entry point: let the user pick images, then build the PDF.
    # The guard also keeps worker processes started by multiprocessing from
    # re-running this section when they import the module.
    selected_files = select_files()
    if selected_files:
        create_searchable_pdf_and_doc(selected_files)
    else:
        print("No file was selected.")

How can I get the PDF file to be created in the order in which the image files were read?

0

There are 0 answers