Extract text as well as images sequentially using PyMuPDF

198 views Asked by At

I need to extract text as well as images sequentially (in the order they appear on the page) using PyMuPDF. To do that, I have to know where each image is located relative to the text blocks, i.e. between which two text blocks it sits. This is my simple image extraction code:

import fitz  # Import PyMuPDF
import os

def extract_images(pdf_path, output_folder):
    """Save every image referenced by the pages of a PDF into a folder.

    Files are named ``image_<page>_<n>.<ext>`` where ``<page>`` is the
    1-based page number, ``<n>`` is a 1-based counter that keeps running
    across the whole document (it does not restart on each page), and
    ``<ext>`` is the image type PyMuPDF reports for the extracted bytes.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory the images are written to; it is created
            if it does not exist yet.

    Returns:
        The number of image files written.
    """
    doc = fitz.open(pdf_path)
    image_count = 0

    # Close the document even if an extraction or a file write fails.
    try:
        os.makedirs(output_folder, exist_ok=True)

        for page_num in range(len(doc)):
            for img in doc.get_page_images(page_num):
                xref = img[0]  # xref number
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                # The bytes are in the image's own stored format (often
                # JPEG), so take the file extension from the extraction
                # result instead of always labelling the file ".png".
                image_ext = base_image["ext"]

                image_filename = (
                    f"image_{page_num + 1}_{image_count + 1}.{image_ext}"
                )
                image_filepath = os.path.join(output_folder, image_filename)

                with open(image_filepath, "wb") as img_file:
                    img_file.write(image_bytes)

                image_count += 1
    finally:
        doc.close()

    return image_count

# Example run: extract the images of a sample PDF into a folder and report
# how many files were written.
pdf_file, output_dir = 'example.pdf', 'output_directory'
num_images = extract_images(pdf_file, output_dir)
print(f"Extracted {num_images} images.")

I don't have any idea how to find out where each image sits relative to the surrounding text blocks. Please help me, experts.

0

There are 0 answers