I need to extract text as well as images in sequential (reading) order using PyMuPDF, so I need to know between which text blocks each image is located. This is my simple image extraction code:
import fitz # Import PyMuPDF
import os
def extract_images(pdf_path, output_folder):
    """Save every image referenced by the pages of a PDF into a folder.

    Pages are visited in order and, for each page, every entry reported by
    ``doc.get_page_images`` is extracted and written to
    ``image_<page>_<n>.<ext>``, where ``<page>`` is the 1-based page number,
    ``<n>`` is a running counter over the whole document and ``<ext>`` is the
    extension PyMuPDF reports for the extracted data.

    Args:
        pdf_path: Path of the PDF file to open.
        output_folder: Directory to write the images to; created if missing.

    Returns:
        The number of image files written.
    """
    image_count = 0
    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as doc:
        # exist_ok avoids the race of a separate exists() check.
        os.makedirs(output_folder, exist_ok=True)
        for page_num in range(len(doc)):
            for img in doc.get_page_images(page_num):
                xref = img[0]  # first item of each entry is the image xref
                base_image = doc.extract_image(xref)
                # The bytes are in the image's native format (not necessarily
                # PNG), so name the file with the extension that matches them.
                image_filename = (
                    f"image_{page_num + 1}_{image_count + 1}.{base_image['ext']}"
                )
                image_filepath = os.path.join(output_folder, image_filename)
                with open(image_filepath, "wb") as img_file:
                    img_file.write(base_image["image"])
                image_count += 1
    return image_count
# Example run: extract all images of the sample PDF and report how many
# files were written.
pdf_file = "example.pdf"
output_dir = "output_directory"
num_images = extract_images(pdf_path=pdf_file, output_folder=output_dir)
print(f"Extracted {num_images} images.")
I have no idea how to do this. Please help me, experts.