partition_pdf throws UnidentifiedImageError

279 views Asked by At

I'm trying to learn multimodal retrieval to provide a knowledge base for a chatbot. I'm trying to run this example in Google Colab, but the following block fails:

from lxml import html
from pydantic import BaseModel
from typing import Any, Optional
from unstructured.partition.pdf import partition_pdf

# Get elements
raw_pdf_elements = partition_pdf(filename=path+"Llama2.pdf",
                                 # Unstructured first finds embedded image blocks
                                 extract_images_in_pdf=False,
                                 # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
                                 # Titles are any sub-section of the document
                                 infer_table_structure=True,
                                 # Post processing to aggregate text once we have the title
                                 chunking_strategy="by_title",
                                 # Chunking params to aggregate text blocks:
                                 #   max_characters: upper bound on the size of a chunk
                                 #   new_after_n_chars: attempt to start a new chunk after 3800 chars
                                 #   combine_text_under_n_chars: attempt to keep chunks > 2000 chars
                                 max_characters=4000,
                                 new_after_n_chars=3800,
                                 combine_text_under_n_chars=2000,
                                 #image_output_dir_path=path
                                 )

It fails with this error: UnidentifiedImageError: cannot identify image file '/tmp/tmp7_ptk9_l/79a0d262-3288-4b70-99b3-c4f7674be008-01.ppm'

This is the full output of the block:

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
yolox_l0.05.onnx: 100%
217M/217M [00:01<00:00, 166MB/s]
---------------------------------------------------------------------------
UnidentifiedImageError                    Traceback (most recent call last)
<ipython-input-8-e21e15dc62ac> in <cell line: 7>()
      5 
      6 # Get elements
----> 7 raw_pdf_elements = partition_pdf(filename=path+"Llama2.pdf",
      8                                  # Unstructured first finds embedded image blocks
      9                                  extract_images_in_pdf=False,

10 frames
/usr/local/lib/python3.10/dist-packages/unstructured/documents/elements.py in wrapper(*args, **kwargs)
    512         @functools.wraps(func)
    513         def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 514             elements = func(*args, **kwargs)
    515             sig = inspect.signature(func)
    516             params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)

/usr/local/lib/python3.10/dist-packages/unstructured/file_utils/filetype.py in wrapper(*args, **kwargs)
    589         @functools.wraps(func)
    590         def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 591             elements = func(*args, **kwargs)
    592             sig = inspect.signature(func)
    593             params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)

/usr/local/lib/python3.10/dist-packages/unstructured/file_utils/filetype.py in wrapper(*args, **kwargs)
    544     @functools.wraps(func)
    545     def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 546         elements = func(*args, **kwargs)
    547         sig = inspect.signature(func)
    548         params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)

/usr/local/lib/python3.10/dist-packages/unstructured/chunking/__init__.py in wrapper(*args, **kwargs)
     50         @functools.wraps(func)
     51         def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
---> 52             elements = func(*args, **kwargs)
     53             sig = inspect.signature(func)
     54             params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)

/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py in partition_pdf(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, include_metadata, metadata_filename, metadata_last_modified, chunking_strategy, links, extract_images_in_pdf, extract_element_types, image_output_dir_path, **kwargs)
    189     languages = check_languages(languages, ocr_languages)
    190 
--> 191     return partition_pdf_or_image(
    192         filename=filename,
    193         file=file,

/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py in partition_pdf_or_image(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_last_modified, extract_images_in_pdf, extract_element_types, image_output_dir_path, **kwargs)
    503         with warnings.catch_warnings():
    504             warnings.simplefilter("ignore")
--> 505             elements = _partition_pdf_or_image_local(
    506                 filename=filename,
    507                 file=spooled_to_bytes_io_if_needed(file),

/usr/local/lib/python3.10/dist-packages/unstructured/utils.py in wrapper(*args, **kwargs)
    212                     ),
    213                 )
--> 214             return func(*args, **kwargs)
    215 
    216         return wrapper

/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py in _partition_pdf_or_image_local(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_mode, model_name, metadata_last_modified, pdf_text_extractable, extract_images_in_pdf, extract_element_types, image_output_dir_path, pdf_image_dpi, analysis, analyzed_image_output_dir_path, **kwargs)
    286 
    287     if file is None:
--> 288         inferred_document_layout = process_file_with_model(
    289             filename,
    290             is_image=is_image,

/usr/local/lib/python3.10/dist-packages/unstructured_inference/inference/layout.py in process_file_with_model(filename, model_name, is_image, fixed_layouts, extract_tables, pdf_image_dpi, **kwargs)
    402         )
    403         if is_image
--> 404         else DocumentLayout.from_file(
    405             filename,
    406             detection_model=detection_model,

/usr/local/lib/python3.10/dist-packages/unstructured_inference/inference/layout.py in from_file(cls, filename, fixed_layouts, pdf_image_dpi, **kwargs)
     76                 # NOTE(robinson) - In the future, maybe we detect the page number and default
     77                 # to the index if it is not detected
---> 78                 with Image.open(image_path) as image:
     79                     page = PageLayout.from_image(
     80                         image,

/usr/local/lib/python3.10/dist-packages/PIL/Image.py in open(fp, mode, formats)
   3281                 raise
   3282         return None
-> 3283 
   3284     im = _open_core(fp, filename, prefix, formats)
   3285 

UnidentifiedImageError: cannot identify image file '/tmp/tmp7_ptk9_l/79a0d262-3288-4b70-99b3-c4f7674be008-01.ppm'

Does the same happen to anyone else? Do you know how to fix it?

EDIT: I solved the problem by updating outdated packages as follows:

pip install tensorflow-probability imageio pillow

Then restart the runtime.

0

There are 0 answers