I'm learning multimodal retrieval in order to build a knowledge base for a chatbot. I'm running this example in Google Colab, but the following block fails:
from lxml import html
from pydantic import BaseModel
from typing import Any, Optional
from unstructured.partition.pdf import partition_pdf
# Partition the PDF into a list of elements, chunked by section title.
# NOTE(review): `path` is not defined in this cell; it must be set earlier in the
# notebook and, because it is joined with `+`, is presumably expected to end with "/".
raw_pdf_elements = partition_pdf(filename=path+"Llama2.pdf",
# Image extraction is disabled here, so embedded image blocks are not saved
extract_images_in_pdf=False,
# Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
# Titles are any sub-section of the document
infer_table_structure=True,
# Post-processing: aggregate text under each title into chunks
chunking_strategy="by_title",
# Chunking params to aggregate text blocks (semantics per the Unstructured
# chunking docs - TODO confirm against the installed version):
# max_characters: upper limit on the size of a chunk (4000 chars)
# new_after_n_chars: attempt to start a new chunk after 3800 chars
# combine_text_under_n_chars: attempt to merge sections shorter than 2000 chars
max_characters=4000,
new_after_n_chars=3800,
combine_text_under_n_chars=2000,
#image_output_dir_path=path
)
And it prints this error: UnidentifiedImageError: cannot identify image file '/tmp/tmp7_ptk9_l/79a0d262-3288-4b70-99b3-c4f7674be008-01.ppm'
This is the full output of the block:
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /root/nltk_data...
[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.
yolox_l0.05.onnx: 100%
217M/217M [00:01<00:00, 166MB/s]
---------------------------------------------------------------------------
UnidentifiedImageError Traceback (most recent call last)
<ipython-input-8-e21e15dc62ac> in <cell line: 7>()
5
6 # Get elements
----> 7 raw_pdf_elements = partition_pdf(filename=path+"Llama2.pdf",
8 # Unstructured first finds embedded image blocks
9 extract_images_in_pdf=False,
10 frames
/usr/local/lib/python3.10/dist-packages/unstructured/documents/elements.py in wrapper(*args, **kwargs)
512 @functools.wraps(func)
513 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 514 elements = func(*args, **kwargs)
515 sig = inspect.signature(func)
516 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
/usr/local/lib/python3.10/dist-packages/unstructured/file_utils/filetype.py in wrapper(*args, **kwargs)
589 @functools.wraps(func)
590 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 591 elements = func(*args, **kwargs)
592 sig = inspect.signature(func)
593 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
/usr/local/lib/python3.10/dist-packages/unstructured/file_utils/filetype.py in wrapper(*args, **kwargs)
544 @functools.wraps(func)
545 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 546 elements = func(*args, **kwargs)
547 sig = inspect.signature(func)
548 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
/usr/local/lib/python3.10/dist-packages/unstructured/chunking/__init__.py in wrapper(*args, **kwargs)
50 @functools.wraps(func)
51 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
---> 52 elements = func(*args, **kwargs)
53 sig = inspect.signature(func)
54 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py in partition_pdf(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, include_metadata, metadata_filename, metadata_last_modified, chunking_strategy, links, extract_images_in_pdf, extract_element_types, image_output_dir_path, **kwargs)
189 languages = check_languages(languages, ocr_languages)
190
--> 191 return partition_pdf_or_image(
192 filename=filename,
193 file=file,
/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py in partition_pdf_or_image(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_last_modified, extract_images_in_pdf, extract_element_types, image_output_dir_path, **kwargs)
503 with warnings.catch_warnings():
504 warnings.simplefilter("ignore")
--> 505 elements = _partition_pdf_or_image_local(
506 filename=filename,
507 file=spooled_to_bytes_io_if_needed(file),
/usr/local/lib/python3.10/dist-packages/unstructured/utils.py in wrapper(*args, **kwargs)
212 ),
213 )
--> 214 return func(*args, **kwargs)
215
216 return wrapper
/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py in _partition_pdf_or_image_local(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_mode, model_name, metadata_last_modified, pdf_text_extractable, extract_images_in_pdf, extract_element_types, image_output_dir_path, pdf_image_dpi, analysis, analyzed_image_output_dir_path, **kwargs)
286
287 if file is None:
--> 288 inferred_document_layout = process_file_with_model(
289 filename,
290 is_image=is_image,
/usr/local/lib/python3.10/dist-packages/unstructured_inference/inference/layout.py in process_file_with_model(filename, model_name, is_image, fixed_layouts, extract_tables, pdf_image_dpi, **kwargs)
402 )
403 if is_image
--> 404 else DocumentLayout.from_file(
405 filename,
406 detection_model=detection_model,
/usr/local/lib/python3.10/dist-packages/unstructured_inference/inference/layout.py in from_file(cls, filename, fixed_layouts, pdf_image_dpi, **kwargs)
76 # NOTE(robinson) - In the future, maybe we detect the page number and default
77 # to the index if it is not detected
---> 78 with Image.open(image_path) as image:
79 page = PageLayout.from_image(
80 image,
/usr/local/lib/python3.10/dist-packages/PIL/Image.py in open(fp, mode, formats)
3281 raise
3282 return None
-> 3283
3284 im = _open_core(fp, filename, prefix, formats)
3285
UnidentifiedImageError: cannot identify image file '/tmp/tmp7_ptk9_l/79a0d262-3288-4b70-99b3-c4f7674be008-01.ppm'
Does the same happen to anyone else? Do you know how to fix it?
EDIT: I solved the problem by upgrading the outdated packages as follows:
pip install --upgrade tensorflow-probability imageio pillow
Then restart the runtime so that the upgraded packages are loaded.