I'm using the tabula package to try to read this pdf: https://www.conass.org.br/wp-content/uploads/2022/01/RENAME-2022.pdf
import numpy as np
import pandas as pd
import tabula
import os
import jpype #!pip install jpype1
# Local PDF file to parse.
path = "RENAME-2022.pdf"

# Extract the tables on pages 28-78 as DataFrames.
# NOTE: the traceback below reports the 'utf-8' codec failing inside
# SubprocessTabula.call_tabula_java even when another encoding is passed.
tabula.read_pdf(
    input_path=path,
    output_format="dataframe",
    pages="28-78",
    encoding="latin-1",
)
I tried running this with different encodings (including none), such as 'ISO-8859-1', 'cp1252', 'latin-1' and 'utf-8', but they all return the same error:
UnicodeDecodeError Traceback (most recent call last)
Cell In[12], line 3
1 path = 'RENAME-2022.pdf'
2 initial_page = 28
----> 3 tabula.read_pdf(input_path = path,
4 output_format = "dataframe",
5 pages = '24-78',
6 encoding = 'ISO-8859-1')
File ~\anaconda3\Lib\site-packages\tabula\io.py:395, in read_pdf(input_path, output_format, encoding, java_options, pandas_options, multiple_tables, user_agent, use_raw_url, pages, guess, area, relative_area, lattice, stream, password, silent, columns, relative_columns, format, batch, output_path, force_subprocess, options)
392 raise ValueError(f"{path} is empty. Check the file, or download it manually.")
394 try:
--> 395 output = _run(
396 tabula_options,
397 java_options,
398 path,
399 encoding=encoding,
400 force_subprocess=force_subprocess,
401 )
402 finally:
403 if temporary:
File ~\anaconda3\Lib\site-packages\tabula\io.py:82, in _run(options, java_options, path, encoding, force_subprocess)
79 elif set(java_options) - IGNORED_JAVA_OPTIONS:
80 logger.warning("java_options is ignored until rebooting the Python process.")
---> 82 return _tabula_vm.call_tabula_java(options, path)
File ~\anaconda3\Lib\site-packages\tabula\backend.py:117, in SubprocessTabula.call_tabula_java(self, options, path)
115 if result.stderr:
116 logger.warning(f"Got stderr: {result.stderr.decode(self.encoding)}")
--> 117 return result.stdout.decode(self.encoding)
118 except FileNotFoundError:
119 raise JavaNotFoundError(JAVA_NOT_FOUND_ERROR)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 4: invalid continuation byte
I've searched for this error, but I couldn't find anything related specifically to the tabula package. Most of the suggestions were to use encoding 'latin-1', which also didn't work for me. I tried to adapt this solution ('utf-8' codec can't decode byte 0xe2 : invalid continuation byte error) to my case, but it also didn't work:
# Read the whole PDF into memory as raw bytes.
with open(path, 'rb') as fopen:
    q = fopen.read()

# NOTE: `q` holds the file's bytes, not a path. The traceback below shows
# read_pdf calling os.path.exists() on the localized input and raising
# FileNotFoundError, so this attempt fails before any decoding happens.
df = tabula.read_pdf(
    input_path=q,
    output_format="dataframe",
    pages="24-78",
    encoding="ISO-8859-1",
)
Returning:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[15], line 4
2 with open(path, 'rb') as fopen:
3 q = fopen.read()
----> 4 df = tabula.read_pdf(input_path = q,
5 output_format = "dataframe",
6 pages = '24-78',
7 encoding = 'ISO-8859-1')
File ~\anaconda3\Lib\site-packages\tabula\io.py:389, in read_pdf(input_path, output_format, encoding, java_options, pandas_options, multiple_tables, user_agent, use_raw_url, pages, guess, area, relative_area, lattice, stream, password, silent, columns, relative_columns, format, batch, output_path, force_subprocess, options)
386 path, temporary = localize_file(input_path, user_agent, use_raw_url=use_raw_url)
388 if not os.path.exists(path):
--> 389 raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
391 if os.path.getsize(path) == 0:
392 raise ValueError(f"{path} is empty. Check the file, or download it manually.")
I would appreciate any help. Thanks in advance.