Here is the code from my Jupyter notebook. It takes the PDF files in the data folder and extracts their text using the line `text = textract.process(pdf_path, method='pdfminer')`.
# Load the "cl100k_base" encoding from tiktoken (not used within this excerpt).
tokenizer = tiktoken.get_encoding("cl100k_base")
# Process each PDF file and prepare for embedding
for pdf_file in pdf_files:
    # Build the path to the file inside data_dir.
    pdf_path = os.path.join(data_dir,pdf_file)
    # Echo the path so the value handed to textract can be inspected.
    print(pdf_path)
    # Extract the raw text from each PDF using textract.
    # With method='pdfminer', textract launches the external pdf2txt.py
    # script in a subprocess (see the traceback in this post); this is the
    # call that raises the errors shown.
    text = textract.process(pdf_path, method='pdfminer')
I'm getting the following stack trace as a result. It seems like the filename isn't being passed properly at some point during this process. I've spent several hours looking at this and haven't made any progress:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
File c:\Python311\Lib\site-packages\textract\parsers\pdf_parser.py:54, in Parser.extract_pdfminer(self, filename, **kwargs)
53 try:
---> 54 stdout, _ = self.run(['pdf2txt.py', filename])
55 except OSError:
File c:\Python311\Lib\site-packages\textract\parsers\utils.py:87, in ShellParser.run(self, args)
86 try:
---> 87 pipe = subprocess.Popen(
88 args,
89 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
90 )
91 except OSError as e:
File c:\Python311\Lib\subprocess.py:1024, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize, process_group)
1021 self.stderr = io.TextIOWrapper(self.stderr,
1022 encoding=encoding, errors=errors)
-> 1024 self._execute_child(args, executable, preexec_fn, close_fds,
1025 pass_fds, cwd, env,
1026 startupinfo, creationflags, shell,
1027 p2cread, p2cwrite,
1028 c2pread, c2pwrite,
1029 errread, errwrite,
1030 restore_signals,
1031 gid, gids, uid, umask,
1032 start_new_session, process_group)
1033 except:
1034 # Cleanup if the child failed starting.
File c:\Python311\Lib\subprocess.py:1493, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session, unused_process_group)
1492 try:
-> 1493 hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
1494 # no special security
1495 None, None,
1496 int(not close_fds),
1497 creationflags,
1498 env,
1499 cwd,
1500 startupinfo)
1501 finally:
1502 # Child is launched. Close the parent's copy of those pipe
1503 # handles that only the child should have open. You need
(...)
1506 # pipe will not close when the child process exits and the
1507 # ReadFile will hang.
OSError: [WinError 193] %1 is not a valid Win32 application
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
File :12
File c:\Python311\Lib\site-packages\textract\parsers\__init__.py:79, in process(filename, input_encoding, output_encoding, extension, **kwargs)
76 # do the extraction
78 parser = filetype_module.Parser()
---> 79 return parser.process(filename, input_encoding, output_encoding, **kwargs)
File c:\Python311\Lib\site-packages\textract\parsers\utils.py:46, in BaseParser.process(self, filename, input_encoding, output_encoding, **kwargs)
36 """Process ``filename`` and encode byte-string with ``encoding``. This
37 method is called by :func:`textract.parsers.process` and wraps
38 the :meth:`.BaseParser.extract` method in `a delicious unicode
39 sandwich `_.
40
41 """
42 # make a "unicode sandwich" to handle dealing with unknown
43 # input byte strings and converting them to a predictable
44 # output encoding
45 # http://nedbatchelder.com/text/unipain/unipain.html#35
---> 46 byte_string = self.extract(filename, **kwargs)
47 unicode_string = self.decode(byte_string, input_encoding)
48 return self.encode(unicode_string, output_encoding)
File c:\Python311\Lib\site-packages\textract\parsers\pdf_parser.py:32, in Parser.extract(self, filename, method, **kwargs)
29 raise ex
31 elif method == 'pdfminer':
---> 32 return self.extract_pdfminer(filename, **kwargs)
33 elif method == 'tesseract':
34 return self.extract_tesseract(filename, **kwargs)
File c:\Python311\Lib\site-packages\textract\parsers\pdf_parser.py:57, in Parser.extract_pdfminer(self, filename, **kwargs)
55 except OSError:
56 try:
---> 57 stdout, _ = self.run(['python3',pdf2txt_path, filename])
58 except ShellError:
59 stdout, _ = self.run(['python2',pdf2txt_path, filename])
File c:\Python311\Lib\site-packages\textract\parsers\utils.py:87, in ShellParser.run(self, args)
85 # run a subprocess and put the stdout and stderr on the pipe object
86 try:
---> 87 pipe = subprocess.Popen(
88 args,
89 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
90 )
91 except OSError as e:
92 if e.errno == errno.ENOENT:
93 # File not found.
94 # This is equivalent to getting exitcode 127 from sh
File c:\Python311\Lib\subprocess.py:1024, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize, process_group)
1020 if self.text_mode:
1021 self.stderr = io.TextIOWrapper(self.stderr,
1022 encoding=encoding, errors=errors)
-> 1024 self._execute_child(args, executable, preexec_fn, close_fds,
1025 pass_fds, cwd, env,
1026 startupinfo, creationflags, shell,
1027 p2cread, p2cwrite,
1028 c2pread, c2pwrite,
1029 errread, errwrite,
1030 restore_signals,
1031 gid, gids, uid, umask,
1032 start_new_session, process_group)
1033 except:
1034 # Cleanup if the child failed starting.
1035 for f in filter(None, (self.stdin, self.stdout, self.stderr)):
File c:\Python311\Lib\subprocess.py:1433, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session, unused_process_group)
1431 args = list2cmdline([args])
1432 else:
-> 1433 args = list2cmdline(args)
1435 if executable is not None:
1436 executable = os.fsdecode(executable)
File c:\Python311\Lib\subprocess.py:608, in list2cmdline(seq)
606 result = []
607 needquote = False
--> 608 for arg in map(os.fsdecode, seq):
609 bs_buf = []
611 # Add a space to separate this argument from the others
File <frozen os>:824, in fsdecode(filename)
TypeError: expected str, bytes or os.PathLike object, not NoneType
So far I've mostly experimented with the input to `textract.process(pdf_path, method='pdfminer')`.
The first thing I noticed was that pdf_path had '.\' in front of the path, so I tried the input `textract.process('data\\' + pdf_file, method='pdfminer')` instead, but ended up with the same error.
I also tried passing in just pdf_file, like so: `textract.process(pdf_file, method='pdfminer')`, but got the following error:
---------------------------------------------------------------------------
MissingFileError Traceback (most recent call last)
File :12
File c:\Python311\Lib\site-packages\textract\parsers\__init__.py:41, in process(filename, input_encoding, output_encoding, extension, **kwargs)
39 # make sure the filename exists
40 if not os.path.exists(filename):
---> 41 raise exceptions.MissingFileError(filename)
43 # get the filename extension, which is something like .docx for
44 # example, and import the module dynamically using importlib. This
45 # is a relative import so the name of the package is necessary
46 # normally, file extension will be extracted from the file name
47 # if the file name has no extension, then the user can pass the
48 # extension as an argument
49 if extension:
MissingFileError: The file "f1040.pdf" can not be found.
Is this the right path/to/file/you/want/to/extract.pdf?
I've also tried adding more PDF files to the data folder, to check whether the issue was related to my using only one file. I've also tried renaming the PDF files, to see whether the issue had to do with the file name.