forked from CommerceDataService/patent-process-and-search
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_pdf.py
More file actions
24 lines (20 loc) · 788 Bytes
/
parse_pdf.py
File metadata and controls
24 lines (20 loc) · 788 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
'''
A parser for PDF documents that uses the doc_processing_toolkit from 18F:
https://github.com/18f/doc_processing_toolkit
This needs to be used because Tika is only able to parse a portion of our files.
'''
import sys
from textextraction.extractors import text_extractor
import os.path
# Script body: extract the text of the document named by the last
# command-line argument.
#
# Without this guard, running the script with no arguments would make
# sys.argv[-1] the script's own path, and the extractor would be pointed at
# this source file instead of a document.
if len(sys.argv) < 2:
    sys.exit('Usage: {} <path-to-document>'.format(sys.argv[0]))

filepath = sys.argv[-1]

# Relative paths are resolved against the directory containing this script
# (not the current working directory). os.path.join discards base_path when
# filepath is already absolute, so absolute paths pass through unchanged.
base_path = os.path.abspath(os.path.dirname(__file__))
full_filepath = os.path.join(base_path, filepath)

try:
    # this will create a file by the same name, in the same location, with the .txt extension
    file_contents = text_extractor(doc_path=full_filepath, force_convert=False)
except FileNotFoundError as e:
    print('Path {} does not exist'.format(full_filepath))
    print(e)
except Exception as e:
    # Top-level boundary of the script: report any other failure instead of
    # letting a traceback escape.
    print('Something went wrong. :(')
    print(e)