1818 service_logger ,
1919 QUEUES_NAMES ,
2020)
21+ from domain .PdfFile import PdfFile
2122from domain .ResultMessage import ResultMessage
2223from domain .Task import Task
2324from use_cases .extract_segments_use_case import ocr_pdf , get_xml_name , extract_segments
@@ -33,6 +34,23 @@ def get_failed_results_message(task: Task, message: str) -> ResultMessage:
3334 )
3435
3536
def is_valid_pdf(filepath):
    """Cheaply check whether *filepath* looks like a complete PDF file.

    The check is purely structural and does not parse the document:

    * the first five bytes must be the ``%PDF-`` header, and
    * the ``%%EOF`` marker must appear within the last 1024 bytes.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        ``True`` when both markers are present; ``False`` otherwise,
        including when the file is missing, unreadable, or the path is not
        usable at all. This function never raises for those cases.
    """
    header = b"%PDF-"
    eof_marker = b"%%EOF"
    tail_size = 1024

    try:
        with open(filepath, "rb") as f:
            if f.read(len(header)) != header:
                return False

            # Find the file size, then read at most `tail_size` trailing bytes.
            # Seeking to a negative offset relative to the end raises OSError
            # when the file is shorter than the offset, which used to make
            # every small (but valid) PDF look invalid; clamp to 0 instead.
            f.seek(0, 2)  # whence=2 -> relative to end of file
            file_size = f.tell()
            f.seek(max(file_size - tail_size, 0))
            return eof_marker in f.read(tail_size)
    except (OSError, TypeError, ValueError):
        # OSError covers missing files, directories and permission problems;
        # TypeError/ValueError cover unusable path arguments (e.g. None or a
        # path with an embedded NUL byte).
        return False
52+
53+
3654def ocr_pdf_task (task ):
3755 ocr_pdf (task )
3856
@@ -53,24 +71,26 @@ def process(message):
5371 try :
5472 task = Task (** message )
5573 except ValidationError :
56- service_logger .error (f"validation error : { message } " , exc_info = True )
74+ service_logger .error (f"The message was incorrectly formatted : { message } " )
5775 return None
5876
5977 try :
6078 service_logger .info (f"Processing Redis message: { message } " )
6179
80+ if not is_valid_pdf (PdfFile (task .tenant ).get_path (task .params .filename )):
81+ extraction_message = get_failed_results_message (task , f"The file does not appear to be a valid PDF" )
82+ service_logger .info (extraction_message .model_dump_json ())
83+ return extraction_message .model_dump_json ()
84+
6285 if task .task == "ocr" :
6386 return ocr_pdf_task (task )
6487
6588 return process_task (task ).model_dump_json ()
66- except RuntimeError :
67- extraction_message = get_failed_results_message (task , "Error processing PDF" )
68- service_logger .error (extraction_message .model_dump_json (), exc_info = True )
6989 except FileNotFoundError :
70- extraction_message = get_failed_results_message (task , "Error FileNotFoundError " )
90+ extraction_message = get_failed_results_message (task , "The PDF could not be found " )
7191 service_logger .error (extraction_message .model_dump_json (), exc_info = True )
72- except Exception :
73- extraction_message = get_failed_results_message (task , "Error " )
92+ except ( RuntimeError , Exception ) :
93+ extraction_message = get_failed_results_message (task , "An unexpected error occurred " )
7494 service_logger .error (extraction_message .model_dump_json (), exc_info = True )
7595
7696 return extraction_message .model_dump_json ()
0 commit comments