Skip to content

Commit 590edfe

Browse files
committed
Add PDF check before processing it
1 parent 4f8600a commit 590edfe

File tree

3 files changed

+39
-9
lines changed

3 files changed

+39
-9
lines changed

docker-compose.yml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,25 @@ services:
3131
- ENVIRONMENT=${ENVIRONMENT:-development}
3232
- SENTRY_DSN=${SENTRY_DSN:-}
3333
- PYTHONUNBUFFERED=1
34+
- USE_FAST=false
3435
network_mode: host
3536
depends_on:
3637
- mongo-pdf-layout
3738

3839
worker-pdf-layout-gpu:
39-
container_name: "worker-pdf-layout-no-gpu"
40+
container_name: "worker-pdf-layout-gpu"
4041
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
4142
image: ghcr.io/huridocs/pdf-document-layout-analysis:0.0.23
4243
init: true
4344
restart: unless-stopped
4445
network_mode: host
46+
deploy:
47+
resources:
48+
reservations:
49+
devices:
50+
- driver: nvidia
51+
count: 1
52+
capabilities: [ gpu ]
4553
volumes:
4654
- data:/app/xmls
4755

src/drivers/queues_processor/start_queue_processor.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
service_logger,
1919
QUEUES_NAMES,
2020
)
21+
from domain.PdfFile import PdfFile
2122
from domain.ResultMessage import ResultMessage
2223
from domain.Task import Task
2324
from use_cases.extract_segments_use_case import ocr_pdf, get_xml_name, extract_segments
@@ -33,6 +34,23 @@ def get_failed_results_message(task: Task, message: str) -> ResultMessage:
3334
)
3435

3536

37+
def is_valid_pdf(filepath):
    """Return True when the file at *filepath* looks like a complete PDF.

    Two cheap structural checks are made without parsing the document:

    * the file starts with the ``%PDF-`` header marker, and
    * the ``%%EOF`` trailer marker occurs within the last 1024 bytes.

    Files shorter than 1024 bytes are read in full for the trailer check
    instead of being rejected, so small well-formed PDFs are accepted.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        True if both markers are found; False if either is missing or the
        file cannot be opened or read.
    """
    tail_size = 1024
    try:
        with open(filepath, "rb") as f:
            if f.read(5) != b"%PDF-":
                return False

            # Find the file size first: seeking a fixed negative offset from
            # the end raises OSError when the file is shorter than that
            # offset, which would wrongly reject small valid PDFs.
            file_size = f.seek(0, 2)  # whence=2 means "relative to end of file"
            f.seek(max(0, file_size - tail_size))
            return b"%%EOF" in f.read(tail_size)
    except (OSError, ValueError, TypeError):
        # Best-effort check: a missing, unreadable or otherwise unusable path
        # is reported as "not a valid PDF" rather than raised to the caller.
        return False
52+
53+
3654
def ocr_pdf_task(task):
3755
ocr_pdf(task)
3856

@@ -53,24 +71,26 @@ def process(message):
5371
try:
5472
task = Task(**message)
5573
except ValidationError:
56-
service_logger.error(f"validation error: {message}", exc_info=True)
74+
service_logger.error(f"The message was incorrectly formatted: {message}")
5775
return None
5876

5977
try:
6078
service_logger.info(f"Processing Redis message: {message}")
6179

80+
if not is_valid_pdf(PdfFile(task.tenant).get_path(task.params.filename)):
81+
extraction_message = get_failed_results_message(task, f"The file does not appear to be a valid PDF")
82+
service_logger.info(extraction_message.model_dump_json())
83+
return extraction_message.model_dump_json()
84+
6285
if task.task == "ocr":
6386
return ocr_pdf_task(task)
6487

6588
return process_task(task).model_dump_json()
66-
except RuntimeError:
67-
extraction_message = get_failed_results_message(task, "Error processing PDF")
68-
service_logger.error(extraction_message.model_dump_json(), exc_info=True)
6989
except FileNotFoundError:
70-
extraction_message = get_failed_results_message(task, "Error FileNotFoundError")
90+
extraction_message = get_failed_results_message(task, "The PDF could not be found")
7191
service_logger.error(extraction_message.model_dump_json(), exc_info=True)
72-
except Exception:
73-
extraction_message = get_failed_results_message(task, "Error")
92+
except (RuntimeError, Exception):
93+
extraction_message = get_failed_results_message(task, "An unexpected error occurred")
7494
service_logger.error(extraction_message.model_dump_json(), exc_info=True)
7595

7696
return extraction_message.model_dump_json()

src/tests/test_end_to_end.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def test_error_ocr(self):
140140
self.assertEqual(False, extraction_message.success)
141141

142142
@staticmethod
143-
def get_redis_message() -> ResultMessage:
143+
def get_redis_message() -> ResultMessage | None:
144144
queues_names = ["segmentation", "ocr"]
145145

146146
for i in range(160):
@@ -151,3 +151,5 @@ def get_redis_message() -> ResultMessage:
151151
if message:
152152
queue.deleteMessage(id=message["id"]).execute()
153153
return ResultMessage(**json.loads(message["message"]))
154+
155+
return None

0 commit comments

Comments
 (0)