From 848d9c8b4a3cb3c84bb8c6e2008afa2e3ca0d48a Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Sat, 9 Nov 2024 21:36:44 -0500 Subject: [PATCH] Only use CPU for the docling OCR models Because GPU memory is extremely tight in many of our supported hardware configurations, and because our GitHub Mac CI runners error out when running the OCR models with MPS acceleration, let's just explicitly pin the OCR models to the CPU. See DS4SD/docling#286 for a bit more context. Signed-off-by: Ben Browning --- src/instructlab/sdg/utils/chunkers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 8831bde6..8946b8b7 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -213,6 +213,8 @@ def chunk_documents(self) -> List: model_artifacts_path = StandardPdfPipeline.download_models_hf() pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path) + # Keep OCR models on the CPU instead of GPU + pipeline_options.ocr_options.use_gpu = False converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)