From 848d9c8b4a3cb3c84bb8c6e2008afa2e3ca0d48a Mon Sep 17 00:00:00 2001
From: Ben Browning <bbrownin@redhat.com>
Date: Sat, 9 Nov 2024 21:36:44 -0500
Subject: [PATCH] Only use CPU for the docling OCR models

Because GPU memory is extremely tight in many of our supported
hardware configurations, and because our GitHub Mac CI runners error
out when running the OCR models with MPS acceleration, let's just
explicitly pin the OCR models to the CPU.

See DS4SD/docling#286 for a bit more context.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
---
 src/instructlab/sdg/utils/chunkers.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 8831bde6..8946b8b7 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -213,6 +213,8 @@ def chunk_documents(self) -> List:
 
         model_artifacts_path = StandardPdfPipeline.download_models_hf()
         pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path)
+        # Pin OCR models to CPU: GPU memory is tight and MPS fails on Mac CI
+        pipeline_options.ocr_options.use_gpu = False
         converter = DocumentConverter(
             format_options={
                 InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)