diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 8831bde6..8946b8b7 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -213,6 +213,8 @@ def chunk_documents(self) -> List: model_artifacts_path = StandardPdfPipeline.download_models_hf() pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path) + # Keep OCR models on the CPU instead of GPU + pipeline_options.ocr_options.use_gpu = False converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)