feat: Introduce support for GPU Accelerators (#593)
* Upgraded Layout Postprocessing, sending old code back to ERZ

Signed-off-by: Christoph Auer <[email protected]>

* Implement hierarchical cluster layout processing

Signed-off-by: Christoph Auer <[email protected]>

* Pass nested cluster processing through full pipeline

Signed-off-by: Christoph Auer <[email protected]>

* Pass nested clusters through GLM as payload

Signed-off-by: Christoph Auer <[email protected]>

* Move to_docling_document from ds-glm to this repo

Signed-off-by: Christoph Auer <[email protected]>

* Clean up imports again

Signed-off-by: Christoph Auer <[email protected]>

* feat(Accelerator): Introduce options to control the num_threads and device from the API, envvars, and CLI.
- Introduce AcceleratorOptions and AcceleratorDevice and use them to select the device the models run on.
- Introduce accelerator_utils with a function that decides the device and resolves the AUTO setting.
- Refactor how the docling-ibm-models are called to match the new model init signatures.
- Translate the accelerator options into the specific inputs expected by third-party models.
- Extend the docling CLI with parameters to set the num_threads and device.
- Add new unit tests.
- Write a new example showing how to use the accelerator options (see the sketch below).
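
From the Python API, the new options are wired into the pipeline roughly as follows. This is a minimal sketch assuming the standard Docling `DocumentConverter`/`PdfFormatOption` usage and the import paths shown in the diff below; the input path is a hypothetical placeholder. The same two settings are also exposed as new parameters in docling/cli/main.py further down.

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

# Run the models on 8 threads and let Docling resolve AUTO to CUDA/MPS/CPU.
accelerator_options = AcceleratorOptions(num_threads=8, device=AcceleratorDevice.AUTO)

pipeline_options = PdfPipelineOptions(accelerator_options=accelerator_options)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
result = converter.convert("document.pdf")  # hypothetical input path
```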

* fix: Improve the pydantic objects in the pipeline_options and imports.

Signed-off-by: Nikos Livathinos <[email protected]>

* fix: TableStructureModel: Refactor the artifacts path to use the new structure for fast/accurate model

Signed-off-by: Nikos Livathinos <[email protected]>

* Updated test ground-truth

Signed-off-by: Christoph Auer <[email protected]>

* Updated test ground-truth (again), bugfix for empty layout

Signed-off-by: Christoph Auer <[email protected]>

* fix: Do a proper check to set the device in EasyOCR and RapidOCR.

Signed-off-by: Nikos Livathinos <[email protected]>

* Rollback changes from main

Signed-off-by: Christoph Auer <[email protected]>

* Update test gt

Signed-off-by: Christoph Auer <[email protected]>

* Remove unused debug settings

Signed-off-by: Christoph Auer <[email protected]>

* Review fixes

Signed-off-by: Christoph Auer <[email protected]>

* Nail the accelerator defaults for MPS

Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Nikos Livathinos <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
3 people authored Dec 13, 2024
1 parent 365a1e7 commit 19fad92
Showing 38 changed files with 384 additions and 93 deletions.
8 changes: 8 additions & 0 deletions docling/cli/main.py
@@ -26,6 +26,8 @@
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
OcrEngine,
OcrMacOptions,
@@ -257,6 +259,10 @@ def convert(
help="The timeout for processing each document, in seconds.",
),
] = None,
num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
device: Annotated[
AcceleratorDevice, typer.Option(..., help="Accelerator device")
] = AcceleratorDevice.AUTO,
):
if verbose == 0:
logging.basicConfig(level=logging.WARNING)
@@ -336,7 +342,9 @@
if ocr_lang_list is not None:
ocr_options.lang = ocr_lang_list

accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
pipeline_options = PdfPipelineOptions(
accelerator_options=accelerator_options,
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
69 changes: 65 additions & 4 deletions docling/datamodel/pipeline_options.py
@@ -1,8 +1,66 @@
import logging
import os
import warnings
from enum import Enum
from pathlib import Path
from typing import List, Literal, Optional, Union
from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union

from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from pydantic_settings import (
BaseSettings,
PydanticBaseSettingsSource,
SettingsConfigDict,
)
from typing_extensions import deprecated

_log = logging.getLogger(__name__)


class AcceleratorDevice(str, Enum):
"""Devices to run model inference"""

AUTO = "auto"
CPU = "cpu"
CUDA = "cuda"
MPS = "mps"


class AcceleratorOptions(BaseSettings):
model_config = SettingsConfigDict(
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
)

num_threads: int = 4
device: AcceleratorDevice = AcceleratorDevice.AUTO

@model_validator(mode="before")
@classmethod
def check_alternative_envvars(cls, data: Any) -> Any:
r"""
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
The alternative envvar is used only if it is valid and the regular envvar is not set.
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
the same functionality. In case the alias envvar is set and the user tries to override the
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
as an extra input instead of simply overwriting the envvar value for that parameter.
"""
if isinstance(data, dict):
input_num_threads = data.get("num_threads")

# Check whether to set num_threads from the alternative envvar
if input_num_threads is None:
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
omp_num_threads = os.getenv("OMP_NUM_THREADS")
if docling_num_threads is None and omp_num_threads is not None:
try:
data["num_threads"] = int(omp_num_threads)
except ValueError:
_log.error(
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
omp_num_threads,
)
return data


class TableFormerMode(str, Enum):
@@ -78,9 +136,11 @@ class EasyOcrOptions(OcrOptions):

kind: Literal["easyocr"] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
use_gpu: bool = True # same default as easyocr.Reader

use_gpu: Optional[bool] = None

model_storage_directory: Optional[str] = None
download_enabled: bool = True # same default as easyocr.Reader
download_enabled: bool = True

model_config = ConfigDict(
extra="forbid",
@@ -153,6 +213,7 @@ class PipelineOptions(BaseModel):
True # This default will be set to False on a future version of docling
)
document_timeout: Optional[float] = None
accelerator_options: AcceleratorOptions = AcceleratorOptions()


class PdfPipelineOptions(PipelineOptions):
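Because AcceleratorOptions is a pydantic BaseSettings with the DOCLING_ prefix, the same settings can come from environment variables, with OMP_NUM_THREADS as a fallback for the thread count. A minimal sketch of how the resolution behaves, based on the validator above (the variables must be set before the options object is constructed):

```python
import os

from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions

# Regular envvars, picked up via pydantic-settings with the DOCLING_ prefix.
os.environ["DOCLING_DEVICE"] = "mps"
os.environ["DOCLING_NUM_THREADS"] = "2"
opts = AcceleratorOptions()
assert opts.device == AcceleratorDevice.MPS and opts.num_threads == 2

# Fallback: OMP_NUM_THREADS is used only when DOCLING_NUM_THREADS is unset
# and num_threads is not passed explicitly to the constructor.
del os.environ["DOCLING_NUM_THREADS"]
os.environ["OMP_NUM_THREADS"] = "16"
opts = AcceleratorOptions()
assert opts.num_threads == 16
```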
38 changes: 35 additions & 3 deletions docling/models/easyocr_model.py
@@ -1,4 +1,5 @@
import logging
import warnings
from typing import Iterable

import numpy
@@ -7,16 +8,26 @@

from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)


class EasyOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: EasyOcrOptions):
def __init__(
self,
enabled: bool,
options: EasyOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
self.options: EasyOcrOptions

@@ -31,11 +42,32 @@ def __init__(self, enabled: bool, options: EasyOcrOptions):
"Alternatively, Docling has support for other OCR engines. See the documentation."
)

if self.options.use_gpu is None:
device = decide_device(accelerator_options.device)
# Enable easyocr GPU if running on CUDA, MPS
use_gpu = any(
[
device.startswith(x)
for x in [
AcceleratorDevice.CUDA.value,
AcceleratorDevice.MPS.value,
]
]
)
else:
warnings.warn(
"Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
)
use_gpu = self.options.use_gpu

self.reader = easyocr.Reader(
lang_list=self.options.lang,
gpu=self.options.use_gpu,
gpu=use_gpu,
model_storage_directory=self.options.model_storage_directory,
download_enabled=self.options.download_enabled,
verbose=False,
)

def __call__(
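With this change, the preferred configuration is to leave `use_gpu` unset on EasyOcrOptions and drive the device through the accelerator options; the deprecated `use_gpu` field still works but now triggers the warning above. A minimal sketch of the new-style configuration, assuming the PdfPipelineOptions fields shown elsewhere in this diff:

```python
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    EasyOcrOptions,
    PdfPipelineOptions,
)

pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    ocr_options=EasyOcrOptions(lang=["en"]),  # use_gpu left as None
    accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CUDA),
)
```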
16 changes: 13 additions & 3 deletions docling/models/layout_model.py
@@ -9,6 +9,7 @@
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw

import docling.utils.layout_utils as lu
from docling.datamodel.base_models import (
BoundingBox,
Cell,
@@ -17,9 +18,10 @@
Page,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils import layout_utils as lu
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)
@@ -46,8 +48,16 @@ class LayoutModel(BasePageModel):
FIGURE_LABEL = DocItemLabel.PICTURE
FORMULA_LABEL = DocItemLabel.FORMULA

def __init__(self, artifacts_path: Path):
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
device = decide_device(accelerator_options.device)

self.layout_predictor = LayoutPredictor(
artifact_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
base_threshold=0.6,
blacklist_classes={"Form", "Key-Value Region"},
)

def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
MIN_INTERSECTION = 0.2
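The decide_device helper itself lives in docling/utils/accelerator_utils.py, which is not part of the hunks shown here. Based on how it is called above (it resolves AUTO and returns a torch-style device string such as "cpu", "cuda:0", or "mps"), a plausible sketch of its logic — not the committed implementation — looks like this:

```python
import logging

import torch

from docling.datamodel.pipeline_options import AcceleratorDevice

_log = logging.getLogger(__name__)


def decide_device(accelerator_device: AcceleratorDevice) -> str:
    """Resolve the requested AcceleratorDevice to a concrete device string."""
    device = "cpu"
    if accelerator_device == AcceleratorDevice.AUTO:
        # Prefer CUDA, then MPS, then fall back to CPU.
        if torch.cuda.is_available():
            device = "cuda:0"
        elif torch.backends.mps.is_available():
            device = "mps"
    elif accelerator_device == AcceleratorDevice.CUDA:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    elif accelerator_device == AcceleratorDevice.MPS:
        device = "mps" if torch.backends.mps.is_available() else "cpu"
    _log.info("Accelerator device: '%s'", device)
    return device
```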
69 changes: 24 additions & 45 deletions docling/models/rapid_ocr_model.py
@@ -6,16 +6,26 @@

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import RapidOcrOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
RapidOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)


class RapidOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: RapidOcrOptions):
def __init__(
self,
enabled: bool,
options: RapidOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
self.options: RapidOcrOptions

@@ -30,52 +40,21 @@ def __init__(self, enabled: bool, options: RapidOcrOptions):
"Alternatively, Docling has support for other OCR engines. See the documentation."
)

# This configuration option will be revamped while introducing device settings for all models.
# For the moment we will default to auto and let onnx-runtime pick the best.
cls_use_cuda = True
rec_use_cuda = True
det_use_cuda = True
det_use_dml = True
cls_use_dml = True
rec_use_dml = True

# # Same as Defaults in RapidOCR
# cls_use_cuda = False
# rec_use_cuda = False
# det_use_cuda = False
# det_use_dml = False
# cls_use_dml = False
# rec_use_dml = False

# # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
# if self.options.device == self.options.Device.AUTO:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True

# # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
# elif self.options.device == self.options.Device.CUDA:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True

# # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
# elif self.options.device == self.options.Device.DIRECTML:
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True
# Decide the accelerator devices
device = decide_device(accelerator_options.device)
use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
use_dml = accelerator_options.device == AcceleratorDevice.AUTO
intra_op_num_threads = accelerator_options.num_threads

self.reader = RapidOCR(
text_score=self.options.text_score,
cls_use_cuda=cls_use_cuda,
rec_use_cuda=rec_use_cuda,
det_use_cuda=det_use_cuda,
det_use_dml=det_use_dml,
cls_use_dml=cls_use_dml,
rec_use_dml=rec_use_dml,
cls_use_cuda=use_cuda,
rec_use_cuda=use_cuda,
det_use_cuda=use_cuda,
det_use_dml=use_dml,
cls_use_dml=use_dml,
rec_use_dml=use_dml,
intra_op_num_threads=intra_op_num_threads,
print_verbose=self.options.print_verbose,
det_model_path=self.options.det_model_path,
cls_model_path=self.options.cls_model_path,
28 changes: 24 additions & 4 deletions docling/models/table_structure_model.py
@@ -9,15 +9,25 @@

from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
TableFormerMode,
TableStructureOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder


class TableStructureModel(BasePageModel):
def __init__(
self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
self,
enabled: bool,
artifacts_path: Path,
options: TableStructureOptions,
accelerator_options: AcceleratorOptions,
):
self.options = options
self.do_cell_matching = self.options.do_cell_matching
@@ -26,16 +36,26 @@ def __init__(
self.enabled = enabled
if self.enabled:
if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "fat"
artifacts_path = artifacts_path / "accurate"
else:
artifacts_path = artifacts_path / "fast"

# Third Party
import docling_ibm_models.tableformer.common as c

device = decide_device(accelerator_options.device)

# Disable MPS here, until we know why it makes things slower.
if device == AcceleratorDevice.MPS.value:
device = AcceleratorDevice.CPU.value

self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
self.tm_config["model"]["save_dir"] = artifacts_path
self.tm_model_type = self.tm_config["model"]["type"]

self.tf_predictor = TFPredictor(self.tm_config)
self.tf_predictor = TFPredictor(
self.tm_config, device, accelerator_options.num_threads
)
self.scale = 2.0 # Scale up table input images to 144 dpi

def draw_table_and_cells(
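To exercise the refactored artifacts layout (fast vs. accurate) and the device plumbing end to end, the table model can be combined with the accelerator options roughly as follows. This is a sketch using the existing PdfPipelineOptions fields; note that an MPS request falls back to CPU for TableFormer, per the check above.

```python
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    TableFormerMode,
    TableStructureOptions,
)

pipeline_options = PdfPipelineOptions(
    do_table_structure=True,
    # Selects the "accurate" TableFormer artifacts directory.
    table_structure_options=TableStructureOptions(mode=TableFormerMode.ACCURATE),
    accelerator_options=AcceleratorOptions(
        device=AcceleratorDevice.CUDA, num_threads=8
    ),
)
```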