feat: Introduce support for GPU Accelerators (#593)
* Upgraded Layout Postprocessing, sending old code back to ERZ

Signed-off-by: Christoph Auer <[email protected]>

* Implement hierarchical cluster layout processing

Signed-off-by: Christoph Auer <[email protected]>

* Pass nested cluster processing through full pipeline

Signed-off-by: Christoph Auer <[email protected]>

* Pass nested clusters through GLM as payload

Signed-off-by: Christoph Auer <[email protected]>

* Move to_docling_document from ds-glm to this repo

Signed-off-by: Christoph Auer <[email protected]>

* Clean up imports again

Signed-off-by: Christoph Auer <[email protected]>

* feat(Accelerator): Introduce options to control the num_threads and device from the API, envvars, and CLI.
- Introduce AcceleratorOptions and AcceleratorDevice and use them to select the device the models run on.
- Introduce accelerator_utils with a function that decides the device and resolves the AUTO setting.
- Refactor how the docling-ibm-models are called to match the new model init signatures.
- Translate the accelerator options into the specific inputs expected by third-party models.
- Extend the docling CLI with parameters to set the num_threads and device.
- Add new unit tests.
- Write a new example showing how to use the accelerator options (see the sketch below).
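
From the Python API, the new options are wired into the pipeline roughly as follows. This is a minimal sketch assuming the standard Docling `DocumentConverter`/`PdfFormatOption` usage and the import paths shown in the diff below; the input path is a hypothetical placeholder. The same two settings are also exposed as new parameters in docling/cli/main.py further down.

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

# Run the models on 8 threads and let Docling resolve AUTO to CUDA/MPS/CPU.
accelerator_options = AcceleratorOptions(num_threads=8, device=AcceleratorDevice.AUTO)

pipeline_options = PdfPipelineOptions(accelerator_options=accelerator_options)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
result = converter.convert("document.pdf")  # hypothetical input path
```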

* fix: Improve the pydantic objects in the pipeline_options and imports.

Signed-off-by: Nikos Livathinos <[email protected]>

* fix: TableStructureModel: Refactor the artifacts path to use the new structure for fast/accurate model

Signed-off-by: Nikos Livathinos <[email protected]>

* Updated test ground-truth

Signed-off-by: Christoph Auer <[email protected]>

* Updated test ground-truth (again), bugfix for empty layout

Signed-off-by: Christoph Auer <[email protected]>

* fix: Do a proper check to set the device in EasyOCR and RapidOCR.

Signed-off-by: Nikos Livathinos <[email protected]>

* Rollback changes from main

Signed-off-by: Christoph Auer <[email protected]>

* Update test gt

Signed-off-by: Christoph Auer <[email protected]>

* Remove unused debug settings

Signed-off-by: Christoph Auer <[email protected]>

* Review fixes

Signed-off-by: Christoph Auer <[email protected]>

* Nail the accelerator defaults for MPS

Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Nikos Livathinos <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
3 people authored Dec 13, 2024
1 parent 365a1e7 commit 19fad92
Showing 38 changed files with 384 additions and 93 deletions.
8 changes: 8 additions & 0 deletions docling/cli/main.py
@@ -26,6 +26,8 @@
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
OcrEngine,
OcrMacOptions,
@@ -257,6 +259,10 @@ def convert(
help="The timeout for processing each document, in seconds.",
),
] = None,
num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
device: Annotated[
AcceleratorDevice, typer.Option(..., help="Accelerator device")
] = AcceleratorDevice.AUTO,
):
if verbose == 0:
logging.basicConfig(level=logging.WARNING)
@@ -336,7 +342,9 @@
if ocr_lang_list is not None:
ocr_options.lang = ocr_lang_list

accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
pipeline_options = PdfPipelineOptions(
accelerator_options=accelerator_options,
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
69 changes: 65 additions & 4 deletions docling/datamodel/pipeline_options.py
@@ -1,8 +1,66 @@
import logging
import os
import warnings
from enum import Enum
from pathlib import Path
from typing import List, Literal, Optional, Union
from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union

from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from pydantic_settings import (
BaseSettings,
PydanticBaseSettingsSource,
SettingsConfigDict,
)
from typing_extensions import deprecated

_log = logging.getLogger(__name__)


class AcceleratorDevice(str, Enum):
"""Devices to run model inference"""

AUTO = "auto"
CPU = "cpu"
CUDA = "cuda"
MPS = "mps"


class AcceleratorOptions(BaseSettings):
model_config = SettingsConfigDict(
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
)

num_threads: int = 4
device: AcceleratorDevice = AcceleratorDevice.AUTO

@model_validator(mode="before")
@classmethod
def check_alternative_envvars(cls, data: Any) -> Any:
r"""
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
The alternative envvar is used only if it is valid and the regular envvar is not set.
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
the same functionality. In case the alias envvar is set and the user tries to override the
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
as an extra input instead of simply overwriting the envvar value for that parameter.
"""
if isinstance(data, dict):
input_num_threads = data.get("num_threads")

# Check whether to set num_threads from the alternative envvar
if input_num_threads is None:
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
omp_num_threads = os.getenv("OMP_NUM_THREADS")
if docling_num_threads is None and omp_num_threads is not None:
try:
data["num_threads"] = int(omp_num_threads)
except ValueError:
_log.error(
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
omp_num_threads,
)
return data


class TableFormerMode(str, Enum):
@@ -78,9 +136,11 @@ class EasyOcrOptions(OcrOptions):

kind: Literal["easyocr"] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
use_gpu: bool = True # same default as easyocr.Reader

use_gpu: Optional[bool] = None

model_storage_directory: Optional[str] = None
download_enabled: bool = True # same default as easyocr.Reader
download_enabled: bool = True

model_config = ConfigDict(
extra="forbid",
@@ -153,6 +213,7 @@ class PipelineOptions(BaseModel):
True # This default will be set to False on a future version of docling
)
document_timeout: Optional[float] = None
accelerator_options: AcceleratorOptions = AcceleratorOptions()


class PdfPipelineOptions(PipelineOptions):
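Because AcceleratorOptions is a pydantic BaseSettings with the DOCLING_ prefix, the same settings can come from environment variables, with OMP_NUM_THREADS as a fallback for the thread count. A minimal sketch of how the resolution behaves, based on the validator above (the variables must be set before the options object is constructed):

```python
import os

from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions

# Regular envvars, picked up via pydantic-settings with the DOCLING_ prefix.
os.environ["DOCLING_DEVICE"] = "mps"
os.environ["DOCLING_NUM_THREADS"] = "2"
opts = AcceleratorOptions()
assert opts.device == AcceleratorDevice.MPS and opts.num_threads == 2

# Fallback: OMP_NUM_THREADS is used only when DOCLING_NUM_THREADS is unset
# and num_threads is not passed explicitly to the constructor.
del os.environ["DOCLING_NUM_THREADS"]
os.environ["OMP_NUM_THREADS"] = "16"
opts = AcceleratorOptions()
assert opts.num_threads == 16
```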
38 changes: 35 additions & 3 deletions docling/models/easyocr_model.py
@@ -1,4 +1,5 @@
import logging
import warnings
from typing import Iterable

import numpy
@@ -7,16 +8,26 @@

from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)


class EasyOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: EasyOcrOptions):
def __init__(
self,
enabled: bool,
options: EasyOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
self.options: EasyOcrOptions

@@ -31,11 +42,32 @@ def __init__(self, enabled: bool, options: EasyOcrOptions):
"Alternatively, Docling has support for other OCR engines. See the documentation."
)

if self.options.use_gpu is None:
device = decide_device(accelerator_options.device)
# Enable easyocr GPU if running on CUDA, MPS
use_gpu = any(
[
device.startswith(x)
for x in [
AcceleratorDevice.CUDA.value,
AcceleratorDevice.MPS.value,
]
]
)
else:
warnings.warn(
"Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
)
use_gpu = self.options.use_gpu

self.reader = easyocr.Reader(
lang_list=self.options.lang,
gpu=self.options.use_gpu,
gpu=use_gpu,
model_storage_directory=self.options.model_storage_directory,
download_enabled=self.options.download_enabled,
verbose=False,
)

def __call__(
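With this change, the preferred configuration is to leave `use_gpu` unset on EasyOcrOptions and drive the device through the accelerator options; the deprecated `use_gpu` field still works but now triggers the warning above. A minimal sketch of the new-style configuration, assuming the PdfPipelineOptions fields shown elsewhere in this diff:

```python
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    EasyOcrOptions,
    PdfPipelineOptions,
)

pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    ocr_options=EasyOcrOptions(lang=["en"]),  # use_gpu left as None
    accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CUDA),
)
```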
16 changes: 13 additions & 3 deletions docling/models/layout_model.py
@@ -9,6 +9,7 @@
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw

import docling.utils.layout_utils as lu
from docling.datamodel.base_models import (
BoundingBox,
Cell,
@@ -17,9 +18,10 @@
Page,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils import layout_utils as lu
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)
@@ -46,8 +48,16 @@ class LayoutModel(BasePageModel):
FIGURE_LABEL = DocItemLabel.PICTURE
FORMULA_LABEL = DocItemLabel.FORMULA

def __init__(self, artifacts_path: Path):
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
device = decide_device(accelerator_options.device)

self.layout_predictor = LayoutPredictor(
artifact_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
base_threshold=0.6,
blacklist_classes={"Form", "Key-Value Region"},
)

def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
MIN_INTERSECTION = 0.2
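The decide_device helper itself lives in docling/utils/accelerator_utils.py, which is not part of the hunks shown here. Based on how it is called above (it resolves AUTO and returns a torch-style device string such as "cpu", "cuda:0", or "mps"), a plausible sketch of its logic — not the committed implementation — looks like this:

```python
import logging

import torch

from docling.datamodel.pipeline_options import AcceleratorDevice

_log = logging.getLogger(__name__)


def decide_device(accelerator_device: AcceleratorDevice) -> str:
    """Resolve the requested AcceleratorDevice to a concrete device string."""
    device = "cpu"
    if accelerator_device == AcceleratorDevice.AUTO:
        # Prefer CUDA, then MPS, then fall back to CPU.
        if torch.cuda.is_available():
            device = "cuda:0"
        elif torch.backends.mps.is_available():
            device = "mps"
    elif accelerator_device == AcceleratorDevice.CUDA:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    elif accelerator_device == AcceleratorDevice.MPS:
        device = "mps" if torch.backends.mps.is_available() else "cpu"
    _log.info("Accelerator device: '%s'", device)
    return device
```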
69 changes: 24 additions & 45 deletions docling/models/rapid_ocr_model.py
@@ -6,16 +6,26 @@

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import RapidOcrOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
RapidOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)


class RapidOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: RapidOcrOptions):
def __init__(
self,
enabled: bool,
options: RapidOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
self.options: RapidOcrOptions

@@ -30,52 +40,21 @@ def __init__(self, enabled: bool, options: RapidOcrOptions):
"Alternatively, Docling has support for other OCR engines. See the documentation."
)

# This configuration option will be revamped while introducing device settings for all models.
# For the moment we will default to auto and let onnx-runtime pick the best.
cls_use_cuda = True
rec_use_cuda = True
det_use_cuda = True
det_use_dml = True
cls_use_dml = True
rec_use_dml = True

# # Same as Defaults in RapidOCR
# cls_use_cuda = False
# rec_use_cuda = False
# det_use_cuda = False
# det_use_dml = False
# cls_use_dml = False
# rec_use_dml = False

# # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
# if self.options.device == self.options.Device.AUTO:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True

# # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
# elif self.options.device == self.options.Device.CUDA:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True

# # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
# elif self.options.device == self.options.Device.DIRECTML:
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True
# Decide the accelerator devices
device = decide_device(accelerator_options.device)
use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
use_dml = accelerator_options.device == AcceleratorDevice.AUTO
intra_op_num_threads = accelerator_options.num_threads

self.reader = RapidOCR(
text_score=self.options.text_score,
cls_use_cuda=cls_use_cuda,
rec_use_cuda=rec_use_cuda,
det_use_cuda=det_use_cuda,
det_use_dml=det_use_dml,
cls_use_dml=cls_use_dml,
rec_use_dml=rec_use_dml,
cls_use_cuda=use_cuda,
rec_use_cuda=use_cuda,
det_use_cuda=use_cuda,
det_use_dml=use_dml,
cls_use_dml=use_dml,
rec_use_dml=use_dml,
intra_op_num_threads=intra_op_num_threads,
print_verbose=self.options.print_verbose,
det_model_path=self.options.det_model_path,
cls_model_path=self.options.cls_model_path,
28 changes: 24 additions & 4 deletions docling/models/table_structure_model.py
@@ -9,15 +9,25 @@

from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
TableFormerMode,
TableStructureOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder


class TableStructureModel(BasePageModel):
def __init__(
self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
self,
enabled: bool,
artifacts_path: Path,
options: TableStructureOptions,
accelerator_options: AcceleratorOptions,
):
self.options = options
self.do_cell_matching = self.options.do_cell_matching
@@ -26,16 +36,26 @@ def __init__(
self.enabled = enabled
if self.enabled:
if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "fat"
artifacts_path = artifacts_path / "accurate"
else:
artifacts_path = artifacts_path / "fast"

# Third Party
import docling_ibm_models.tableformer.common as c

device = decide_device(accelerator_options.device)

# Disable MPS here, until we know why it makes things slower.
if device == AcceleratorDevice.MPS.value:
device = AcceleratorDevice.CPU.value

self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
self.tm_config["model"]["save_dir"] = artifacts_path
self.tm_model_type = self.tm_config["model"]["type"]

self.tf_predictor = TFPredictor(self.tm_config)
self.tf_predictor = TFPredictor(
self.tm_config, device, accelerator_options.num_threads
)
self.scale = 2.0 # Scale up table input images to 144 dpi

def draw_table_and_cells(
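To exercise the refactored artifacts layout (fast vs. accurate) and the device plumbing end to end, the table model can be combined with the accelerator options roughly as follows. This is a sketch using the existing PdfPipelineOptions fields; note that an MPS request falls back to CPU for TableFormer, per the check above.

```python
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    TableFormerMode,
    TableStructureOptions,
)

pipeline_options = PdfPipelineOptions(
    do_table_structure=True,
    # Selects the "accurate" TableFormer artifacts directory.
    table_structure_options=TableStructureOptions(mode=TableFormerMode.ACCURATE),
    accelerator_options=AcceleratorOptions(
        device=AcceleratorDevice.CUDA, num_threads=8
    ),
)
```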