Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support tableformer model choice #90

Merged
merged 10 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"

RUN apt-get update \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git procps \
&& apt-get clean

# This will install torch with *only* cpu support
Expand Down
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ This can improve output quality if you find that multiple columns in extracted t


```python
from docling.datamodel.pipeline_options import PipelineOptions

pipeline_options = PipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model

Expand All @@ -168,6 +170,20 @@ doc_converter = DocumentConverter(
)
```

Since docling 1.16.0, you can choose which TableFormer mode to use: `TableFormerMode.FAST` (the default) or `TableFormerMode.ACCURATE`, which is slower but gives better results on difficult table structures.

```python
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode

pipeline_options = PipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model

doc_converter = DocumentConverter(
artifacts_path=artifacts_path,
pipeline_options=pipeline_options,
)
```

### Impose limits on the document size

You can limit the file size and the number of pages that are allowed to be processed per document:
Expand Down
3 changes: 2 additions & 1 deletion docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PipelineOptions
from docling.document_converter import DocumentConverter

warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
Expand Down
20 changes: 4 additions & 16 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
from typing_extensions import Self

from docling.backend.abstract_backend import PdfPageBackend
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
PipelineOptions,
TableStructureOptions,
)


class ConversionStatus(str, Enum):
Expand Down Expand Up @@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
stream: BytesIO


# Options for the table structure model, as previously defined in base_models.py.
# The same class is defined in docling/datamodel/pipeline_options.py (there with an
# additional `mode` field) and is re-imported into base_models.py for backward
# compatibility.
class TableStructureOptions(BaseModel):
do_cell_matching: bool = (
True
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
# are merged across table columns.
# False: Let table structure model define the text cells, ignore PDF cells.
)


# Pipeline-level switches, as previously defined in base_models.py.
# The same class is defined in docling/datamodel/pipeline_options.py and is
# re-imported into base_models.py for backward compatibility.
class PipelineOptions(BaseModel):
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text

# Nested options for the table structure model; defaults to TableStructureOptions().
table_structure_options: TableStructureOptions = TableStructureOptions()


class AssembleOptions(BaseModel):
keep_page_images: Annotated[
bool,
Expand Down
2 changes: 1 addition & 1 deletion docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union

from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure
from pydantic import BaseModel
from typing_extensions import deprecated
Expand Down
25 changes: 25 additions & 0 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from enum import Enum, auto

from pydantic import BaseModel


class TableFormerMode(str, Enum):
    """Selects which TableFormer model variant performs table structure recognition.

    Members carry explicit, human-readable string values instead of ``auto()``.
    With a ``str`` mixin, ``auto()`` produces the opaque values ``"1"`` and
    ``"2"``, which are meaningless once serialized (JSON, config files, CLI
    flags) and silently change if members are ever reordered.
    """

    FAST = "fast"  # default mode
    ACCURATE = "accurate"  # better quality on difficult tables, but slower

    @classmethod
    def _missing_(cls, value):
        """Map the legacy ``auto()``-generated values onto their members.

        Keeps data that was serialized with the old ``"1"`` / ``"2"`` values
        loadable. Returns ``None`` for anything else so that ``Enum`` raises
        its usual ``ValueError``.
        """
        legacy_values = {"1": cls.FAST, "2": cls.ACCURATE}
        if isinstance(value, str):
            return legacy_values.get(value)
        return None


# Settings for the table structure model.
class TableStructureOptions(BaseModel):
    # do_cell_matching:
    #   True: Matches predictions back to PDF cells. Can break table output if
    #         PDF cells are merged across table columns.
    #   False: Let table structure model define the text cells, ignore PDF cells.
    do_cell_matching: bool = True

    # TableFormer variant to use; FAST unless the caller selects another mode.
    mode: TableFormerMode = TableFormerMode.FAST


# Top-level switches for the conversion pipeline.
class PipelineOptions(BaseModel):
    # True: perform table structure extraction
    do_table_structure: bool = True

    # True: perform OCR, replace programmatic PDF text
    do_ocr: bool = True

    # Nested settings for the table structure model.
    table_structure_options: TableStructureOptions = TableStructureOptions()
2 changes: 1 addition & 1 deletion docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
DoclingComponentType,
ErrorItem,
Page,
PipelineOptions,
)
from docling.datamodel.document import (
ConversionResult,
DocumentConversionInput,
InputDocument,
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.ds_glm_model import GlmModel
from docling.models.page_assemble_model import PageAssembleModel
Expand Down
9 changes: 8 additions & 1 deletion docling/models/table_structure_model.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import copy
from pathlib import Path
from typing import Iterable, List

import numpy
Expand All @@ -12,16 +13,22 @@
TableElement,
TableStructurePrediction,
)
from docling.datamodel.pipeline_options import TableFormerMode


class TableStructureModel:
def __init__(self, config):
self.config = config
self.do_cell_matching = config["do_cell_matching"]
self.mode = config["mode"]

self.enabled = config["enabled"]
if self.enabled:
artifacts_path = config["artifacts_path"]
artifacts_path: Path = config["artifacts_path"]

if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "fat"

# Third Party
import docling_ibm_models.tableformer.common as c

Expand Down
3 changes: 2 additions & 1 deletion docling/pipeline/base_model_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from pathlib import Path
from typing import Callable, Iterable, List

from docling.datamodel.base_models import Page, PipelineOptions
from docling.datamodel.base_models import Page
from docling.datamodel.pipeline_options import PipelineOptions


class BaseModelPipeline:
Expand Down
3 changes: 2 additions & 1 deletion docling/pipeline/standard_model_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pathlib import Path

from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.pipeline_options import PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
Expand Down Expand Up @@ -32,6 +32,7 @@ def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
"artifacts_path": artifacts_path
/ StandardModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"mode": pipeline_options.table_structure_options.mode,
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
}
),
Expand Down
2 changes: 1 addition & 1 deletion examples/batch_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

Expand Down
2 changes: 1 addition & 1 deletion examples/custom_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def main():
# PyPdfium with OCR
# -----------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True

Expand Down
Loading
Loading