Skip to content

Commit

Permalink
feat: add simplified single-doc conversion (#20)
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas authored Jul 26, 2024
1 parent 3eca8b8 commit d603137
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 6 deletions.
26 changes: 21 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,35 @@ To use Docling, simply install `docling` from your package manager, e.g. pip:
pip install docling
```

> [!NOTE]
> [!NOTE]
> Works on macOS and Linux environments. Windows platforms are currently not tested.
### Development setup

To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
```bash
poetry install
poetry install --all-extras
```

## Usage

For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. Run with:
### Convert a single document

To convert individual PDF documents, use `convert_single()`, for example:
```python
from docling.document_converter import DocumentConverter

source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
converter = DocumentConverter()
doc = converter.convert_single(source)
print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
```

### Convert a batch of documents

For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).

From a local repo clone, you can run it with:

```
python examples/convert.py
Expand All @@ -58,7 +74,7 @@ You can control if table structure recognition or OCR should be performed by arg
doc_converter = DocumentConverter(
artifacts_path=artifacts_path,
pipeline_options=PipelineOptions(
do_table_structure=False, # controls if table structure is recovered
do_table_structure=False, # controls if table structure is recovered
do_ocr=True, # controls if OCR is applied (ignores programmatic content)
),
)
Expand Down Expand Up @@ -90,7 +106,7 @@ conv_input = DocumentConversionInput.from_paths(
)
```

### Convert from binary PDF streams
### Convert from binary PDF streams

You can convert PDFs from a binary stream instead of from the filesystem as follows:
```python
Expand Down
56 changes: 56 additions & 0 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import functools
import logging
import tempfile
import time
import traceback
from pathlib import Path
from typing import Iterable, Optional, Type, Union

import requests
from docling_core.types import Document
from PIL import ImageDraw
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError

from docling.backend.abstract_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
Expand All @@ -32,6 +36,7 @@
class DocumentConverter:
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer"
_default_download_filename = "file.pdf"

def __init__(
self,
Expand Down Expand Up @@ -80,6 +85,57 @@ def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
yield from map(self.process_document, input_batch)

def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
    """Convert a single document.

    If ``source`` validates as an HTTP(S) URL, the file is first downloaded
    into a temporary directory (removed again before this method returns);
    otherwise ``source`` is interpreted as a local file path.

    Args:
        source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.

    Raises:
        ValueError: If source is of unexpected type.
        RuntimeError: If conversion fails or produces no document.
        requests.HTTPError: If the download returns an error status code.

    Returns:
        Document: The converted document object.
    """
    # Names that must never be used as the download target's file name.
    unusable_names = {"", ".", ".."}

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
        except ValidationError:
            # Not a URL: fall back to treating the source as a local path.
            try:
                local_path = TypeAdapter(Path).validate_python(source)
            except ValidationError as exc:
                raise ValueError(
                    f"Unexpected file path type encountered: {type(source)}"
                ) from exc
        else:
            # NOTE(review): no timeout is set, so a stalled server blocks
            # indefinitely; consider passing `timeout=` here.
            # The context manager releases the streamed connection even if
            # the status check or the write below fails.
            with requests.get(http_url, stream=True) as res:
                res.raise_for_status()

                # try to get filename from response header
                header_name = None
                if cont_disp := res.headers.get("Content-Disposition"):
                    for par in cont_disp.strip().split(";"):
                        # currently only handling directive "filename" (not "*filename")
                        key, _, value = par.partition("=")
                        if key.strip() == "filename":
                            header_name = value.strip().strip("'\"") or None
                            break

                # The header is server-controlled: keep only the final path
                # component so the download cannot escape temp_dir
                # (e.g. "../../x" or an absolute path).
                fname = Path(header_name).name if header_name else ""
                if fname in unusable_names:
                    # otherwise, use name from URL
                    fname = Path(http_url.path or "").name
                if fname in unusable_names:
                    fname = self._default_download_filename

                local_path = Path(temp_dir) / fname
                with open(local_path, "wb") as f:
                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
                        f.write(chunk)

        # Conversion must happen inside the `with` block: a downloaded file
        # only exists for as long as the temporary directory does.
        conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
        converted_doc = next(iter(self.convert(conv_inp)), None)
        if converted_doc is None:
            # An empty result would otherwise surface as a bare StopIteration.
            raise RuntimeError("Conversion failed: no document was produced")
        if converted_doc.status not in {
            ConversionStatus.SUCCESS,
            ConversionStatus.SUCCESS_WITH_ERRORS,
        }:
            raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
        return converted_doc.to_ds_document()

def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
start_doc_time = time.time()
converted_doc = ConvertedDocument(input=in_doc)
Expand Down
3 changes: 2 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"
requests = "^2.32.3"
easyocr = { version = "^1.7", optional = true }

[tool.poetry.group.dev.dependencies]
Expand Down

0 comments on commit d603137

Please sign in to comment.