From d60313738340c20f9af64dfe51e28b7670ff64ef Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 26 Jul 2024 16:55:33 +0200 Subject: [PATCH] feat: add simplified single-doc conversion (#20) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- README.md | 26 ++++++++++++---- docling/document_converter.py | 56 +++++++++++++++++++++++++++++++++++ poetry.lock | 3 +- pyproject.toml | 1 + 4 files changed, 80 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 5689e038..071d26de 100644 --- a/README.md +++ b/README.md @@ -30,19 +30,35 @@ To use Docling, simply install `docling` from your package manager, e.g. pip: pip install docling ``` -> [!NOTE] +> [!NOTE] > Works on macOS and Linux environments. Windows platforms are currently not tested. ### Development setup To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir: ```bash -poetry install +poetry install --all-extras ``` ## Usage -For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. Run with: +### Convert a single document + +To convert individual PDF documents, use `convert_single()`, for example: +```python +from docling.document_converter import DocumentConverter + +source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL +converter = DocumentConverter() +doc = converter.convert_single(source) +print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]" +``` + +### Convert a batch of documents + +For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py). 
+ +From a local repo clone, you can run it with: ``` python examples/convert.py @@ -58,7 +74,7 @@ You can control if table structure recognition or OCR should be performed by arg doc_converter = DocumentConverter( artifacts_path=artifacts_path, pipeline_options=PipelineOptions( - do_table_structure=False, # controls if table structure is recovered + do_table_structure=False, # controls if table structure is recovered do_ocr=True, # controls if OCR is applied (ignores programmatic content) ), ) @@ -90,7 +106,7 @@ conv_input = DocumentConversionInput.from_paths( ) ``` -### Convert from binary PDF streams +### Convert from binary PDF streams You can convert PDFs from a binary stream instead of from the filesystem as follows: ```python diff --git a/docling/document_converter.py b/docling/document_converter.py index cf24c9a6..95b30a06 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -1,11 +1,15 @@ import functools import logging +import tempfile import time import traceback from pathlib import Path from typing import Iterable, Optional, Type, Union +import requests +from docling_core.types import Document from PIL import ImageDraw +from pydantic import AnyHttpUrl, TypeAdapter, ValidationError from docling.backend.abstract_backend import PdfDocumentBackend from docling.datamodel.base_models import ( @@ -32,6 +36,7 @@ class DocumentConverter: _layout_model_path = "model_artifacts/layout/beehive_v0.0.5" _table_model_path = "model_artifacts/tableformer" + _default_download_filename = "file.pdf" def __init__( self, @@ -80,6 +85,57 @@ def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument] # Note: Pdfium backend is not thread-safe, thread pool usage was disabled. yield from map(self.process_document, input_batch) + def convert_single(self, source: Path | AnyHttpUrl | str) -> Document: + """Convert a single document. + + Args: + source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL. 
+ + Raises: + ValueError: If source is of unexpected type. + RuntimeError: If conversion fails. + + Returns: + Document: The converted document object. + """ + with tempfile.TemporaryDirectory() as temp_dir: + try: + http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source) + res = requests.get(http_url, stream=True) + res.raise_for_status() + fname = None + # try to get filename from response header + if cont_disp := res.headers.get("Content-Disposition"): + for par in cont_disp.strip().split(";"): + # currently only handling directive "filename" (not "filename*") + if (split := par.split("=")) and split[0].strip() == "filename": + fname = "=".join(split[1:]).strip().strip("'\"") or None + break + # otherwise, use name from URL: + if fname is None: + fname = Path(http_url.path).name or self._default_download_filename + local_path = Path(temp_dir) / fname + with open(local_path, "wb") as f: + for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks + f.write(chunk) + except ValidationError: + try: + local_path = TypeAdapter(Path).validate_python(source) + except ValidationError: + raise ValueError( + f"Unexpected file path type encountered: {type(source)}" + ) + conv_inp = DocumentConversionInput.from_paths(paths=[local_path]) + converted_docs_iter = self.convert(conv_inp) + converted_doc: ConvertedDocument = next(converted_docs_iter) + if converted_doc.status not in { + ConversionStatus.SUCCESS, + ConversionStatus.SUCCESS_WITH_ERRORS, + }: + raise RuntimeError(f"Conversion failed with status: {converted_doc.status}") + doc = converted_doc.to_ds_document() + return doc + def process_document(self, in_doc: InputDocument) -> ConvertedDocument: start_doc_time = time.time() converted_doc = ConvertedDocument(input=in_doc) diff --git a/poetry.lock b/poetry.lock index 09f695a0..9715593d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2510,6 +2510,7 @@ description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ + {file = 
"nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, ] @@ -4881,4 +4882,4 @@ ocr = ["easyocr"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "3ffc5161fd49fe2186ee2afbb3319922964661c769c434fc7386aae40f4aab19" +content-hash = "dcb00c6601f61b087fd204d040149c20a7dcd72ab353e912e78dc265c86e4d00" diff --git a/pyproject.toml b/pyproject.toml index 544af9ae..a142eac2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ filetype = "^1.2.0" pypdfium2 = "^4.30.0" pydantic-settings = "^2.3.0" huggingface_hub = ">=0.23,<1" +requests = "^2.32.3" easyocr = { version = "^1.7", optional = true } [tool.poetry.group.dev.dependencies]