From d60313738340c20f9af64dfe51e28b7670ff64ef Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Fri, 26 Jul 2024 16:55:33 +0200
Subject: [PATCH] feat: add simplified single-doc conversion (#20)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 README.md                     | 26 ++++++++++++----
 docling/document_converter.py | 56 +++++++++++++++++++++++++++++++++++
 poetry.lock                   |  3 +-
 pyproject.toml                |  1 +
 4 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 5689e038..071d26de 100644
--- a/README.md
+++ b/README.md
@@ -30,19 +30,35 @@ To use Docling, simply install `docling` from your package manager, e.g. pip:
 pip install docling
 ```
 
-> [!NOTE]  
+> [!NOTE]
 > Works on macOS and Linux environments. Windows platforms are currently not tested.
 
 ### Development setup
 
 To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
 ```bash
-poetry install
+poetry install --all-extras
 ```
 
 ## Usage
 
-For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. Run with:
+### Convert a single document
+
+To convert individual PDF documents, use `convert_single()`, for example:
+```python
+from docling.document_converter import DocumentConverter
+
+source = "https://arxiv.org/pdf/2206.01062"  # PDF path or URL
+converter = DocumentConverter()
+doc = converter.convert_single(source)
+print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
+```
+
+### Convert a batch of documents
+
+For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+
+From a local repo clone, you can run it with:
 
 ```
 python examples/convert.py
@@ -58,7 +74,7 @@ You can control if table structure recognition or OCR should be performed by arg
 doc_converter = DocumentConverter(
     artifacts_path=artifacts_path,
     pipeline_options=PipelineOptions(
-        do_table_structure=False,  # controls if table structure is recovered 
+        do_table_structure=False,  # controls if table structure is recovered
         do_ocr=True,  # controls if OCR is applied (ignores programmatic content)
     ),
 )
@@ -90,7 +106,7 @@ conv_input = DocumentConversionInput.from_paths(
 )
 ```
 
-### Convert from binary PDF streams 
+### Convert from binary PDF streams
 
 You can convert PDFs from a binary stream instead of from the filesystem as follows:
 ```python
diff --git a/docling/document_converter.py b/docling/document_converter.py
index cf24c9a6..95b30a06 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -1,11 +1,15 @@
 import functools
 import logging
+import tempfile
 import time
 import traceback
 from pathlib import Path
 from typing import Iterable, Optional, Type, Union
 
+import requests
+from docling_core.types import Document
 from PIL import ImageDraw
+from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
 
 from docling.backend.abstract_backend import PdfDocumentBackend
 from docling.datamodel.base_models import (
@@ -32,6 +36,7 @@
 class DocumentConverter:
     _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
     _table_model_path = "model_artifacts/tableformer"
+    _default_download_filename = "file.pdf"
 
     def __init__(
         self,
@@ -80,6 +85,57 @@ def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]
             # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
             yield from map(self.process_document, input_batch)
 
+    def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
+        """Convert a single PDF, downloading it first when source is an HTTP(S) URL.
+
+        Args:
+            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
+
+        Raises:
+            ValueError: If source is neither a valid HTTP(S) URL nor a valid path.
+            RuntimeError: If the status is neither SUCCESS nor SUCCESS_WITH_ERRORS.
+
+        Returns:
+            Document: The converted document object.
+        """
+        with tempfile.TemporaryDirectory() as temp_dir:
+            try:
+                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
+                res = requests.get(http_url, stream=True)  # NOTE(review): no timeout
+                res.raise_for_status()
+                fname = None
+                # try to get filename from the Content-Disposition response header
+                if cont_disp := res.headers.get("Content-Disposition"):
+                    for par in cont_disp.strip().split(";"):
+                        # currently only handling directive "filename" (not "filename*")
+                        if (split := par.split("=")) and split[0].strip() == "filename":
+                            fname = "=".join(split[1:]).strip().strip("'\"") or None
+                            break
+                # otherwise, use the last URL path segment, or the default name if empty:
+                if fname is None:
+                    fname = Path(http_url.path).name or self._default_download_filename
+                local_path = Path(temp_dir) / fname  # NOTE(review): fname unsanitized
+                with open(local_path, "wb") as f:
+                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
+                        f.write(chunk)
+            except ValidationError:
+                try:
+                    local_path = TypeAdapter(Path).validate_python(source)
+                except ValidationError:
+                    raise ValueError(
+                        f"Unexpected file path type encountered: {type(source)}"
+                    )
+            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
+            converted_docs_iter = self.convert(conv_inp)
+            converted_doc: ConvertedDocument = next(converted_docs_iter)
+        if converted_doc.status not in {
+            ConversionStatus.SUCCESS,
+            ConversionStatus.SUCCESS_WITH_ERRORS,
+        }:
+            raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
+        doc = converted_doc.to_ds_document()
+        return doc
+
     def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
         start_doc_time = time.time()
         converted_doc = ConvertedDocument(input=in_doc)
diff --git a/poetry.lock b/poetry.lock
index 09f695a0..9715593d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2510,6 +2510,7 @@ description = "Nvidia JIT LTO Library"
 optional = false
 python-versions = ">=3"
 files = [
+    {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"},
     {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"},
     {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"},
 ]
@@ -4881,4 +4882,4 @@ ocr = ["easyocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "3ffc5161fd49fe2186ee2afbb3319922964661c769c434fc7386aae40f4aab19"
+content-hash = "dcb00c6601f61b087fd204d040149c20a7dcd72ab353e912e78dc265c86e4d00"
diff --git a/pyproject.toml b/pyproject.toml
index 544af9ae..a142eac2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,7 @@ filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
 pydantic-settings = "^2.3.0"
 huggingface_hub = ">=0.23,<1"
+requests = "^2.32.3"
 easyocr = { version = "^1.7", optional = true }
 
 [tool.poetry.group.dev.dependencies]