531 changes: 531 additions & 0 deletions docs/examples/plugin_tutorial.md

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions docs/examples/third_party_plugins/api_usage/api_usage_plugin.py
@@ -0,0 +1,7 @@
from api_usage.models.picture_description_api_model import (
PictureDescriptionApiModelWithUsage,
)


def picture_description():
return {"picture_description": [PictureDescriptionApiModelWithUsage]}
14 changes: 14 additions & 0 deletions docs/examples/third_party_plugins/api_usage/datamodel/pipeline_options/picture_description_api_options_with_usage.py
@@ -0,0 +1,14 @@
from typing import ClassVar, Literal

from docling.datamodel.pipeline_options import PictureDescriptionApiOptions


class PictureDescriptionApiOptionsWithUsage(PictureDescriptionApiOptions):
"""DescriptionAnnotation."""

kind: ClassVar[Literal["api_usage"]] = "api_usage"
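
The only addition over the parent class is the `kind` discriminator, which Docling's picture-description factory uses to route these options to the plugin's model class; the connection fields (url, headers, params, prompt, timeout, concurrency) are inherited from PictureDescriptionApiOptions. A minimal sketch of constructing the options directly:

from api_usage.datamodel.pipeline_options.picture_description_api_options_with_usage import (
    PictureDescriptionApiOptionsWithUsage,
)

# Inherited fields configure the remote endpoint; `kind` stays fixed at "api_usage".
opts = PictureDescriptionApiOptionsWithUsage(
    url="http://localhost:8000/v1/chat/completions",
    prompt="Describe the image in two or three sentences.",
    timeout=30.0,
)
print(opts.kind)  # -> "api_usage"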
70 changes: 70 additions & 0 deletions docs/examples/third_party_plugins/api_usage/datamodel/utils/api_image_request_with_usage.py
@@ -0,0 +1,70 @@
import base64
import logging
from io import BytesIO
from typing import Dict, Optional, Tuple

import requests
from PIL import Image
from pydantic import AnyUrl

from docling.datamodel.base_models import OpenAiApiResponse, OpenAiResponseUsage

_log = logging.getLogger(__name__)


def api_image_request_with_usage(
image: Image.Image,
prompt: str,
url: AnyUrl,
timeout: float = 20,
headers: Optional[Dict[str, str]] = None,
**params,
) -> Tuple[str, Optional[OpenAiResponseUsage]]:
"""Send an image+prompt to an OpenAI-compatible API and return (text, usage).

If no usage data is available, the second tuple element will be None.
"""
img_io = BytesIO()
image.save(img_io, "PNG")
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
},
{
"type": "text",
"text": prompt,
},
],
}
]

payload = {
"messages": messages,
**params,
}

headers = headers or {}

r = requests.post(
str(url),
headers=headers,
json=payload,
timeout=timeout,
)
if not r.ok:
_log.error(f"Error calling the API. Response was {r.text}")
r.raise_for_status()

api_resp = OpenAiApiResponse.model_validate_json(r.text)
generated_text = api_resp.choices[0].message.content.strip()

usage = api_resp.usage if hasattr(api_resp, "usage") else None

return generated_text, usage
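
The helper can also be exercised on its own, outside the pipeline. A sketch, assuming an OpenAI-compatible server is reachable at the URL below; the `model` keyword is not part of the signature and is simply forwarded through **params into the request payload:

from PIL import Image

img = Image.new("RGB", (256, 256), "white")  # placeholder image for the request
text, usage = api_image_request_with_usage(
    image=img,
    prompt="Describe the image.",
    url="http://localhost:8000/v1/chat/completions",
    model="my-vlm",  # hypothetical model name, forwarded via **params
)
print(text)
print(usage)  # None if the backend does not return a usage block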
124 changes: 124 additions & 0 deletions docs/examples/third_party_plugins/api_usage/models/picture_description_api_model.py
@@ -0,0 +1,124 @@
from collections.abc import Iterable
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import List, Optional, Tuple, Type, Union

from api_usage.datamodel.utils.api_image_request_with_usage import (
api_image_request_with_usage,
)
from docling_core.types.doc import DoclingDocument, NodeItem, PictureItem
from docling_core.types.doc.document import (
DescriptionAnnotation,
) # TODO: move import to docling_core.types.doc
from PIL import Image

from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import OpenAiResponseUsage
from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
from docling.exceptions import OperationNotAllowed
from docling.models.base_model import ItemAndImageEnrichmentElement
from docling.models.picture_description_api_model import PictureDescriptionApiModel
from api_usage.datamodel.pipeline_options.picture_description_api_options_with_usage import (
    PictureDescriptionApiOptionsWithUsage,
)


class DescriptionAnnotationWithUsage(DescriptionAnnotation):
"""DescriptionAnnotation."""

usage: Optional[OpenAiResponseUsage] = None


class PictureDescriptionApiModelWithUsage(PictureDescriptionApiModel):
# elements_batch_size = 4

@classmethod
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
return PictureDescriptionApiOptionsWithUsage

def __init__(
self,
enabled: bool,
enable_remote_services: bool,
artifacts_path: Optional[Union[Path, str]],
options: PictureDescriptionApiOptionsWithUsage,
accelerator_options: AcceleratorOptions,
):
super().__init__(
enabled=enabled,
enable_remote_services=enable_remote_services,
artifacts_path=artifacts_path,
options=options,
accelerator_options=accelerator_options,
)
self.options: PictureDescriptionApiOptionsWithUsage
self.concurrency = self.options.concurrency

if self.enabled:
if not enable_remote_services:
                raise OperationNotAllowed(
                    "Connections to remote services are only allowed when enabled "
                    "explicitly: set pipeline_options.enable_remote_services=True."
                )

    def _annotate_images(
        self, images: Iterable[Image.Image]
    ) -> Iterable[Tuple[str, Optional[OpenAiResponseUsage]]]:
# Note: technically we could make a batch request here,
# but not all APIs will allow for it. For example, vllm won't allow more than 1.
def _api_request(image):
return api_image_request_with_usage(
image=image,
prompt=self.options.prompt,
url=self.options.url,
timeout=self.options.timeout,
headers=self.options.headers,
**self.options.params,
)

with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
yield from executor.map(_api_request, images)

def __call__(
self,
doc: DoclingDocument,
element_batch: Iterable[ItemAndImageEnrichmentElement],
) -> Iterable[NodeItem]:
if not self.enabled:
for element in element_batch:
yield element.item
return

images: List[Image.Image] = []
elements: List[PictureItem] = []
for el in element_batch:
assert isinstance(el.item, PictureItem)
describe_image = True
# Don't describe the image if it's smaller than the threshold
if len(el.item.prov) > 0:
prov = el.item.prov[0] # PictureItems have at most a single provenance
page = doc.pages.get(prov.page_no)
if page is not None:
page_area = page.size.width * page.size.height
if page_area > 0:
area_fraction = prov.bbox.area() / page_area
if area_fraction < self.options.picture_area_threshold:
describe_image = False
if describe_image:
elements.append(el.item)
images.append(el.image)

outputs = self._annotate_images(images)

for item, output in zip(elements, outputs):
            # api_image_request_with_usage returns (text, usage); keep a defensive
            # fallback in case a plain string is ever yielded
if isinstance(output, tuple):
text, usage = output
else:
text, usage = output, None

item.annotations.append(
DescriptionAnnotationWithUsage(
text=text, provenance=self.provenance, usage=usage
)
)
yield item
97 changes: 97 additions & 0 deletions docs/examples/third_party_plugins/main.py
@@ -0,0 +1,97 @@
"""
Example: Docling pipeline using the third-party picture description plugin
located in docs/examples/third_party_plugins/api_usage.

Prerequisites:
- Ensure Docling is installed in the same Python environment.
- Install this example plugin in editable mode:
      pip install -e docs/examples/third_party_plugins
- Install python-dotenv (used below to read a local .env file), or remove the
  load_dotenv() call.
- Optionally set environment variables for the API backend:
      OPENAI_COMPATIBLE_API_URL, OPENAI_COMPATIBLE_API_KEY, OPENAI_COMPATIBLE_API_HEADER_NAME
  (or provide url/headers directly below)

Run:
python docs/examples/third_party_plugins/main.py
"""

import os
from typing import Dict

from dotenv import load_dotenv

# Import the options class from the installed example plugin package
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from api_usage.datamodel.pipeline_options.picture_description_api_options_with_usage import (
    PictureDescriptionApiOptionsWithUsage,
)

load_dotenv()


def main():
# Resolve a simple OpenAI-compatible backend from environment variables
url = os.getenv(
"OPENAI_COMPATIBLE_API_URL", "http://localhost:8000/v1/chat/completions"
)
key = os.getenv("OPENAI_COMPATIBLE_API_KEY")
header_name = os.getenv("OPENAI_COMPATIBLE_API_HEADER_NAME", "api-key")
headers: Dict[str, str] = {header_name: key} if key else {}

# Configure pipeline options
pipeline_options = PdfPipelineOptions()
pipeline_options.allow_external_plugins = True

    # Generate picture images so they can be sent to the remote description API
    pipeline_options.generate_picture_images = True
    pipeline_options.images_scale = 2  # render picture images at higher resolution
pipeline_options.do_picture_description = True

# Enable remote services (required for external API calls)
pipeline_options.enable_remote_services = True

# Configure picture description via the example plugin options
pipeline_options.picture_description_options = (
PictureDescriptionApiOptionsWithUsage(
url=url,
headers=headers,
params={"model": "gpt-5-mini", "temperature": 1},
prompt="Describe the image clearly and concisely in a few sentences.",
timeout=45.0,
concurrency=2,
)
)

# Create converter with the configured options
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)

# Convert the document (local path or URL)
source = os.getenv("SOURCE_DOCUMENT", "https://arxiv.org/pdf/2408.09869")
print(f"\nConverting source: {source}\n")

result = converter.convert(source)
doc = result.document

# Print the markdown result
print(doc.export_to_markdown())

# Print token usage for each picture annotation (if provided by backend)
for idx, pic in enumerate(doc.pictures):
print(f"\nPicture #{idx}:")
if not getattr(pic, "annotations", None):
print(" (no annotations)")
continue

for ann_idx, ann in enumerate(pic.annotations):
usage = getattr(ann, "usage", None)
ann_text = getattr(ann, "text", None)
print(f" Annotation {ann_idx}: text={ann_text!r} usage={usage!r}")


if __name__ == "__main__":
main()
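
Beyond printing each annotation, the per-picture usage can be rolled up into document-level totals. A sketch, assuming OpenAiResponseUsage exposes the usual OpenAI fields (prompt_tokens, completion_tokens); adjust the attribute names if the model differs:

def total_usage(doc):
    """Sum token usage over every picture annotation that carries it."""
    prompt_tokens = completion_tokens = 0
    for pic in doc.pictures:
        for ann in pic.annotations:
            usage = getattr(ann, "usage", None)
            if usage is not None:
                prompt_tokens += usage.prompt_tokens
                completion_tokens += usage.completion_tokens
    return prompt_tokens, completion_tokens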
25 changes: 25 additions & 0 deletions docs/examples/third_party_plugins/pyproject.toml
@@ -0,0 +1,25 @@
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "docling-plugin-api-usage-example"
version = "0.1.0"
description = "Example Docling third-party plugin: picture description via OpenAI-compatible API with token usage."
readme = "README.md"
requires-python = ">=3.10"
authors = [
{ name = "Docling Examples", email = "[email protected]" }
]
dependencies = [
"docling>=0.1.0", # pin to a compatible version for your environment
"pydantic>=2.0.0",
"Pillow>=10.0.0",
"requests>=2.31.0",
]

[project.entry-points."docling"]
api_usage_plugin = "api_usage.api_usage_plugin"

[tool.setuptools.packages.find]
# include the subpackages (api_usage.datamodel, api_usage.models, ...) as well
include = ["api_usage*"]
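
The entry point under the `docling` group is what makes the plugin discoverable: Docling scans that group and calls factory functions such as the `picture_description()` defined in api_usage/api_usage_plugin.py. Once the package is installed, discovery can be checked with the standard library alone (a sketch using importlib.metadata, Python 3.10+):

from importlib.metadata import entry_points

for ep in entry_points(group="docling"):
    print(ep.name, "->", ep.value)  # e.g. api_usage_plugin -> api_usage.api_usage_plugin
    module = ep.load()  # imports api_usage.api_usage_plugin
    if hasattr(module, "picture_description"):
        print(module.picture_description())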
2 changes: 2 additions & 0 deletions mkdocs.yml
@@ -111,6 +111,8 @@ nav:
- 🖼️ Picture annotation:
- "Annotate picture with local VLM": examples/pictures_description.ipynb
- "Annotate picture with remote VLM": examples/pictures_description_api.py
- 🧩 Plugins:
- "Third‑party plugin tutorial": examples/plugin_tutorial.md
- ✨ Enrichment development:
- "Figure enrichment": examples/develop_picture_enrichment.py
- "Formula enrichment": examples/develop_formula_understanding.py