From c2ae1cc4cab0f9e693c7ca460fe8afa5b515ee94 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Sun, 26 Jan 2025 08:10:33 +0100
Subject: [PATCH] docs: description of supported formats and backends (#788)

* chore: remove type-ignore marks for attaching text to non GroupItems

After commit b74208 of docling-core, text items can be attached to any NodeItem
and therefore the ignore[arg-type] type marks can be removed.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* test: remove unnecessary imports

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* docs: add documentation on supported formats and backends

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* docs: add notebook example with XML backends

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---
 docling/backend/xml/uspto_backend.py |   50 +-
 docs/examples/backend_xml_rag.ipynb  | 1078 ++++++++++++++++++++++++++
 docs/usage.md                        |   40 +
 mkdocs.yml                           |    3 +-
 tests/test_backend_msexcel.py        |    7 +-
 tests/test_backend_patent_uspto.py   |    9 +-
 tests/test_backend_pubmed.py         |    1 -
 7 files changed, 1147 insertions(+), 41 deletions(-)
 create mode 100644 docs/examples/backend_xml_rag.ipynb

diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
index ef253b21..21001ab7 100644
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -389,7 +389,7 @@ def _add_property(self, name: str, text: str) -> None:
             if name == self.Element.TITLE.value:
                 if text:
                     self.parents[self.level + 1] = self.doc.add_title(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                         text=text,
                     )
                     self.level += 1
@@ -406,7 +406,7 @@ def _add_property(self, name: str, text: str) -> None:
                     abstract_item = self.doc.add_heading(
                         heading_text,
                         level=heading_level,
-                        parent=self.parents[heading_level],  # type: ignore[arg-type]
+                        parent=self.parents[heading_level],
                     )
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
@@ -434,7 +434,7 @@ def _add_property(self, name: str, text: str) -> None:
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -452,7 +452,7 @@ def _add_property(self, name: str, text: str) -> None:
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
                         text=text,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                     )
                 self.text = ""
 
@@ -460,7 +460,7 @@ def _add_property(self, name: str, text: str) -> None:
                 self.parents[self.level + 1] = self.doc.add_heading(
                     text=text,
                     level=self.level,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
                 self.level += 1
                 self.text = ""
@@ -470,7 +470,7 @@ def _add_property(self, name: str, text: str) -> None:
                 empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                 self.doc.add_table(
                     data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
 
         def _apply_style(self, text: str, style_tag: str) -> str:
@@ -721,7 +721,7 @@ def _add_property(self, name: str, text: str) -> None:
                 if self.Element.TITLE.value in self.property and text.strip():
                     title = text.strip()
                     self.parents[self.level + 1] = self.doc.add_title(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                         text=title,
                     )
                     self.level += 1
@@ -749,7 +749,7 @@ def _add_property(self, name: str, text: str) -> None:
                     self.parents[self.level + 1] = self.doc.add_heading(
                         text=text.strip(),
                         level=self.level,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                     )
                     self.level += 1
 
@@ -769,7 +769,7 @@ def _add_property(self, name: str, text: str) -> None:
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -787,7 +787,7 @@ def _add_property(self, name: str, text: str) -> None:
                 abstract_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
@@ -799,7 +799,7 @@ def _add_property(self, name: str, text: str) -> None:
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
                         text=paragraph,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                     )
                 elif self.Element.CLAIM.value in self.property:
                     # we may need a space after a paragraph in claim text
@@ -811,7 +811,7 @@ def _add_property(self, name: str, text: str) -> None:
                 empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                 self.doc.add_table(
                     data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
 
         def _apply_style(self, text: str, style_tag: str) -> str:
@@ -938,7 +938,7 @@ def store_section(self, section: str) -> None:
         self.parents[self.level + 1] = self.doc.add_heading(
             heading.value,
             level=self.level,
-            parent=self.parents[self.level],  # type: ignore[arg-type]
+            parent=self.parents[self.level],
         )
         self.level += 1
 
@@ -959,7 +959,7 @@ def store_content(self, section: str, field: str, value: str) -> None:
 
         if field == self.Field.TITLE.value:
             self.parents[self.level + 1] = self.doc.add_title(
-                parent=self.parents[self.level], text=value  # type: ignore[arg-type]
+                parent=self.parents[self.level], text=value
             )
             self.level += 1
 
@@ -971,14 +971,14 @@ def store_content(self, section: str, field: str, value: str) -> None:
                 self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH,
                     text=value,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
 
         elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
             self.doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 text="",
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
 
         elif (
@@ -996,7 +996,7 @@ def store_content(self, section: str, field: str, value: str) -> None:
                 last_claim = self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH,
                     text="",
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
 
             last_claim.text += f" {value}" if last_claim.text else value
@@ -1012,7 +1012,7 @@ def store_content(self, section: str, field: str, value: str) -> None:
             self.parents[self.level + 1] = self.doc.add_heading(
                 value,
                 level=self.level,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
             self.level += 1
 
@@ -1029,7 +1029,7 @@ def store_content(self, section: str, field: str, value: str) -> None:
             self.doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 text=value,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
 
     def parse(self, patent_content: str) -> Optional[DoclingDocument]:
@@ -1283,7 +1283,7 @@ def _add_property(self, name: str, text: str) -> None:
                 title = text.strip()
                 if title:
                     self.parents[self.level + 1] = self.doc.add_text(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                         label=DocItemLabel.TITLE,
                         text=title,
                     )
@@ -1301,7 +1301,7 @@ def _add_property(self, name: str, text: str) -> None:
                     abstract_item = self.doc.add_heading(
                         heading_text,
                         level=heading_level,
-                        parent=self.parents[heading_level],  # type: ignore[arg-type]
+                        parent=self.parents[heading_level],
                     )
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
@@ -1331,7 +1331,7 @@ def _add_property(self, name: str, text: str) -> None:
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -1350,14 +1350,14 @@ def _add_property(self, name: str, text: str) -> None:
                         self.parents[self.level + 1] = self.doc.add_heading(
                             text=text,
                             level=self.level,
-                            parent=self.parents[self.level],  # type: ignore[arg-type]
+                            parent=self.parents[self.level],
                         )
                         self.level += 1
                     else:
                         self.doc.add_text(
                             label=DocItemLabel.PARAGRAPH,
                             text=text,
-                            parent=self.parents[self.level],  # type: ignore[arg-type]
+                            parent=self.parents[self.level],
                         )
                 self.text = ""
 
@@ -1366,7 +1366,7 @@ def _add_property(self, name: str, text: str) -> None:
                 empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                 self.doc.add_table(
                     data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
 
         def _apply_style(self, text: str, style_tag: str) -> str:
diff --git a/docs/examples/backend_xml_rag.ipynb b/docs/examples/backend_xml_rag.ipynb
new file mode 100644
index 00000000..aef8ce00
--- /dev/null
+++ b/docs/examples/backend_xml_rag.ipynb
@@ -0,0 +1,1078 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/backend_xml_rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Conversion of custom XML"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "| Step | Tech | Execution | \n",
+    "| --- | --- | --- |\n",
+    "| Embedding | Hugging Face / Sentence Transformers | 💻 Local |\n",
+    "| Vector store | Milvus | 💻 Local |\n",
+    "| Gen AI | Hugging Face Inference API | 🌐 Remote | "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Overview"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is an example of using [Docling](https://ds4sd.github.io/docling/) for converting structured data (XML) into a unified document\n",
+    "representation format, `DoclingDocument`, and leveraging its rich structured content for RAG applications.\n",
+    "\n",
+    "Data used in this example consist of patents from the [United States Patent and Trademark Office (USPTO)](https://www.uspto.gov/) and medical\n",
+    "articles from [PubMed Central® (PMC)](https://pmc.ncbi.nlm.nih.gov/).\n",
+    "\n",
+    "In this notebook, we accomplish the following:\n",
+    "- [Simple conversion](#simple-conversion) of supported XML files in a nutshell\n",
+    "- An [end-to-end application](#end-to-end-application) using public collections of XML files supported by Docling\n",
+    "  - [Setup](#setup) the API access for generative AI\n",
+    "  - [Fetch the data](#fetch-the-data) from USPTO and PubMed Central® sites, using Docling custom backends\n",
+    "  - [Parse, chunk, and index](#parse-chunk-and-index) the documents in a vector database\n",
+    "  - [Perform RAG](#question-answering-with-rag) using [LlamaIndex Docling extension](../../integrations/llamaindex/)\n",
+    "  - [Delete the temporary files](#delete-temporary-files) used in this notebook\n",
+    "\n",
+    "For more details on document chunking with Docling, refer to the [Chunking](../../concepts/chunking/) documentation. For RAG with Docling and LlamaIndex, also check the example [RAG with LlamaIndex](../rag_llamaindex/)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Simple conversion\n",
+    "\n",
+    "The XML file format defines and stores data in a format that is both human-readable and machine-readable.\n",
+    "Because of this flexibility, Docling requires custom backend processors to interpret XML definitions and convert them into `DoclingDocument` objects.\n",
+    "\n",
+    "Some public data collections in XML format are already supported by Docling (USPTO patents and PMC articles). In these cases, the document conversion is straightforward and the same as with any other supported format, such as PDF or HTML. The execution example in [Simple Conversion](../minimal/) is the recommended usage of Docling for a single file:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ConversionStatus.SUCCESS\n"
+     ]
+    }
+   ],
+   "source": [
+    "from docling.document_converter import DocumentConverter\n",
+    "\n",
+    "# a sample PMC article:\n",
+    "source = \"../../tests/data/pubmed/elife-56337.nxml\"\n",
+    "converter = DocumentConverter()\n",
+    "result = converter.convert(source)\n",
+    "print(result.status)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once the document is converted, it can be exported to any format supported by Docling. For instance, to markdown (showing here the first lines only):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n",
+      "\n",
+      "Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States\n",
+      "\n",
+      "## Abstract\n",
+      "\n",
+      "The Krüppel-associated box zinc finger protein (KRAB-ZFP) family diversified in mammals. The majority of human KRAB-ZFPs bind transposable elements (TEs), however, since most TEs are inactive in humans it is unclear whether KRAB-ZFPs emerged to suppress TEs. We demonstrate that many recently emerged murine KRAB-ZFPs also bind to TEs, including the active ETn, IAP, and L1 families. Using a CRISPR/Cas9-based engineering approach, we genetically deleted five large clusters of KRAB-ZFPs and demonstrate that target TEs are de-repressed, unleashing TE-encoded enhancers. Homozygous knockout mice lacking one of two KRAB-ZFP gene clusters on chromosome 2 and chromosome 4 were nonetheless viable. In pedigrees of chromosome 4 cluster KRAB-ZFP mutants, we identified numerous novel ETn insertions with a modest increase in mutants. Our data strongly support the current model that recent waves of retrotransposon activity drove the expansion of KRAB-ZFP genes in mice and that many KRAB-ZFPs play a redundant role restricting TE activity.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "md_doc = result.document.export_to_markdown()\n",
+    "\n",
+    "delim = \"\\n\"\n",
+    "print(delim.join(md_doc.split(delim)[:8]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If the XML file is not supported, a `ConversionError` will be raised."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Input document docling_test.xml does not match any allowed format.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File format not allowed: docling_test.xml\n"
+     ]
+    }
+   ],
+   "source": [
+    "from io import BytesIO\n",
+    "\n",
+    "from docling.datamodel.base_models import DocumentStream\n",
+    "from docling.exceptions import ConversionError\n",
+    "\n",
+    "xml_content = (\n",
+    "    b'<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE docling_test SYSTEM '\n",
+    "    b'\"test.dtd\"><docling>Random content</docling>'\n",
+    ")\n",
+    "stream = DocumentStream(name=\"docling_test.xml\", stream=BytesIO(xml_content))\n",
+    "try:\n",
+    "    result = converter.convert(stream)\n",
+    "except ConversionError as ce:\n",
+    "    print(ce)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can always refer to the [Usage](../../usage/#supported-formats) documentation page for a list of supported formats."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## End-to-end application\n",
+    "\n",
+    "This section describes a step-by-step application for processing XML files from supported public collections and using them for question-answering."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Requirements can be installed as shown below. The `--no-warn-conflicts` argument is meant for Colab's pre-populated Python environment; feel free to remove it for stricter usage."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus llama-index-readers-file python-dotenv"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook uses HuggingFace's Inference API. For an increased LLM quota, a token can be provided via the environment variable `HF_TOKEN`.\n",
+    "\n",
+    "If you're running this notebook in Google Colab, make sure you [add](https://medium.com/@parthdasawant/how-to-use-secrets-in-google-colab-450c38e3ec75) your API key as a secret."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from warnings import filterwarnings\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "\n",
+    "def _get_env_from_colab_or_os(key):\n",
+    "    try:\n",
+    "        from google.colab import userdata\n",
+    "\n",
+    "        try:\n",
+    "            return userdata.get(key)\n",
+    "        except userdata.SecretNotFoundError:\n",
+    "            pass\n",
+    "    except ImportError:\n",
+    "        pass\n",
+    "    return os.getenv(key)\n",
+    "\n",
+    "\n",
+    "load_dotenv()\n",
+    "\n",
+    "filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can now define the main parameters:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "from tempfile import mkdtemp\n",
+    "\n",
+    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
+    "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
+    "\n",
+    "EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
+    "EMBED_MODEL = HuggingFaceEmbedding(model_name=EMBED_MODEL_ID)\n",
+    "TEMP_DIR = Path(mkdtemp())\n",
+    "MILVUS_URI = str(TEMP_DIR / \"docling.db\")\n",
+    "GEN_MODEL = HuggingFaceInferenceAPI(\n",
+    "    token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n",
+    "    model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
+    ")\n",
+    "embed_dim = len(EMBED_MODEL.get_text_embedding(\"hi\"))\n",
+    "# https://github.com/huggingface/transformers/issues/5486:\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Fetch the data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this notebook we will use XML data from collections supported by Docling:\n",
+    "- Medical articles from the [PubMed Central® (PMC)](https://pmc.ncbi.nlm.nih.gov/). They are available in an [FTP server](https://ftp.ncbi.nlm.nih.gov/pub/pmc/) as `.tar.gz` files. Each file contains the full article data in XML format, among other supplementary files like images or spreadsheets.\n",
+    "- Patents from the [United States Patent and Trademark Office](https://www.uspto.gov/). They are available in the [Bulk Data Storage System (BDSS)](https://bulkdata.uspto.gov/) as zip files. Each zip file may contain several patents in XML format.\n",
+    "\n",
+    "The raw files will be downloaded from the source and saved in a temporary directory."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### PMC articles\n",
+    "\n",
+    "The [OA file](https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.csv) is a manifest file of all the PMC articles, including the URL path to download the source files. In this notebook we will use as example the article [Pathogens spread by high-altitude windborne mosquitoes](https://pmc.ncbi.nlm.nih.gov/articles/PMC11703268/), which is available in the archive file [PMC11703268.tar.gz](https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/e3/6b/PMC11703268.tar.gz)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/e3/6b/PMC11703268.tar.gz...\n",
+      "Extracting and storing the XML file containing the article text...\n",
+      "Stored XML file nihpp-2024.12.26.630351v1.nxml\n"
+     ]
+    }
+   ],
+   "source": [
+    "import tarfile\n",
+    "from io import BytesIO\n",
+    "\n",
+    "import requests\n",
+    "\n",
+    "# PMC article PMC11703268\n",
+    "url: str = \"https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/e3/6b/PMC11703268.tar.gz\"\n",
+    "\n",
+    "print(f\"Downloading {url}...\")\n",
+    "buf = BytesIO(requests.get(url).content)\n",
+    "print(\"Extracting and storing the XML file containing the article text...\")\n",
+    "with tarfile.open(fileobj=buf, mode=\"r:gz\") as tar_file:\n",
+    "    for tarinfo in tar_file:\n",
+    "        if tarinfo.isreg():\n",
+    "            file_path = Path(tarinfo.name)\n",
+    "            if file_path.suffix == \".nxml\":\n",
+    "                with open(TEMP_DIR / file_path.name, \"wb\") as file_obj:\n",
+    "                    file_obj.write(tar_file.extractfile(tarinfo).read())\n",
+    "                print(f\"Stored XML file {file_path.name}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### USPTO patents\n",
+    "\n",
+    "Since each USPTO file is a concatenation of several patents, we need to split its content into valid XML pieces. The following code downloads a sample zip file, splits its content into sections, and dumps each section as an XML file. For simplicity, this pipeline is shown here in a sequential manner, but it could be parallelized."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import zipfile\n",
+    "\n",
+    "# Patent grants from December 17-23, 2024\n",
+    "url: str = (\n",
+    "    \"https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip\"\n",
+    ")\n",
+    "XML_SPLITTER: str = '<?xml version=\"1.0\"'\n",
+    "doc_num: int = 0\n",
+    "\n",
+    "print(f\"Downloading {url}...\")\n",
+    "buf = BytesIO(requests.get(url).content)\n",
+    "print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
+    "with zipfile.ZipFile(buf) as zf:\n",
+    "    res = zf.testzip()\n",
+    "    if res:\n",
+    "        print(\"Error validating zip file\")\n",
+    "    else:\n",
+    "        with zf.open(zf.namelist()[0]) as xf:\n",
+    "            is_patent = False\n",
+    "            patent_buffer = BytesIO()\n",
+    "            for xf_line in xf:\n",
+    "                decoded_line = xf_line.decode(errors=\"ignore\").rstrip()\n",
+    "                xml_index = decoded_line.find(XML_SPLITTER)\n",
+    "                if xml_index != -1:\n",
+    "                    if (\n",
+    "                        xml_index > 0\n",
+    "                    ):  # cases like </sequence-cwu><?xml version=\"1.0\"...\n",
+    "                        patent_buffer.write(xf_line[:xml_index])\n",
+    "                        patent_buffer.write(b\"\\r\\n\")\n",
+    "                        xf_line = xf_line[xml_index:]\n",
+    "                    if patent_buffer.getbuffer().nbytes > 0 and is_patent:\n",
+    "                        doc_num += 1\n",
+    "                        patent_id = f\"ipg241217-{doc_num}\"\n",
+    "                        with open(TEMP_DIR / f\"{patent_id}.xml\", \"wb\") as file_obj:\n",
+    "                            file_obj.write(patent_buffer.getbuffer())\n",
+    "                    is_patent = False\n",
+    "                    patent_buffer = BytesIO()\n",
+    "                elif decoded_line.startswith(\"<!DOCTYPE\"):\n",
+    "                    is_patent = True\n",
+    "                patent_buffer.write(xf_line)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fetched and exported 4014 documents.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"Fetched and exported {doc_num} documents.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Using the backend converter (optional)\n",
+    "\n",
+    "- The custom backend converters `PubMedDocumentBackend` and `PatentUsptoDocumentBackend` aim at handling the parsing of PMC articles and USPTO patents, respectively.\n",
+    "- As with any other backend, you can leverage the function `is_valid()` to check if the input document is supported by this backend.\n",
+    "- Note that some XML sections in the original USPTO zip file may not represent patents, like sequence listings, and therefore they will show as invalid by the backend."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Document nihpp-2024.12.26.630351v1.nxml is a valid PMC article? True\n",
+      "Document ipg241217-1.xml is a valid patent? True\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3964d1ff30f74588a2f6b53ca8865a9f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/4014 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found 3928 patents out of 4014 XML files.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from tqdm.notebook import tqdm\n",
+    "\n",
+    "from docling.backend.xml.pubmed_backend import PubMedDocumentBackend\n",
+    "from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n",
+    "from docling.datamodel.base_models import InputFormat\n",
+    "from docling.datamodel.document import InputDocument\n",
+    "\n",
+    "# check PMC\n",
+    "in_doc = InputDocument(\n",
+    "    path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n",
+    "    format=InputFormat.XML_PUBMED,\n",
+    "    backend=PubMedDocumentBackend,\n",
+    ")\n",
+    "backend = PubMedDocumentBackend(\n",
+    "    in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n",
+    ")\n",
+    "print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n",
+    "\n",
+    "# check USPTO\n",
+    "in_doc = InputDocument(\n",
+    "    path_or_stream=TEMP_DIR / \"ipg241217-1.xml\",\n",
+    "    format=InputFormat.XML_USPTO,\n",
+    "    backend=PatentUsptoDocumentBackend,\n",
+    ")\n",
+    "backend = PatentUsptoDocumentBackend(\n",
+    "    in_doc=in_doc, path_or_stream=TEMP_DIR / \"ipg241217-1.xml\"\n",
+    ")\n",
+    "print(f\"Document {in_doc.file.name} is a valid patent? {backend.is_valid()}\")\n",
+    "\n",
+    "patent_valid = 0\n",
+    "pbar = tqdm(TEMP_DIR.glob(\"*.xml\"), total=doc_num)\n",
+    "for in_path in pbar:\n",
+    "    in_doc = InputDocument(\n",
+    "        path_or_stream=in_path,\n",
+    "        format=InputFormat.XML_USPTO,\n",
+    "        backend=PatentUsptoDocumentBackend,\n",
+    "    )\n",
+    "    backend = PatentUsptoDocumentBackend(in_doc=in_doc, path_or_stream=in_path)\n",
+    "    patent_valid += int(backend.is_valid())\n",
+    "\n",
+    "print(f\"Found {patent_valid} patents out of {doc_num} XML files.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Calling the function `convert()` will convert the input document into a `DoclingDocument`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Patent \"Semiconductor package\" has 19 claims\n"
+     ]
+    }
+   ],
+   "source": [
+    "doc = backend.convert()\n",
+    "\n",
+    "claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
+    "print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or PubMed XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Parse, chunk, and index"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `DoclingDocument` format of the converted patents has a rich hierarchical structure, inherited from the original XML document and preserved by the Docling custom backend.\n",
+    "In this notebook, we will leverage:\n",
+    "- The `SimpleDirectoryReader` pattern to iterate over the exported XML files created in section [Fetch the data](#fetch-the-data).\n",
+    "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vector store.\n",
+    "- The `HierarchicalChunker` implementation, which applies a document-based hierarchical chunking, to leverage the patent structures like sections and paragraphs within sections.\n",
+    "\n",
+    "Refer to other possible implementations and usage patterns in the [Chunking](../../concepts/chunking/) documentation and the [RAG with LlamaIndex](../rag_llamaindex/) notebook."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Set the Docling reader and the directory reader\n",
+    "\n",
+    "Note that `DoclingReader` uses Docling's `DocumentConverter` by default and therefore it will recognize the format of the XML files and leverage the `PatentUsptoDocumentBackend` automatically.\n",
+    "\n",
+    "For demonstration purposes, we limit the scope of the analysis to the first 100 patents."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import SimpleDirectoryReader\n",
+    "from llama_index.readers.docling import DoclingReader\n",
+    "\n",
+    "reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)\n",
+    "dir_reader = SimpleDirectoryReader(\n",
+    "    input_dir=TEMP_DIR,\n",
+    "    exclude=[\"docling.db\", \"*.nxml\"],\n",
+    "    file_extractor={\".xml\": reader},\n",
+    "    filename_as_id=True,\n",
+    "    num_files_limit=100,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Set the node parser\n",
+    "\n",
+    "Note that the `HierarchicalChunker` is the default chunking implementation of the `DoclingNodeParser`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.node_parser.docling import DoclingNodeParser\n",
+    "\n",
+    "node_parser = DoclingNodeParser()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Set a local Milvus database and run the ingestion"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-01-24 16:49:57,108 [DEBUG][_create_connection]: Created new connection using: 2d58fad6c63448a486c0c0ffe3b7b28c (async_milvus_client.py:600)\n",
+      "Loading files:  51%|█████     | 51/100 [00:00<00:00, 67.88file/s]Input document ipg241217-1050.xml does not match any allowed format.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to load file /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml with error: File format not allowed: /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml. Skipping...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading files: 100%|██████████| 100/100 [00:01<00:00, 58.05file/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e9208639f1a4418d97267a28305d18fa",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Parsing nodes:   0%|          | 0/99 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "88026613f6f44f0c8476dceaa1cb78cd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7522b8b434b54616b4cfc3d71e9556d7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5879d8161c2041f5b100959e69ff9017",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "557912b5e3c741f3a06127156bc46379",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "843bb145942b449aa55fc5b8208da734",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c7dba09a4aed422998e9b9c2c3a70317",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0bd031356c7e4e879dcbe1d04e6c4a4e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating embeddings:   0%|          | 0/425 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from llama_index.core import StorageContext, VectorStoreIndex\n",
+    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
+    "\n",
+    "vector_store = MilvusVectorStore(\n",
+    "    uri=MILVUS_URI,\n",
+    "    dim=embed_dim,\n",
+    "    overwrite=True,\n",
+    ")\n",
+    "\n",
+    "index = VectorStoreIndex.from_documents(\n",
+    "    documents=dir_reader.load_data(show_progress=True),\n",
+    "    transformations=[node_parser],\n",
+    "    storage_context=StorageContext.from_defaults(vector_store=vector_store),\n",
+    "    embed_model=EMBED_MODEL,\n",
+    "    show_progress=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, add the PMC article to the vector store directly from the reader."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x373a7f7d0>"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "index.from_documents(\n",
+    "    documents=reader.load_data(TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"),\n",
+    "    transformations=[node_parser],\n",
+    "    storage_context=StorageContext.from_defaults(vector_store=vector_store),\n",
+    "    embed_model=EMBED_MODEL,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Question-answering with RAG"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The retriever can be used to identify highly relevant documents:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Node ID: 5afd36c0-a739-4a88-a51c-6d0f75358db5\n",
+      "Text: The portable fitness monitoring device 102 may be a device such\n",
+      "as, for example, a mobile phone, a personal digital assistant, a music\n",
+      "file player (e.g. and MP3 player), an intelligent article for wearing\n",
+      "(e.g. a fitness monitoring garment, wrist band, or watch), a dongle\n",
+      "(e.g. a small hardware device that protects software) that includes a\n",
+      "fitn...\n",
+      "Score:  0.772\n",
+      "\n",
+      "Node ID: f294b5fd-9089-43cb-8c4e-d1095a634ff1\n",
+      "Text: US Patent Application US 20120071306 entitled “Portable\n",
+      "Multipurpose Whole Body Exercise Device” discloses a portable\n",
+      "multipurpose whole body exercise device which can be used for general\n",
+      "fitness, Pilates-type, core strengthening, therapeutic, and\n",
+      "rehabilitative exercises as well as stretching and physical therapy\n",
+      "and which includes storable acc...\n",
+      "Score:  0.749\n",
+      "\n",
+      "Node ID: 8251c7ef-1165-42e1-8c91-c99c8a711bf7\n",
+      "Text: Program products, methods, and systems for providing fitness\n",
+      "monitoring services of the present invention can include any software\n",
+      "application executed by one or more computing devices. A computing\n",
+      "device can be any type of computing device having one or more\n",
+      "processors. For example, a computing device can be a workstation,\n",
+      "mobile device (e.g., ...\n",
+      "Score:  0.744\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "retriever = index.as_retriever(similarity_top_k=3)\n",
+    "results = retriever.retrieve(\"What patents are related to fitness devices?\")\n",
+    "\n",
+    "for item in results:\n",
+    "    print(item)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With the query engine, we can run the question-answering with the RAG pattern on the set of indexed documents.\n",
+    "\n",
+    "First, we can prompt the LLM directly:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╭──────────────────────────────────────────────────── Prompt ─────────────────────────────────────────────────────╮</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│</span> Do mosquitoes in high altitude expand viruses over large distances?                                             <span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;31m╭─\u001b[0m\u001b[1;31m───────────────────────────────────────────────────\u001b[0m\u001b[1;31m Prompt \u001b[0m\u001b[1;31m────────────────────────────────────────────────────\u001b[0m\u001b[1;31m─╮\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m Do mosquitoes in high altitude expand viruses over large distances?                                             \u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╭─────────────────────────────────────────────── Generated Content ───────────────────────────────────────────────╮</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> Mosquitoes can be found at high altitudes, but their ability to transmit viruses over long distances is not     <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> primarily dependent on altitude. Mosquitoes are vectors for various diseases, such as malaria, dengue fever,    <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> and Zika virus, and their transmission range is more closely related to their movement, the presence of a host, <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> and environmental conditions that support their survival and reproduction.                                      <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>                                                                                                                 <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> At high altitudes, the environment can be less suitable for mosquitoes due to factors such as colder            <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> temperatures, lower humidity, and stronger winds, which can limit their population size and distribution.       <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> However, some species of mosquitoes have adapted to high-altitude environments and can still transmit diseases  <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> in these areas.                                                                                                 <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>                                                                                                                 <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> It is possible for mosquitoes to be transported by wind or human activities to higher altitudes, but this is    <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> not a significant factor in their ability to transmit viruses over long distances. Instead, long-distance       <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> transmission of viruses is more often associated with human travel and transportation, which can rapidly spread <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> infected mosquitoes or humans to new areas, leading to the spread of disease.                                   <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;32m╭─\u001b[0m\u001b[1;32m──────────────────────────────────────────────\u001b[0m\u001b[1;32m Generated Content \u001b[0m\u001b[1;32m──────────────────────────────────────────────\u001b[0m\u001b[1;32m─╮\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m Mosquitoes can be found at high altitudes, but their ability to transmit viruses over long distances is not     \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m primarily dependent on altitude. Mosquitoes are vectors for various diseases, such as malaria, dengue fever,    \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m and Zika virus, and their transmission range is more closely related to their movement, the presence of a host, \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m and environmental conditions that support their survival and reproduction.                                      \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m                                                                                                                 \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m At high altitudes, the environment can be less suitable for mosquitoes due to factors such as colder            \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m temperatures, lower humidity, and stronger winds, which can limit their population size and distribution.       \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m However, some species of mosquitoes have adapted to high-altitude environments and can still transmit diseases  \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m in these areas.                                                                                                 \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m                                                                                                                 \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m It is possible for mosquitoes to be transported by wind or human activities to higher altitudes, but this is    \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m not a significant factor in their ability to transmit viruses over long distances. Instead, long-distance       \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m transmission of viruses is more often associated with human travel and transportation, which can rapidly spread \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m infected mosquitoes or humans to new areas, leading to the spread of disease.                                   \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from llama_index.core.base.llms.types import ChatMessage, MessageRole\n",
+    "from rich.console import Console\n",
+    "from rich.panel import Panel\n",
+    "\n",
+    "console = Console()\n",
+    "query = \"Do mosquitoes in high altitude expand viruses over large distances?\"\n",
+    "\n",
+    "usr_msg = ChatMessage(role=MessageRole.USER, content=query)\n",
+    "response = GEN_MODEL.chat(messages=[usr_msg])\n",
+    "\n",
+    "console.print(Panel(query, title=\"Prompt\", border_style=\"bold red\"))\n",
+    "console.print(\n",
+    "    Panel(\n",
+    "        response.message.content.strip(),\n",
+    "        title=\"Generated Content\",\n",
+    "        border_style=\"bold green\",\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, we can compare the response when the model is prompted with the indexed PMC article as supporting context:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╭────────────────────────────────────────── Generated Content with RAG ───────────────────────────────────────────╮</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> Yes, mosquitoes in high altitude can expand viruses over large distances. A study intercepted 1,017 female      <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> mosquitoes at altitudes of 120-290 m above ground over Mali and Ghana and screened them for infection with      <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> arboviruses, plasmodia, and filariae. The study found that 3.5% of the mosquitoes were infected with            <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> flaviviruses, and 1.1% were infectious. Additionally, the study identified 19 mosquito-borne pathogens,         <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> including three arboviruses that affect humans (dengue, West Nile, and M’Poko viruses). The study provides      <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> compelling evidence that mosquito-borne pathogens are often spread by windborne mosquitoes at altitude.         <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
+       "<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;32m╭─\u001b[0m\u001b[1;32m─────────────────────────────────────────\u001b[0m\u001b[1;32m Generated Content with RAG \u001b[0m\u001b[1;32m──────────────────────────────────────────\u001b[0m\u001b[1;32m─╮\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m Yes, mosquitoes in high altitude can expand viruses over large distances. A study intercepted 1,017 female      \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m mosquitoes at altitudes of 120-290 m above ground over Mali and Ghana and screened them for infection with      \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m arboviruses, plasmodia, and filariae. The study found that 3.5% of the mosquitoes were infected with            \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m flaviviruses, and 1.1% were infectious. Additionally, the study identified 19 mosquito-borne pathogens,         \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m including three arboviruses that affect humans (dengue, West Nile, and M’Poko viruses). The study provides      \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m│\u001b[0m compelling evidence that mosquito-borne pathogens are often spread by windborne mosquitoes at altitude.         \u001b[1;32m│\u001b[0m\n",
+       "\u001b[1;32m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters\n",
+    "\n",
+    "filters = MetadataFilters(\n",
+    "    filters=[\n",
+    "        ExactMatchFilter(key=\"filename\", value=\"nihpp-2024.12.26.630351v1.nxml\"),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "query_engine = index.as_query_engine(llm=GEN_MODEL, filter=filters, similarity_top_k=3)\n",
+    "result = query_engine.query(query)\n",
+    "\n",
+    "console.print(\n",
+    "    Panel(\n",
+    "        result.response.strip(),\n",
+    "        title=\"Generated Content with RAG\",\n",
+    "        border_style=\"bold green\",\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Delete temporary files\n",
+    "\n",
+    "The XML files used in this notebook, as well as the Milvus local database, will be removed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import shutil\n",
+    "\n",
+    "shutil.rmtree(TEMP_DIR)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/usage.md b/docs/usage.md
index 9a5b555a..824f0f22 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -24,6 +24,20 @@ docling https://arxiv.org/pdf/2206.01062
 
 To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
 
+### Supported formats
+
+The document conversion in Docling supports several popular formats, including:
+
+- **PDF** (Portable Document Format): the format developed by Adobe to present documents compatible across application software, hardware, and operating systems.
+- **.docx**, **.xlsx**, **.pptx** (Word, Excel, and PowerPoint): the Open XML formats supported by Microsoft Office.
+- **Markdown**: a lightweight markup language to add formatting elements to plain text documents.
+- **AsciiDoc**: a plain text markup language for writing technical content.
+- **HTML** (Hypertext Markup Language): the standard markup language for creating web pages.
+- **XHTML** (Extensible Hypertext Markup Language): the XML-based version of HTML.
+- **XML** (Extensible Markup Language): a markup format for storing and transmitting data. Due to the flexibility of XML, Docling requires custom implementations to identify the
+semantics of the data. Currently, Docling supports the parsing of [USPTO](https://www.uspto.gov/patents) patents and [PubMed Central® (PMC)](https://pmc.ncbi.nlm.nih.gov/) articles.
+
+
 ### Advanced options
 
 #### Adjust pipeline features
@@ -126,6 +140,32 @@ result = converter.convert(source)
 You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
 
 
+#### Use specific backend converters
+
+By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](#supported-formats)).
+You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](./examples/run_with_formats.py) example.
+Alternatively, you can also use the specific backend that matches your document content. For instance, you can use `HTMLDocumentBackend` for HTML pages:
+
+```python
+import urllib.request
+from io import BytesIO
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+url = "https://en.wikipedia.org/wiki/Duck"
+text = urllib.request.urlopen(url).read()
+in_doc = InputDocument(
+    path_or_stream=BytesIO(text),
+    format=InputFormat.HTML,
+    backend=HTMLDocumentBackend,
+    filename="duck.html",
+)
+backend = HTMLDocumentBackend(in_doc=in_doc, path_or_stream=BytesIO(text))
+result = backend.convert()
+print(result.export_to_markdown())
+```
+
 ## Chunking
 
 You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
diff --git a/mkdocs.yml b/mkdocs.yml
index 2b2e2da0..bbff382e 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -77,7 +77,8 @@ nav:
       - "Force full page OCR": examples/full_page_ocr.py
       - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
       - "Accelerator options": examples/run_with_accelerator.py
-      - "Simple translation": examples/translate.py
+      - "Simple translation": examples/translate.py   
+      - examples/backend_xml_rag.ipynb
     - ✂️ Chunking:
       - examples/hybrid_chunking.ipynb
     - 🤖 RAG with AI dev frameworks:
diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py
index 0a4440dc..e664ed34 100644
--- a/tests/test_backend_msexcel.py
+++ b/tests/test_backend_msexcel.py
@@ -2,13 +2,8 @@
 import os
 from pathlib import Path
 
-from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import (
-    ConversionResult,
-    InputDocument,
-    SectionHeaderItem,
-)
+from docling.datamodel.document import ConversionResult
 from docling.document_converter import DocumentConverter
 
 GENERATE = False
diff --git a/tests/test_backend_patent_uspto.py b/tests/test_backend_patent_uspto.py
index 466568ac..21bc88c5 100644
--- a/tests/test_backend_patent_uspto.py
+++ b/tests/test_backend_patent_uspto.py
@@ -3,23 +3,16 @@
 import json
 import logging
 import os
-import unittest
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 
 import pytest
-import yaml
 from docling_core.types import DoclingDocument
 from docling_core.types.doc import DocItemLabel, TableData, TextItem
 
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import (
-    ConversionResult,
-    InputDocument,
-    SectionHeaderItem,
-)
-from docling.document_converter import DocumentConverter
+from docling.datamodel.document import InputDocument
 
 GENERATE: bool = True
 DATA_PATH: Path = Path("./tests/data/uspto/")
diff --git a/tests/test_backend_pubmed.py b/tests/test_backend_pubmed.py
index 4476bd24..8481c3dd 100644
--- a/tests/test_backend_pubmed.py
+++ b/tests/test_backend_pubmed.py
@@ -1,5 +1,4 @@
 import json
-import logging
 import os
 from io import BytesIO
 from pathlib import Path