docling-project · vagenas · Jan 8, 2025 · Jan 8, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -4,31 +4,31 @@ repos:
     hooks:
       - id: black
         name: Black
-        entry: poetry run black docling_langchain test
+        entry: poetry run black langchain_docling test
         pass_filenames: false
         language: system
         files: '\.py$'
       - id: isort
         name: isort
-        entry: poetry run isort docling_langchain test
+        entry: poetry run isort langchain_docling test
         pass_filenames: false
         language: system
         files: '\.py$'
       - id: autoflake
         name: autoflake
-        entry: poetry run autoflake docling_langchain test
+        entry: poetry run autoflake langchain_docling test
         pass_filenames: false
         language: system
         files: '\.py$'
       - id: mypy
         name: MyPy
-        entry: poetry run mypy docling_langchain test
+        entry: poetry run mypy langchain_docling test
         pass_filenames: false
         language: system
         files: '\.py$'
       - id: flake8
         name: Flake8
-        entry: poetry run flake8 docling_langchain
+        entry: poetry run flake8 langchain_docling
         pass_filenames: false
         language: system
         files: '\.py$'

diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 # Docling LangChain integration
 
-[![PyPI version](https://img.shields.io/pypi/v/docling-langchain)](https://pypi.org/project/docling-langchain/)
-[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling-langchain)](https://pypi.org/project/docling-langchain/)
+[![PyPI version](https://img.shields.io/pypi/v/langchain-docling)](https://pypi.org/project/langchain-docling/)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/langchain-docling)](https://pypi.org/project/langchain-docling/)
 [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
@@ -14,17 +14,17 @@ A [Docling](https://github.com/DS4SD/docling) integration for
 
 ## Installation
 
-Simply install `docling-langchain` from your package manager, e.g. pip:
+Simply install `langchain-docling` using your package manager, e.g. pip:
 ```bash
-pip install docling-langchain
+pip install langchain-docling
 ```
 
 ## Usage
 
 Basic usage looks as follows:
 
 ```python
-from docling_langchain import DoclingLoader
+from langchain_docling import DoclingLoader
 
 FILE_PATH = ["https://arxiv.org/pdf/2408.09869"]  # Docling Technical Report
 

diff --git a/examples/docling_loader.ipynb b/examples/docling_loader.ipynb
@@ -30,9 +30,8 @@
    "metadata": {},
    "source": [
     "This example leverages the\n",
-    "[LangChain Docling integration](../../integrations/langchain/), along with\n",
-    "Milvus-based document store and retriever instances, as well as sentence-transformers\n",
-    "embeddings.\n",
+    "[LangChain Docling integration](../../integrations/langchain/), along with a Milvus\n",
+    "vector store, as well as sentence-transformers embeddings.\n",
     "\n",
     "The presented `DoclingLoader` component enables you to:\n",
     "- use various document types in your LLM applications with ease and speed, and\n",
@@ -44,8 +43,8 @@
     "- `ExportType.DOC_CHUNKS` (default): if you want to have each input document chunked and\n",
     "  to then capture each individual chunk as a separate LangChain document downstream.\n",
     "\n",
-    "The example allows to explore both modes via parameter `EXPORT_TYPE`; depending on the\n",
-    "value set, the ingestion and RAG pipelines are then set up accordingly."
+    "The example allows exploring both modes via the `EXPORT_TYPE` parameter; depending on the\n",
+    "value set, the example pipeline is then set up accordingly."
    ]
   },
   {
@@ -78,8 +77,7 @@
     }
    ],
    "source": [
-    "# %pip install -q --progress-bar off --no-warn-conflicts docling-langchain langchain-text-splitters\n",
-    "%pip install -q --progress-bar off --no-warn-conflicts langchain-text-splitters"
+    "%pip install -q --progress-bar off --no-warn-conflicts langchain-docling langchain-core langchain-huggingface langchain_milvus langchain python-dotenv"
    ]
   },
   {
@@ -104,7 +102,7 @@
     "from dotenv import load_dotenv\n",
     "from langchain_core.prompts import PromptTemplate\n",
     "\n",
-    "from docling_langchain.loader import ExportType\n",
+    "from langchain_docling.loader import ExportType\n",
     "\n",
     "\n",
     "def _get_env_from_colab_or_os(key):\n",
@@ -161,11 +159,14 @@
     }
    ],
    "source": [
-    "from docling_langchain import DoclingLoader\n",
+    "from docling.chunking import HybridChunker\n",
+    "\n",
+    "from langchain_docling import DoclingLoader\n",
     "\n",
     "loader = DoclingLoader(\n",
     "    file_path=FILE_PATH,\n",
     "    export_type=EXPORT_TYPE,\n",
+    "    chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),\n",
     ")\n",
     "\n",
     "docs = loader.load()"
@@ -257,6 +258,7 @@
     "vectorstore = Milvus.from_documents(\n",
     "    documents=splits,\n",
     "    embedding=embedding,\n",
+    "    collection_name=\"docling_demo\",\n",
     "    connection_args={\"uri\": milvus_uri},\n",
     "    index_params={\"index_type\": \"FLAT\"},\n",
     "    drop_old=True,\n",
@@ -274,6 +276,27 @@
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chains import create_retrieval_chain\n",
+    "from langchain.chains.combine_documents import create_stuff_documents_chain\n",
+    "from langchain_huggingface import HuggingFaceEndpoint\n",
+    "\n",
+    "retriever = vectorstore.as_retriever(search_kwargs={\"k\": TOP_K})\n",
+    "llm = HuggingFaceEndpoint(\n",
+    "    repo_id=GEN_MODEL_ID,\n",
+    "    huggingfacehub_api_token=HF_TOKEN,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def clip_text(text, threshold=100):\n",
+    "    return f\"{text[:threshold]}...\" if len(text) > threshold else text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -283,53 +306,36 @@
       "Which are the main AI models in Docling?\n",
       "\n",
       "Answer:\n",
-      "\"The main AI models in Docling are:\\n1. A layout analysis model, an accurate object-detector for page elements.\\n2. TableFormer, a state-of-the-art table structure recognition model.\"\n",
+      "Docling currently supports two main AI models, namely a layout analysis model and a table structure recognition model. The first model is a layout analysis model, an accurate object-detector for page ...\n",
       "\n",
       "Source 1:\n",
-      "  text: \"3.2 AI models\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a lay...\"\n",
+      "  text: \"3.2 AI models\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure re...\"\n",
       "  dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/50', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 3, 'bbox': {'l': 108.0, 't': 405.1419982910156, 'r': 504.00299072265625, 'b': 330.7799987792969, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 608]}]}], 'headings': ['3.2 AI models'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
       "  source: https://arxiv.org/pdf/2408.09869\n",
       "\n",
       "Source 2:\n",
-      "  text: \"3 Processing pipeline\\nDocling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieve...\"\n",
+      "  text: \"3 Processing pipeline\\nDocling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support ...\"\n",
       "  dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/26', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 2, 'bbox': {'l': 108.0, 't': 273.01800537109375, 'r': 504.00299072265625, 'b': 176.83799743652344, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 796]}]}], 'headings': ['3 Processing pipeline'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
       "  source: https://arxiv.org/pdf/2408.09869\n",
       "\n",
       "Source 3:\n",
-      "  text: \"6 Future work and contributions\\nDocling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-clas...\"\n",
+      "  text: \"6 Future work and contributions\\nDocling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of ...\"\n",
       "  dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/76', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 108.0, 't': 322.468994140625, 'r': 504.00299072265625, 'b': 259.0169982910156, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 543]}]}, {'self_ref': '#/texts/77', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 108.0, 't': 251.6540069580078, 'r': 504.00299072265625, 'b': 198.99200439453125, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 402]}]}], 'headings': ['6 Future work and contributions'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
-      "  source: https://arxiv.org/pdf/2408.09869\n",
-      "\n",
-      "Source 4:\n",
-      "  text: \"3.3 Assembly\\nIn the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliar...\"\n",
-      "  dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/62', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 4, 'bbox': {'l': 108.0, 't': 506.08099365234375, 'r': 504.00299072265625, 'b': 431.718994140625, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 622]}]}], 'headings': ['3.3 Assembly'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
       "  source: https://arxiv.org/pdf/2408.09869\n"
      ]
     }
    ],
    "source": [
-    "from langchain.chains import create_retrieval_chain\n",
-    "from langchain.chains.combine_documents import create_stuff_documents_chain\n",
-    "from langchain_huggingface import HuggingFaceEndpoint\n",
-    "\n",
-    "llm = HuggingFaceEndpoint(repo_id=GEN_MODEL_ID)\n",
-    "\n",
-    "\n",
-    "def clip_text(text, threshold=100):\n",
-    "    return f\"{text[:threshold]}...\" if len(text) > threshold else text\n",
-    "\n",
-    "\n",
-    "retriever = vectorstore.as_retriever()\n",
     "question_answer_chain = create_stuff_documents_chain(llm, PROMPT)\n",
     "rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n",
     "resp_dict = rag_chain.invoke({\"input\": QUESTION})\n",
     "\n",
-    "answer = clip_text(resp_dict[\"answer\"], threshold=200)\n",
-    "print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{json.dumps(answer)}\")\n",
+    "clipped_answer = clip_text(resp_dict[\"answer\"], threshold=200)\n",
+    "print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
     "for i, doc in enumerate(resp_dict[\"context\"]):\n",
     "    print()\n",
     "    print(f\"Source {i+1}:\")\n",
-    "    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=200))}\")\n",
+    "    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
     "    for key in doc.metadata:\n",
     "        if key != \"pk\":\n",
     "            val = doc.metadata.get(key)\n",
@@ -361,7 +367,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.12.8"
   }
  },
  "nbformat": 4,

diff --git a/docling_langchain/__init__.py → langchain_docling/__init__.py b/docling_langchain/__init__.py → langchain_docling/__init__.py
@@ -4,4 +4,4 @@
 #
 """Docling LangChain package."""
 
-from docling_langchain.loader import DoclingLoader
+from langchain_docling.loader import DoclingLoader
diff --git a/docling_langchain/loader.py → langchain_docling/loader.py b/docling_langchain/loader.py → langchain_docling/loader.py
diff --git a/docling_langchain/py.typed → langchain_docling/py.typed b/docling_langchain/py.typed → langchain_docling/py.typed
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "docling-langchain"
+name = "langchain-docling"
 version = "0.1.0"  # DO NOT EDIT, updated automatically
 description = "Docling LangChain integration"
 authors = ["Panos Vagenas <[email protected]>"]
@@ -16,7 +16,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Programming Language :: Python :: 3"
 ]
-packages = [{include = "docling_langchain"}]
+packages = [{include = "langchain_docling"}]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.13"

diff --git a/test/test_loader.py b/test/test_loader.py
@@ -5,7 +5,7 @@
 from docling.chunking import HierarchicalChunker
 from docling.datamodel.document import DoclingDocument as DLDocument
 
-from docling_langchain.loader import DoclingLoader, ExportType
+from langchain_docling.loader import DoclingLoader, ExportType
 
 in_json_str = json.dumps(
     {