diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 92427d2..b228845 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -4,31 +4,31 @@ repos:
     hooks:
       - id: black
         name: Black
-        entry: poetry run black docling_langchain test
+        entry: poetry run black langchain_docling test
         pass_filenames: false
         language: system
         files: '\.py$'
       - id: isort
         name: isort
-        entry: poetry run isort docling_langchain test
+        entry: poetry run isort langchain_docling test
         pass_filenames: false
         language: system
         files: '\.py$'
       - id: autoflake
         name: autoflake
-        entry: poetry run autoflake docling_langchain test
+        entry: poetry run autoflake langchain_docling test
         pass_filenames: false
         language: system
         files: '\.py$'
       - id: mypy
         name: MyPy
-        entry: poetry run mypy docling_langchain test
+        entry: poetry run mypy langchain_docling test
         pass_filenames: false
         language: system
         files: '\.py$'
       - id: flake8
         name: Flake8
-        entry: poetry run flake8 docling_langchain
+        entry: poetry run flake8 langchain_docling
         pass_filenames: false
         language: system
         files: '\.py$'
diff --git a/README.md b/README.md
index baae83f..986d146 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Docling LangChain integration
 
-[![PyPI version](https://img.shields.io/pypi/v/docling-langchain)](https://pypi.org/project/docling-langchain/)
-[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling-langchain)](https://pypi.org/project/docling-langchain/)
+[![PyPI version](https://img.shields.io/pypi/v/langchain-docling)](https://pypi.org/project/langchain-docling/)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/langchain-docling)](https://pypi.org/project/langchain-docling/)
 [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
@@ -14,9 +14,9 @@ A [Docling](https://github.com/DS4SD/docling) integration for
 
 ## Installation
 
-Simply install `docling-langchain` from your package manager, e.g. pip:
+Simply install `langchain-docling` from your package manager, e.g. pip:
 ```bash
-pip install docling-langchain
+pip install langchain-docling
 ```
 
 ## Usage
@@ -24,7 +24,7 @@ pip install docling-langchain
 Basic usage looks as follows:
 
 ```python
-from docling_langchain import DoclingLoader
+from langchain_docling import DoclingLoader
 
 FILE_PATH = ["https://arxiv.org/pdf/2408.09869"]  # Docling Technical Report
 
diff --git a/examples/docling_loader.ipynb b/examples/docling_loader.ipynb
index cafb16a..e93ce2d 100644
--- a/examples/docling_loader.ipynb
+++ b/examples/docling_loader.ipynb
@@ -30,9 +30,8 @@
    "metadata": {},
    "source": [
     "This example leverages the\n",
-    "[LangChain Docling integration](../../integrations/langchain/), along with\n",
-    "Milvus-based document store and retriever instances, as well as sentence-transformers\n",
-    "embeddings.\n",
+    "[LangChain Docling integration](../../integrations/langchain/), along with a Milvus\n",
+    "vector store, as well as sentence-transformers embeddings.\n",
     "\n",
     "The presented `DoclingLoader` component enables you to:\n",
     "- use various document types in your LLM applications with ease and speed, and\n",
@@ -44,8 +43,8 @@
     "- `ExportType.DOC_CHUNKS` (default): if you want to have each input document chunked and\n",
     "  to then capture each individual chunk as a separate LangChain document downstream.\n",
     "\n",
-    "The example allows to explore both modes via parameter `EXPORT_TYPE`; depending on the\n",
-    "value set, the ingestion and RAG pipelines are then set up accordingly."
+    "The example allows exploring both modes via parameter `EXPORT_TYPE`; depending on the\n",
+    "value set, the example pipeline is then set up accordingly."
    ]
   },
   {
@@ -78,8 +77,7 @@
     }
    ],
    "source": [
-    "# %pip install -q --progress-bar off --no-warn-conflicts docling-langchain langchain-text-splitters\n",
-    "%pip install -q --progress-bar off --no-warn-conflicts langchain-text-splitters"
+    "%pip install -q --progress-bar off --no-warn-conflicts langchain-docling langchain-core langchain-huggingface langchain_milvus langchain python-dotenv"
    ]
   },
   {
@@ -104,7 +102,7 @@
     "from dotenv import load_dotenv\n",
     "from langchain_core.prompts import PromptTemplate\n",
     "\n",
-    "from docling_langchain.loader import ExportType\n",
+    "from langchain_docling.loader import ExportType\n",
     "\n",
     "\n",
     "def _get_env_from_colab_or_os(key):\n",
@@ -161,11 +159,14 @@
     }
    ],
    "source": [
-    "from docling_langchain import DoclingLoader\n",
+    "from docling.chunking import HybridChunker\n",
+    "\n",
+    "from langchain_docling import DoclingLoader\n",
     "\n",
     "loader = DoclingLoader(\n",
     "    file_path=FILE_PATH,\n",
     "    export_type=EXPORT_TYPE,\n",
+    "    chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),\n",
     ")\n",
     "\n",
     "docs = loader.load()"
@@ -257,6 +258,7 @@
     "vectorstore = Milvus.from_documents(\n",
     "    documents=splits,\n",
     "    embedding=embedding,\n",
+    "    collection_name=\"docling_demo\",\n",
     "    connection_args={\"uri\": milvus_uri},\n",
     "    index_params={\"index_type\": \"FLAT\"},\n",
     "    drop_old=True,\n",
@@ -274,6 +276,27 @@
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chains import create_retrieval_chain\n",
+    "from langchain.chains.combine_documents import create_stuff_documents_chain\n",
+    "from langchain_huggingface import HuggingFaceEndpoint\n",
+    "\n",
+    "retriever = vectorstore.as_retriever(search_kwargs={\"k\": TOP_K})\n",
+    "llm = HuggingFaceEndpoint(\n",
+    "    repo_id=GEN_MODEL_ID,\n",
+    "    huggingfacehub_api_token=HF_TOKEN,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def clip_text(text, threshold=100):\n",
+    "    return f\"{text[:threshold]}...\" if len(text) > threshold else text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -283,53 +306,36 @@
       "Which are the main AI models in Docling?\n",
       "\n",
       "Answer:\n",
-      "\"The main AI models in Docling are:\\n1. A layout analysis model, an accurate object-detector for page elements.\\n2. TableFormer, a state-of-the-art table structure recognition model.\"\n",
+      "Docling currently supports two main AI models, namely a layout analysis model and a table structure recognition model. The first model is a layout analysis model, an accurate object-detector for page ...\n",
       "\n",
       "Source 1:\n",
-      "  text: \"3.2 AI models\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a lay...\"\n",
+      "  text: \"3.2 AI models\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure re...\"\n",
       "  dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/50', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 3, 'bbox': {'l': 108.0, 't': 405.1419982910156, 'r': 504.00299072265625, 'b': 330.7799987792969, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 608]}]}], 'headings': ['3.2 AI models'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
       "  source: https://arxiv.org/pdf/2408.09869\n",
       "\n",
       "Source 2:\n",
-      "  text: \"3 Processing pipeline\\nDocling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieve...\"\n",
+      "  text: \"3 Processing pipeline\\nDocling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support ...\"\n",
       "  dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/26', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 2, 'bbox': {'l': 108.0, 't': 273.01800537109375, 'r': 504.00299072265625, 'b': 176.83799743652344, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 796]}]}], 'headings': ['3 Processing pipeline'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
       "  source: https://arxiv.org/pdf/2408.09869\n",
       "\n",
       "Source 3:\n",
-      "  text: \"6 Future work and contributions\\nDocling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-clas...\"\n",
+      "  text: \"6 Future work and contributions\\nDocling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of ...\"\n",
       "  dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/76', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 108.0, 't': 322.468994140625, 'r': 504.00299072265625, 'b': 259.0169982910156, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 543]}]}, {'self_ref': '#/texts/77', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 108.0, 't': 251.6540069580078, 'r': 504.00299072265625, 'b': 198.99200439453125, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 402]}]}], 'headings': ['6 Future work and contributions'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
-      "  source: https://arxiv.org/pdf/2408.09869\n",
-      "\n",
-      "Source 4:\n",
-      "  text: \"3.3 Assembly\\nIn the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliar...\"\n",
-      "  dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/62', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 4, 'bbox': {'l': 108.0, 't': 506.08099365234375, 'r': 504.00299072265625, 'b': 431.718994140625, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 622]}]}], 'headings': ['3.3 Assembly'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
       "  source: https://arxiv.org/pdf/2408.09869\n"
      ]
     }
    ],
    "source": [
-    "from langchain.chains import create_retrieval_chain\n",
-    "from langchain.chains.combine_documents import create_stuff_documents_chain\n",
-    "from langchain_huggingface import HuggingFaceEndpoint\n",
-    "\n",
-    "llm = HuggingFaceEndpoint(repo_id=GEN_MODEL_ID)\n",
-    "\n",
-    "\n",
-    "def clip_text(text, threshold=100):\n",
-    "    return f\"{text[:threshold]}...\" if len(text) > threshold else text\n",
-    "\n",
-    "\n",
-    "retriever = vectorstore.as_retriever()\n",
     "question_answer_chain = create_stuff_documents_chain(llm, PROMPT)\n",
     "rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n",
     "resp_dict = rag_chain.invoke({\"input\": QUESTION})\n",
     "\n",
-    "answer = clip_text(resp_dict[\"answer\"], threshold=200)\n",
-    "print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{json.dumps(answer)}\")\n",
+    "clipped_answer = clip_text(resp_dict[\"answer\"], threshold=200)\n",
+    "print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
     "for i, doc in enumerate(resp_dict[\"context\"]):\n",
     "    print()\n",
     "    print(f\"Source {i+1}:\")\n",
-    "    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=200))}\")\n",
+    "    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
     "    for key in doc.metadata:\n",
     "        if key != \"pk\":\n",
     "            val = doc.metadata.get(key)\n",
@@ -361,7 +367,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.12.8"
   }
  },
 "nbformat": 4,
diff --git a/docling_langchain/__init__.py b/langchain_docling/__init__.py
similarity index 66%
rename from docling_langchain/__init__.py
rename to langchain_docling/__init__.py
index 438d305..f46261d 100644
--- a/docling_langchain/__init__.py
+++ b/langchain_docling/__init__.py
@@ -4,4 +4,4 @@
 #
 """Docling LangChain package."""
 
-from docling_langchain.loader import DoclingLoader
+from langchain_docling.loader import DoclingLoader
diff --git a/docling_langchain/loader.py b/langchain_docling/loader.py
similarity index 100%
rename from docling_langchain/loader.py
rename to langchain_docling/loader.py
diff --git a/docling_langchain/py.typed b/langchain_docling/py.typed
similarity index 100%
rename from docling_langchain/py.typed
rename to langchain_docling/py.typed
diff --git a/pyproject.toml b/pyproject.toml
index e195cdc..a35e460 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "docling-langchain"
+name = "langchain-docling"
 version = "0.1.0" # DO NOT EDIT, updated automatically
 description = "Docling LangChain integration"
 authors = ["Panos Vagenas "]
@@ -16,7 +16,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Programming Language :: Python :: 3"
 ]
-packages = [{include = "docling_langchain"}]
+packages = [{include = "langchain_docling"}]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.13"
diff --git a/test/test_loader.py b/test/test_loader.py
index 0130e30..d68fe13 100644
--- a/test/test_loader.py
+++ b/test/test_loader.py
@@ -5,7 +5,7 @@
 from docling.chunking import HierarchicalChunker
 from docling.datamodel.document import DoclingDocument as DLDocument
 
-from docling_langchain.loader import DoclingLoader, ExportType
+from langchain_docling.loader import DoclingLoader, ExportType
 
 in_json_str = json.dumps(
     {
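
Taken together, the renamed package is driven as follows. This is a minimal sketch stitched from the README and notebook snippets in this diff; the embedding model ID passed as `tokenizer` is an illustrative placeholder, not a value taken from the diff:

```python
from docling.chunking import HybridChunker

from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

FILE_PATH = ["https://arxiv.org/pdf/2408.09869"]  # Docling Technical Report

loader = DoclingLoader(
    file_path=FILE_PATH,
    export_type=ExportType.DOC_CHUNKS,  # default mode, per the example notebook
    # Placeholder embedding model ID; the notebook passes its EMBED_MODEL_ID here.
    chunker=HybridChunker(tokenizer="sentence-transformers/all-MiniLM-L6-v2"),
)
docs = loader.load()  # one LangChain document per chunk in DOC_CHUNKS mode
```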