From 7a45b92078b3a9fdd8f0650002eddc03e9d780af Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Thu, 21 Nov 2024 17:23:04 +0100 Subject: [PATCH] docs: add DocETL, Kotaemon, spaCy integrations; minor docs improvements (#408) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docs/concepts/index.md | 2 +- docs/index.md | 3 ++- docs/integrations/.template.md | 9 +++++++++ docs/integrations/data_prep_kit.md | 10 +++++----- docs/integrations/docetl.md | 9 +++++++++ docs/integrations/kotaemon.md | 9 +++++++++ docs/integrations/llamaindex.md | 18 ++++++++---------- docs/integrations/spacy.md | 9 +++++++++ mkdocs.yml | 4 ++++ 9 files changed, 56 insertions(+), 17 deletions(-) create mode 100644 docs/integrations/.template.md create mode 100644 docs/integrations/docetl.md create mode 100644 docs/integrations/kotaemon.md create mode 100644 docs/integrations/spacy.md diff --git a/docs/concepts/index.md b/docs/concepts/index.md index 54f24b64..b48345e2 100644 --- a/docs/concepts/index.md +++ b/docs/concepts/index.md @@ -1 +1 @@ -Use the navigation on the left to browse some core Docling concepts. +Use the navigation on the left to browse through some core Docling concepts. 
diff --git a/docs/index.md b/docs/index.md index efafc2b0..0224f29b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,13 +7,14 @@ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869) [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/) -![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/) [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT) +[![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) Docling parses documents and exports them to the desired format with ease and speed. diff --git a/docs/integrations/.template.md b/docs/integrations/.template.md new file mode 100644 index 00000000..60fb4d8d --- /dev/null +++ b/docs/integrations/.template.md @@ -0,0 +1,9 @@ +Docling is available as a plugin for [EXAMPLE](https://example.com). + +- 💻 [GitHub][github] +- 📖 [Docs][docs] +- 📦 [PyPI][pypi] + +[github]: https://github.com/... +[docs]: https://... +[pypi]: https://pypi.org/project/... 
diff --git a/docs/integrations/data_prep_kit.md b/docs/integrations/data_prep_kit.md index 5885e8ed..c9b3755a 100644 --- a/docs/integrations/data_prep_kit.md +++ b/docs/integrations/data_prep_kit.md @@ -1,13 +1,13 @@ ## Get started -Docling is used by the [Data Prep Kit \[↗\]](https://ibm.github.io/data-prep-kit/) open-source toolkit for preparing unstructured data for LLM application development ranging from laptop scale to datacenter scale. +Docling is used by the [Data Prep Kit](https://ibm.github.io/data-prep-kit/) open-source toolkit for preparing unstructured data for LLM application development ranging from laptop scale to datacenter scale. Below you find the Data Prep Kit modules powered by Docling. ## PDF ingestion to Parquet -- 💻 [GitHub \[↗\]](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/pdf2parquet) -- 📖 [API docs \[↗\]](https://ibm.github.io/data-prep-kit/transforms/language/pdf2parquet/python/) +- 💻 [PDF-to-Parquet GitHub](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/pdf2parquet) +- 📖 [PDF-to-Parquet Docs](https://ibm.github.io/data-prep-kit/transforms/language/pdf2parquet/python/) ## Document chunking -- 💻 [GitHub \[↗\]](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_chunk) -- 📖 [API docs \[↗\]](https://ibm.github.io/data-prep-kit/transforms/language/doc_chunk/python/) +- 💻 [Doc Chunking GitHub](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_chunk) +- 📖 [Doc Chunking Docs](https://ibm.github.io/data-prep-kit/transforms/language/doc_chunk/python/) diff --git a/docs/integrations/docetl.md b/docs/integrations/docetl.md new file mode 100644 index 00000000..a03a4044 --- /dev/null +++ b/docs/integrations/docetl.md @@ -0,0 +1,9 @@ +Docling is available as a file conversion method in [DocETL](https://github.com/ucbepic/docetl): + +- 💻 [DocETL GitHub][github] +- 📖 [DocETL Docs][docs] +- 📦 [DocETL PyPI][pypi] + +[github]: https://github.com/ucbepic/docetl +[docs]: 
https://ucbepic.github.io/docetl/ +[pypi]: https://pypi.org/project/docetl/ diff --git a/docs/integrations/kotaemon.md b/docs/integrations/kotaemon.md new file mode 100644 index 00000000..14cfc130 --- /dev/null +++ b/docs/integrations/kotaemon.md @@ -0,0 +1,9 @@ +Docling is available in [Kotaemon](https://cinnamon.github.io/kotaemon/) as the `DoclingReader` loader: + +- 💻 [Kotaemon GitHub][github] +- 📖 [DoclingReader Docs][docs] +- ⚙️ [Docling Setup in Kotaemon][setup] + +[github]: https://github.com/Cinnamon/kotaemon +[docs]: https://cinnamon.github.io/kotaemon/reference/loaders/docling_loader/ +[setup]: https://cinnamon.github.io/kotaemon/development/?h=docling#setup-multimodal-document-parsing-ocr-table-parsing-figure-extraction diff --git a/docs/integrations/llamaindex.md b/docs/integrations/llamaindex.md index 41eb6e3d..dc61b34f 100644 --- a/docs/integrations/llamaindex.md +++ b/docs/integrations/llamaindex.md @@ -1,8 +1,8 @@ ## Get started -Docling is available as an official [LlamaIndex \[↗\]](https://docs.llamaindex.ai/) extension. +Docling is available as an official [LlamaIndex](https://docs.llamaindex.ai/) extension. -To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/). +To get started, check out the [step-by-step guide in LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/). ## Components @@ -10,16 +10,14 @@ To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https:/ Reads document files and uses Docling to populate LlamaIndex `Document` objects — either serializing Docling's data model (losslessly, e.g. as JSON) or exporting to a simplified format (lossily, e.g. as Markdown). 
-- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/readers/llama-index-readers-docling) -- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/readers/docling/) -- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-readers-docling/) -- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/readers/llama-index-readers-docling) +- 💻 [Docling Reader GitHub](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/readers/llama-index-readers-docling) +- 📖 [Docling Reader Docs](https://docs.llamaindex.ai/en/stable/api_reference/readers/docling/) +- 📦 [Docling Reader PyPI](https://pypi.org/project/llama-index-readers-docling/) ### Docling Node Parser Reads LlamaIndex `Document` objects populated in Docling's format by Docling Reader and, using its knowledge of the Docling format, parses them to LlamaIndex `Node` objects for downstream usage in LlamaIndex applications, e.g. as chunks for embedding. -- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/node_parser/llama-index-node-parser-docling) -- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/node_parser/docling/) -- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-node-parser-docling/) -- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/node_parser/llama-index-node-parser-docling) +- 💻 [Docling Node Parser GitHub](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/node_parser/llama-index-node-parser-docling) +- 📖 [Docling Node Parser Docs](https://docs.llamaindex.ai/en/stable/api_reference/node_parser/docling/) +- 📦 [Docling Node Parser PyPI](https://pypi.org/project/llama-index-node-parser-docling/) diff --git a/docs/integrations/spacy.md b/docs/integrations/spacy.md new file mode 100644 index 00000000..82a20890 --- /dev/null +++ b/docs/integrations/spacy.md @@ -0,0 +1,9 @@ +Docling is available in [spaCy](https://spacy.io/) as the "spaCy Layout" plugin: + +- 💻 
[SpacyLayout GitHub][github] +- 📖 [SpacyLayout Docs][docs] +- 📦 [SpacyLayout PyPI][pypi] + +[github]: https://github.com/explosion/spacy-layout +[docs]: https://github.com/explosion/spacy-layout?tab=readme-ov-file#readme +[pypi]: https://pypi.org/project/spacy-layout/ diff --git a/mkdocs.yml b/mkdocs.yml index 44a32039..43012b1c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -38,6 +38,7 @@ theme: - content.code.annotate - content.code.copy - announce.dismiss + - navigation.footer - navigation.tabs - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used - navigation.instant @@ -85,7 +86,10 @@ nav: - Integrations: - Integrations: integrations/index.md - "Data Prep Kit": integrations/data_prep_kit.md + - "DocETL": integrations/docetl.md + - "Kotaemon": integrations/kotaemon.md - "LlamaIndex 🦙": integrations/llamaindex.md + - "spaCy": integrations/spacy.md # - "LangChain 🦜🔗": integrations/langchain.md # - API reference: # - API reference: api_reference/index.md