diff --git a/docling/chunking/__init__.py b/docling/chunking/__init__.py new file mode 100644 index 00000000..e72deb97 --- /dev/null +++ b/docling/chunking/__init__.py @@ -0,0 +1,12 @@ +# +# Copyright IBM Corp. 2024 - 2024 +# SPDX-License-Identifier: MIT +# + +from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta +from docling_core.transforms.chunker.hierarchical_chunker import ( + DocChunk, + DocMeta, + HierarchicalChunker, +) +from docling_core.transforms.chunker.hybrid_chunker import HybridChunker diff --git a/docs/assets/docling_arch.png b/docs/assets/docling_arch.png index cecda04b..69f45129 100644 Binary files a/docs/assets/docling_arch.png and b/docs/assets/docling_arch.png differ diff --git a/docs/assets/docling_arch.pptx b/docs/assets/docling_arch.pptx index 19c22172..f1f00a57 100644 Binary files a/docs/assets/docling_arch.pptx and b/docs/assets/docling_arch.pptx differ diff --git a/docs/concepts/chunking.md b/docs/concepts/chunking.md new file mode 100644 index 00000000..bed8bce3 --- /dev/null +++ b/docs/concepts/chunking.md @@ -0,0 +1,65 @@ +## Introduction + +A *chunker* is a Docling abstraction that, given a +[`DoclingDocument`](./docling_document.md), returns a stream of chunks, each of which +captures some part of the document as a string accompanied by respective metadata. + +To enable both flexibility for downstream applications and out-of-the-box utility, +Docling defines a chunker class hierarchy, providing a base type, `BaseChunker`, as well +as specific subclasses. + +Docling integration with gen AI frameworks like LlamaIndex is done using the +`BaseChunker` interface, so users can easily plug in any built-in, self-defined, or +third-party `BaseChunker` implementation. + +## Base Chunker + +The `BaseChunker` base class API defines that any chunker should provide the following: + +- `def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]`: + Returning the chunks for the provided document. 
+- `def serialize(self, chunk: BaseChunk) -> str`: + Returning the potentially metadata-enriched serialization of the chunk, typically + used to feed an embedding model (or generation model). + +## Hybrid Chunker + +!!! note "To access `HybridChunker`" + + - If you are using the `docling` package, you can import as follows: + ```python + from docling.chunking import HybridChunker + ``` + - If you are only using the `docling-core` package, make sure to install + the `chunking` extra, e.g. + ```shell + pip install 'docling-core[chunking]' + ``` + and then you + can import as follows: + ```python + from docling_core.transforms.chunker.hybrid_chunker import HybridChunker + ``` + +The `HybridChunker` implementation uses a hybrid approach, applying tokenization-aware +refinements on top of document-based [hierarchical](#hierarchical-chunker) chunking. + +More precisely: + +- it starts from the result of the hierarchical chunker and, based on the user-provided + tokenizer (typically to be aligned to the embedding model tokenizer), it: +- does one pass where it splits chunks only when needed (i.e. oversized w.r.t. +tokens), & +- another pass where it merges chunks only when possible (i.e. undersized successive +chunks with same headings & captions) — users can opt out of this step via param +`merge_peers` (by default `True`) + +👉 Example: see [here](../../examples/hybrid_chunking). + +## Hierarchical Chunker + +The `HierarchicalChunker` implementation uses the document structure information from +the [`DoclingDocument`](../docling_document) to create one chunk for each individual +detected document element, by default only merging together list items (can be opted out +via param `merge_list_items`). It also takes care of attaching all relevant document +metadata, including headers and captions. 
diff --git a/docs/examples/hybrid_chunking.ipynb b/docs/examples/hybrid_chunking.ipynb new file mode 100644 index 00000000..6f097a8f --- /dev/null +++ b/docs/examples/hybrid_chunking.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hybrid Chunking" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -qU 'docling-core[chunking]' sentence-transformers transformers lancedb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conversion" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from docling.document_converter import DocumentConverter\n", + "\n", + "DOC_SOURCE = \"../../tests/data/md/wiki.md\"\n", + "\n", + "doc = DocumentConverter().convert(source=DOC_SOURCE).document" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Chunking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice how `tokenizer` and `embed_model` further below are single-sourced from `EMBED_MODEL_ID`.\n", + "\n", + "This is important for making sure the chunker and the embedding model are using the same tokenizer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer\n", + "\n", + "from docling.chunking import HybridChunker\n", + "\n", + "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n", + "MAX_TOKENS = 64\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)\n", + "\n", + "chunker = HybridChunker(\n", + " tokenizer=tokenizer, # can also just pass model name instead of tokenizer instance\n", + " max_tokens=MAX_TOKENS, # optional, by default derived from `tokenizer`\n", + " # merge_peers=True, # optional, defaults to True\n", + ")\n", + "chunk_iter = chunker.chunk(dl_doc=doc)\n", + "chunks = list(chunk_iter)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Points to notice:\n", + "- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n", + "- Where needed, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n", + "- Where possible, we merge undersized peer chunks (see chunk 0)\n", + "- \"Tail\" chunks trailing right after merges may still be undersized (see chunk 8)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== 0 ===\n", + "chunk.text (55 tokens):\n", + "'International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 
175 countries.\\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Average.'\n", + "\n", + "=== 1 ===\n", + "chunk.text (45 tokens):\n", + "'IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.'\n", + "chunker.serialize(chunk) (46 tokens):\n", + "'IBM\\nIBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.'\n", + "\n", + "=== 2 ===\n", + "chunk.text (63 tokens):\n", + "'IBM was founded in 1911 as the Computing-Tabulating-Recording Company (CTR), a holding company of manufacturers of record-keeping and measuring systems. It was renamed \"International Business Machines\" in 1924 and soon became the leading manufacturer of punch-card tabulating systems. During the 1960s and 1970s, the'\n", + "chunker.serialize(chunk) (64 tokens):\n", + "'IBM\\nIBM was founded in 1911 as the Computing-Tabulating-Recording Company (CTR), a holding company of manufacturers of record-keeping and measuring systems. It was renamed \"International Business Machines\" in 1924 and soon became the leading manufacturer of punch-card tabulating systems. During the 1960s and 1970s, the'\n", + "\n", + "=== 3 ===\n", + "chunk.text (44 tokens):\n", + "\"IBM mainframe, exemplified by the System/360, was the world's dominant computing platform, with the company producing 80 percent of computers in the U.S. and 70 percent of computers worldwide.[11]\"\n", + "chunker.serialize(chunk) (45 tokens):\n", + "\"IBM\\nIBM mainframe, exemplified by the System/360, was the world's dominant computing platform, with the company producing 80 percent of computers in the U.S. 
and 70 percent of computers worldwide.[11]\"\n", + "\n", + "=== 4 ===\n", + "chunk.text (63 tokens):\n", + "'IBM debuted in the microcomputer market in 1981 with the IBM Personal Computer, ā€” its DOS software provided by Microsoft, ā€” which became the basis for the majority of personal computers to the present day.[12] The company later also found success in the portable space with the ThinkPad. Since the 1990s,'\n", + "chunker.serialize(chunk) (64 tokens):\n", + "'IBM\\nIBM debuted in the microcomputer market in 1981 with the IBM Personal Computer, ā€” its DOS software provided by Microsoft, ā€” which became the basis for the majority of personal computers to the present day.[12] The company later also found success in the portable space with the ThinkPad. Since the 1990s,'\n", + "\n", + "=== 5 ===\n", + "chunk.text (61 tokens):\n", + "'IBM has concentrated on computer services, software, supercomputers, and scientific research; it sold its microcomputer division to Lenovo in 2005. IBM continues to develop mainframes, and its supercomputers have consistently ranked among the most powerful in the world in the 21st century.'\n", + "chunker.serialize(chunk) (62 tokens):\n", + "'IBM\\nIBM has concentrated on computer services, software, supercomputers, and scientific research; it sold its microcomputer division to Lenovo in 2005. 
IBM continues to develop mainframes, and its supercomputers have consistently ranked among the most powerful in the world in the 21st century.'\n", + "\n", + "=== 6 ===\n", + "chunk.text (62 tokens):\n", + "\"As one of the world's oldest and largest technology companies, IBM has been responsible for several technological innovations, including the automated teller machine (ATM), dynamic random-access memory (DRAM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming\"\n", + "chunker.serialize(chunk) (63 tokens):\n", + "\"IBM\\nAs one of the world's oldest and largest technology companies, IBM has been responsible for several technological innovations, including the automated teller machine (ATM), dynamic random-access memory (DRAM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming\"\n", + "\n", + "=== 7 ===\n", + "chunk.text (63 tokens):\n", + "'language, and the UPC barcode. The company has made inroads in advanced computer chips, quantum computing, artificial intelligence, and data infrastructure.[13][14][15] IBM employees and alumni have won various recognitions for their scientific research and inventions, including six Nobel Prizes and six Turing'\n", + "chunker.serialize(chunk) (64 tokens):\n", + "'IBM\\nlanguage, and the UPC barcode. The company has made inroads in advanced computer chips, quantum computing, artificial intelligence, and data infrastructure.[13][14][15] IBM employees and alumni have won various recognitions for their scientific research and inventions, including six Nobel Prizes and six Turing'\n", + "\n", + "=== 8 ===\n", + "chunk.text (5 tokens):\n", + "'Awards.[16]'\n", + "chunker.serialize(chunk) (6 tokens):\n", + "'IBM\\nAwards.[16]'\n", + "\n", + "=== 9 ===\n", + "chunk.text (56 tokens):\n", + "'IBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. 
Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine'\n", + "chunker.serialize(chunk) (60 tokens):\n", + "'IBM\\n1910sā€“1950s\\nIBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine'\n", + "\n", + "=== 10 ===\n", + "chunk.text (60 tokens):\n", + "\"(1889);[19] and Willard Bundy invented a time clock to record workers' arrival and departure times on a paper tape (1889).[20] On June 16, 1911, their four companies were amalgamated in New York State by Charles Ranlett Flint forming a fifth company, the\"\n", + "chunker.serialize(chunk) (64 tokens):\n", + "\"IBM\\n1910sā€“1950s\\n(1889);[19] and Willard Bundy invented a time clock to record workers' arrival and departure times on a paper tape (1889).[20] On June 16, 1911, their four companies were amalgamated in New York State by Charles Ranlett Flint forming a fifth company, the\"\n", + "\n", + "=== 11 ===\n", + "chunk.text (59 tokens):\n", + "'Computing-Tabulating-Recording Company (CTR) based in Endicott, New York.[1][21] The five companies had 1,300 employees and offices and plants in Endicott and Binghamton, New York; Dayton, Ohio; Detroit, Michigan; Washington,'\n", + "chunker.serialize(chunk) (63 tokens):\n", + "'IBM\\n1910sā€“1950s\\nComputing-Tabulating-Recording Company (CTR) based in Endicott, New York.[1][21] The five companies had 1,300 employees and offices and plants in Endicott and Binghamton, New York; Dayton, Ohio; Detroit, Michigan; Washington,'\n", + "\n", + "=== 12 ===\n", + "chunk.text (13 tokens):\n", + "'D.C.; and Toronto, Canada.[22]'\n", + "chunker.serialize(chunk) (17 tokens):\n", + "'IBM\\n1910sā€“1950s\\nD.C.; and Toronto, Canada.[22]'\n", + "\n", + "=== 13 ===\n", 
+ "chunk.text (60 tokens):\n", + "'Collectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called'\n", + "chunker.serialize(chunk) (64 tokens):\n", + "'IBM\\n1910sā€“1950s\\nCollectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called'\n", + "\n", + "=== 14 ===\n", + "chunk.text (59 tokens):\n", + "\"on Flint and, in 1914, was offered a position at CTR.[23] Watson joined CTR as general manager and then, 11 months later, was made President when antitrust cases relating to his time at NCR were resolved.[24] Having learned Patterson's pioneering business\"\n", + "chunker.serialize(chunk) (63 tokens):\n", + "\"IBM\\n1910sā€“1950s\\non Flint and, in 1914, was offered a position at CTR.[23] Watson joined CTR as general manager and then, 11 months later, was made President when antitrust cases relating to his time at NCR were resolved.[24] Having learned Patterson's pioneering business\"\n", + "\n", + "=== 15 ===\n", + "chunk.text (23 tokens):\n", + "\"practices, Watson proceeded to put the stamp of NCR onto CTR's companies.[23]:\\n105\"\n", + "chunker.serialize(chunk) (27 tokens):\n", + "\"IBM\\n1910sā€“1950s\\npractices, Watson proceeded to put the stamp of NCR onto CTR's companies.[23]:\\n105\"\n", + "\n", + "=== 16 ===\n", + "chunk.text (59 tokens):\n", + "'He implemented sales conventions, \"generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every 
worker\".[25][26] His favorite slogan,'\n", + "chunker.serialize(chunk) (63 tokens):\n", + "'IBM\\n1910sā€“1950s\\nHe implemented sales conventions, \"generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker\".[25][26] His favorite slogan,'\n", + "\n", + "=== 17 ===\n", + "chunk.text (60 tokens):\n", + "'\"THINK\", became a mantra for each company\\'s employees.[25] During Watson\\'s first four years, revenues reached $9 million ($158 million today) and the company\\'s operations expanded to Europe, South America, Asia and Australia.[25] Watson never liked the'\n", + "chunker.serialize(chunk) (64 tokens):\n", + "'IBM\\n1910sā€“1950s\\n\"THINK\", became a mantra for each company\\'s employees.[25] During Watson\\'s first four years, revenues reached $9 million ($158 million today) and the company\\'s operations expanded to Europe, South America, Asia and Australia.[25] Watson never liked the'\n", + "\n", + "=== 18 ===\n", + "chunk.text (57 tokens):\n", + "'clumsy hyphenated name \"Computing-Tabulating-Recording Company\" and chose to replace it with the more expansive title \"International Business Machines\" which had previously been used as the name of CTR\\'s Canadian Division;[27] the name was changed on February 14,'\n", + "chunker.serialize(chunk) (61 tokens):\n", + "'IBM\\n1910sā€“1950s\\nclumsy hyphenated name \"Computing-Tabulating-Recording Company\" and chose to replace it with the more expansive title \"International Business Machines\" which had previously been used as the name of CTR\\'s Canadian Division;[27] the name was changed on February 14,'\n", + "\n", + "=== 19 ===\n", + "chunk.text (21 tokens):\n", + "'1924.[28] By 1933, most of the subsidiaries had been merged into one company, IBM.'\n", + "chunker.serialize(chunk) (25 tokens):\n", + "'IBM\\n1910sā€“1950s\\n1924.[28] By 1933, most of the subsidiaries had 
been merged into one company, IBM.'\n", + "\n", + "=== 20 ===\n", + "chunk.text (22 tokens):\n", + "'In 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.'\n", + "chunker.serialize(chunk) (26 tokens):\n", + "'IBM\\n1960sā€“1980s\\nIn 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.'\n", + "\n" + ] + } + ], + "source": [ + "for i, chunk in enumerate(chunks):\n", + " print(f\"=== {i} ===\")\n", + " txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n", + " print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n", + "\n", + " ser_txt = chunker.serialize(chunk=chunk)\n", + " ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))\n", + " print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n", + "\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Vector Retrieval" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "\n", + "embed_model = SentenceTransformer(EMBED_MODEL_ID)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vectortextheadingscaptions_distance
0[-0.1269039, -0.01948185, -0.07718097, -0.1116...language, and the UPC barcode. The company has...[IBM]None1.164613
1[-0.10198064, 0.0055981805, -0.05095279, -0.13...IBM originated with several technological inno...[IBM, 1910sā€“1950s]None1.245144
2[-0.057121325, -0.034115084, -0.018113216, -0....As one of the world's oldest and largest techn...[IBM]None1.355586
3[-0.04429054, -0.058111433, -0.009330196, -0.0...IBM is the largest industrial research organiz...[IBM]None1.398617
4[-0.11920792, 0.053496413, -0.042391937, -0.03...Awards.[16][IBM]None1.446295
\n", + "
" + ], + "text/plain": [ + " vector \\\n", + "0 [-0.1269039, -0.01948185, -0.07718097, -0.1116... \n", + "1 [-0.10198064, 0.0055981805, -0.05095279, -0.13... \n", + "2 [-0.057121325, -0.034115084, -0.018113216, -0.... \n", + "3 [-0.04429054, -0.058111433, -0.009330196, -0.0... \n", + "4 [-0.11920792, 0.053496413, -0.042391937, -0.03... \n", + "\n", + " text headings \\\n", + "0 language, and the UPC barcode. The company has... [IBM] \n", + "1 IBM originated with several technological inno... [IBM, 1910sā€“1950s] \n", + "2 As one of the world's oldest and largest techn... [IBM] \n", + "3 IBM is the largest industrial research organiz... [IBM] \n", + "4 Awards.[16] [IBM] \n", + "\n", + " captions _distance \n", + "0 None 1.164613 \n", + "1 None 1.245144 \n", + "2 None 1.355586 \n", + "3 None 1.398617 \n", + "4 None 1.446295 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pathlib import Path\n", + "from tempfile import mkdtemp\n", + "\n", + "import lancedb\n", + "\n", + "\n", + "def make_lancedb_index(db_uri, index_name, chunks, embedding_model):\n", + " db = lancedb.connect(db_uri)\n", + " data = []\n", + " for chunk in chunks:\n", + " embeddings = embedding_model.encode(chunker.serialize(chunk=chunk))\n", + " data_item = {\n", + " \"vector\": embeddings,\n", + " \"text\": chunk.text,\n", + " \"headings\": chunk.meta.headings,\n", + " \"captions\": chunk.meta.captions,\n", + " }\n", + " data.append(data_item)\n", + " tbl = db.create_table(index_name, data=data, exist_ok=True)\n", + " return tbl\n", + "\n", + "\n", + "db_uri = str(Path(mkdtemp()) / \"docling.db\")\n", + "index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n", + "\n", + "sample_query = \"invent\"\n", + "sample_embedding = embed_model.encode(sample_query)\n", + "results = index.search(sample_embedding).limit(5)\n", + "\n", + "results.to_pandas()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/mkdocs.yml b/mkdocs.yml index 687ae6d6..81abcc6a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -63,7 +63,7 @@ nav: - Concepts: concepts/index.md - Architecture: concepts/architecture.md - Docling Document: concepts/docling_document.md - # - Chunking: concepts/chunking.md + - Chunking: concepts/chunking.md - Examples: - Examples: examples/index.md - Conversion: @@ -80,7 +80,8 @@ nav: - "RAG with LlamaIndex šŸ¦™": examples/rag_llamaindex.ipynb - "RAG with LangChain šŸ¦œšŸ”—": examples/rag_langchain.ipynb - "Hybrid RAG with Qdrant": examples/hybrid_rag_qdrant.ipynb - # - Chunking: + - Chunking: + - "Hybrid chunking": examples/hybrid_chunking.ipynb # - Chunking: examples/chunking.md # - CLI: # - CLI: examples/cli.md diff --git a/poetry.lock b/poetry.lock index becf1b33..2398567a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -890,13 +890,13 @@ files = [ [[package]] name = "docling-core" -version = "2.7.1" +version = "2.8.0" description = "A python library to define and validate data types in Docling." 
optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_core-2.7.1-py3-none-any.whl", hash = "sha256:a3d3df9ed8755f98acfdcc8960e8d7b1eaf7dada9aded644e2487d43dc418ce5"}, - {file = "docling_core-2.7.1.tar.gz", hash = "sha256:563b4f3da2d7e4fd70ba4ce0e418e4898478f452d917665cdcd4cdde17befa55"}, + {file = "docling_core-2.8.0-py3-none-any.whl", hash = "sha256:392aad49e25f5fd1d279410118fbd91d9aaab9dd92d043738d20c10c57193d86"}, + {file = "docling_core-2.8.0.tar.gz", hash = "sha256:6ac5cbc6f0abcbdf599c2a4b1a3f7b52fd8baebf3c4ebf94d7b7e2ee061a654e"}, ] [package.dependencies] @@ -906,9 +906,14 @@ pandas = ">=2.1.4,<3.0.0" pillow = ">=10.3.0,<11.0.0" pydantic = ">=2.6.0,<2.10.0 || >2.10.0,<2.10.1 || >2.10.1,<2.10.2 || >2.10.2,<3.0.0" pyyaml = ">=5.1,<7.0.0" +semchunk = {version = ">=2.2.0,<3.0.0", optional = true, markers = "extra == \"chunking\""} tabulate = ">=0.9.0,<0.10.0" +transformers = {version = ">=4.34.0,<5.0.0", optional = true, markers = "extra == \"chunking\""} typing-extensions = ">=4.12.2,<5.0.0" +[package.extras] +chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"] + [[package]] name = "docling-ibm-models" version = "2.0.7" @@ -2823,6 +2828,32 @@ files = [ {file = "more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef"}, ] +[[package]] +name = "mpire" +version = "2.10.2" +description = "A Python package for easy multiprocessing, but faster than multiprocessing" +optional = false +python-versions = "*" +files = [ + {file = "mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb"}, + {file = "mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97"}, +] + +[package.dependencies] +multiprocess = [ + {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""}, + {version = ">=0.70.15", optional = true, markers = 
"python_version >= \"3.11\" and extra == \"dill\""}, +] +pygments = ">=2.0" +pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""} +tqdm = ">=4.27" + +[package.extras] +dashboard = ["flask"] +dill = ["multiprocess", "multiprocess (>=0.70.15)"] +docs = ["docutils (==0.17.1)", "sphinx (==3.2.1)", "sphinx-autodoc-typehints (==1.11.0)", "sphinx-rtd-theme (==0.5.0)", "sphinx-versions (==1.0.1)", "sphinxcontrib-images (==0.9.2)"] +testing = ["ipywidgets", "multiprocess", "multiprocess (>=0.70.15)", "numpy", "pywin32 (>=301)", "rich"] + [[package]] name = "mpmath" version = "1.3.0" @@ -6040,6 +6071,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -6120,6 +6156,21 @@ files = [ cryptography = ">=2.0" jeepney = ">=0.6" +[[package]] +name = "semchunk" +version = "2.2.0" +description = "A fast and lightweight Python library for splitting text into semantically meaningful chunks." +optional = false +python-versions = ">=3.9" +files = [ + {file = "semchunk-2.2.0-py3-none-any.whl", hash = "sha256:7db19ca90ddb48f99265e789e07a7bb111ae25185f9cc3d44b94e1e61b9067fc"}, + {file = "semchunk-2.2.0.tar.gz", hash = "sha256:4de761ce614036fa3bea61adbe47e3ade7c96ac9b062f223b3ac353dbfd26743"}, +] + +[package.dependencies] +mpire = {version = "*", extras = ["dill"]} +tqdm = "*" + [[package]] name = "semver" version = "2.13.0" @@ -7561,4 +7612,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "5320329d9899ce8577ab91f634df8568f97ab3a9d6d27c06ceba4cffca255533" +content-hash = "c397fcd5c719605f28352cd6e0a3828f082e9684ba64558539e0c3173bdd1fc5" diff --git a/pyproject.toml b/pyproject.toml index 8d1a8dcf..d0b2a3fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ packages = [{include = "docling"}] # actual dependencies: ###################### python = "^3.9" -docling-core = "^2.7.1" +docling-core = { version = "^2.8.0", extras = ["chunking"] } pydantic = "^2.0.0" docling-ibm-models = "^2.0.6" deepsearch-glm = "^0.26.1" diff --git a/tests/data/md/wiki.md b/tests/data/md/wiki.md new file mode 100644 index 00000000..134e456e --- /dev/null +++ b/tests/data/md/wiki.md @@ -0,0 +1,23 @@ +# IBM + +International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, 
is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. + +It is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Average. + +IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021. + +IBM was founded in 1911 as the Computing-Tabulating-Recording Company (CTR), a holding company of manufacturers of record-keeping and measuring systems. It was renamed "International Business Machines" in 1924 and soon became the leading manufacturer of punch-card tabulating systems. During the 1960s and 1970s, the IBM mainframe, exemplified by the System/360, was the world's dominant computing platform, with the company producing 80 percent of computers in the U.S. and 70 percent of computers worldwide.[11] + +IBM debuted in the microcomputer market in 1981 with the IBM Personal Computer, — its DOS software provided by Microsoft, — which became the basis for the majority of personal computers to the present day.[12] The company later also found success in the portable space with the ThinkPad. Since the 1990s, IBM has concentrated on computer services, software, supercomputers, and scientific research; it sold its microcomputer division to Lenovo in 2005. IBM continues to develop mainframes, and its supercomputers have consistently ranked among the most powerful in the world in the 21st century. + +As one of the world's oldest and largest technology companies, IBM has been responsible for several technological innovations, including the automated teller machine (ATM), dynamic random-access memory (DRAM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming language, and the UPC barcode. 
The company has made inroads in advanced computer chips, quantum computing, artificial intelligence, and data infrastructure.[13][14][15] IBM employees and alumni have won various recognitions for their scientific research and inventions, including six Nobel Prizes and six Turing Awards.[16] + +## 1910s–1950s + +IBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine (1889);[19] and Willard Bundy invented a time clock to record workers' arrival and departure times on a paper tape (1889).[20] On June 16, 1911, their four companies were amalgamated in New York State by Charles Ranlett Flint forming a fifth company, the Computing-Tabulating-Recording Company (CTR) based in Endicott, New York.[1][21] The five companies had 1,300 employees and offices and plants in Endicott and Binghamton, New York; Dayton, Ohio; Detroit, Michigan; Washington, D.C.; and Toronto, Canada.[22] + +Collectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. 
Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called on Flint and, in 1914, was offered a position at CTR.[23] Watson joined CTR as general manager and then, 11 months later, was made President when antitrust cases relating to his time at NCR were resolved.[24] Having learned Patterson's pioneering business practices, Watson proceeded to put the stamp of NCR onto CTR's companies.[23]: 105  He implemented sales conventions, "generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker".[25][26] His favorite slogan, "THINK", became a mantra for each company's employees.[25] During Watson's first four years, revenues reached $9 million ($158 million today) and the company's operations expanded to Europe, South America, Asia and Australia.[25] Watson never liked the clumsy hyphenated name "Computing-Tabulating-Recording Company" and chose to replace it with the more expansive title "International Business Machines" which had previously been used as the name of CTR's Canadian Division;[27] the name was changed on February 14, 1924.[28] By 1933, most of the subsidiaries had been merged into one company, IBM. + +## 1960s–1980s + +In 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.