From 98efb8957e37ade0b67780dcef42474a06f2aace Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Tue, 5 Nov 2024 07:29:45 +0100
Subject: [PATCH] reformatted the code to pass the tests

Signed-off-by: Peter Staar
---
 .../advanced_chunking_with_merging.ipynb | 33 +++++++------------
 docs/examples/rag_langchain.ipynb        |  1 +
 2 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/docs/examples/advanced_chunking_with_merging.ipynb b/docs/examples/advanced_chunking_with_merging.ipynb
index af778b58..c68b0bc1 100644
--- a/docs/examples/advanced_chunking_with_merging.ipynb
+++ b/docs/examples/advanced_chunking_with_merging.ipynb
@@ -16,20 +16,16 @@
    ],
    "source": [
     "from typing import Iterator\n",
+    "\n",
+    "import lancedb\n",
     "import semchunk\n",
-    "from docling_core.transforms.chunker import (\n",
-    "    BaseChunk,\n",
-    "    BaseChunker,\n",
-    "    HierarchicalChunker\n",
-    ")\n",
+    "from docling_core.transforms.chunker import BaseChunk, BaseChunker, HierarchicalChunker\n",
     "from docling_core.types import DoclingDocument\n",
     "from pydantic import PositiveInt\n",
-    "\n",
-    "from docling.document_converter import DocumentConverter\n",
-    "import lancedb\n",
-    "\n",
+    "from sentence_transformers import SentenceTransformer\n",
     "from transformers import AutoTokenizer\n",
-    "from sentence_transformers import SentenceTransformer"
+    "\n",
+    "from docling.document_converter import DocumentConverter"
    ]
   },
   {
@@ -321,9 +317,7 @@
     "    return t1 + \"\\n\" + t2\n",
     "\n",
     "\n",
-    "def split_by_doc_items(\n",
-    "    doc_chunk: DocChunk, tokenizer, chunk_size: int\n",
-    "):\n",
+    "def split_by_doc_items(doc_chunk: DocChunk, tokenizer, chunk_size: int):\n",
     "    if doc_chunk.meta.doc_items == None or len(doc_chunk.meta.doc_items) <= 1:\n",
     "        return [doc_chunk]\n",
     "    length = doc_chunk_length(doc_chunk, tokenizer)\n",
@@ -618,9 +612,7 @@
     "def adjust_chunks_for_fixed_size(doc, original_chunks, tokenizer, splitter, chunk_size):\n",
     "    chunks_after_splitting_by_items = []\n",
     "    for chunk in original_chunks:\n",
-    "        chunk_split_by_doc_items = split_by_doc_items(\n",
-    "            chunk, tokenizer, chunk_size\n",
-    "        )\n",
+    "        chunk_split_by_doc_items = split_by_doc_items(chunk, tokenizer, chunk_size)\n",
     "        chunks_after_splitting_by_items.extend(chunk_split_by_doc_items)\n",
     "    chunks_after_splitting_recursively = []\n",
     "    for chunk in chunks_after_splitting_by_items:\n",
@@ -828,11 +820,11 @@
     "    output = \"\"\n",
     "    if chunk.meta.headings != None:\n",
     "        for h in chunk.meta.headings:\n",
-    "            output += h + '\\n'\n",
+    "            output += h + \"\\n\"\n",
     "    if chunk.meta.captions != None:\n",
     "        for c in chunk.meta.captions:\n",
-    "            output += c + '\\n'\n",
-    "    output += chunk.text \n",
+    "            output += c + \"\\n\"\n",
+    "    output += chunk.text\n",
     "    return output"
    ]
   },
@@ -854,7 +846,6 @@
     }
    ],
    "source": [
-    "\n",
     "print(make_text_for_embedding(chunks[19]))"
    ]
   },
@@ -874,7 +865,7 @@
     "            \"vector\": embeddings,\n",
     "            \"text\": chunk.text,\n",
     "            \"headings\": chunk.meta.headings,\n",
-    "            \"captions\": chunk.meta.captions\n",
+    "            \"captions\": chunk.meta.captions,\n",
     "        }\n",
     "        data.append(data_item)\n",
     "    tbl = db.create_table(index_name, data=data, exist_ok=True)\n",
diff --git a/docs/examples/rag_langchain.ipynb b/docs/examples/rag_langchain.ipynb
index 31ff009a..3f166d98 100644
--- a/docs/examples/rag_langchain.ipynb
+++ b/docs/examples/rag_langchain.ipynb
@@ -85,6 +85,7 @@
     "\n",
     "from docling.document_converter import DocumentConverter\n",
     "\n",
+    "\n",
     "class DoclingPDFLoader(BaseLoader):\n",
     "\n",
     "    def __init__(self, file_path: str | list[str]) -> None:\n",
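
Note on reproducing this kind of cleanup locally: the hunks above are import reordering and quote/whitespace normalization of the sort produced by isort and black. The Python sketch below is a minimal illustration of checking a snippet against those two tools programmatically; it assumes default isort/black settings (the repository's actual lint configuration may differ), and the is_formatted helper and sample string are hypothetical, not part of this patch.

# Minimal sketch, assuming isort + black with default settings are what the
# formatting checks enforce; `is_formatted` and `sample` are hypothetical.
import black
import isort


def is_formatted(source: str) -> bool:
    """Return True if `source` already satisfies isort and black formatting."""
    sorted_source = isort.code(source)  # normalize import grouping and order
    blackened = black.format_str(sorted_source, mode=black.Mode())  # apply black style
    return blackened == source


sample = (
    "from typing import Iterator\n"
    "\n"
    "import lancedb\n"
    "import semchunk\n"
)
print(is_formatted(sample))  # True when the snippet is already clean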