Skip to content

Commit

Permalink
reformatted the code to pass the tests
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Nov 5, 2024
1 parent 4cab7d6 commit 98efb89
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 21 deletions.
33 changes: 12 additions & 21 deletions docs/examples/advanced_chunking_with_merging.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,16 @@
],
"source": [
"from typing import Iterator\n",
"\n",
"import lancedb\n",
"import semchunk\n",
"from docling_core.transforms.chunker import (\n",
" BaseChunk,\n",
" BaseChunker,\n",
" HierarchicalChunker\n",
")\n",
"from docling_core.transforms.chunker import BaseChunk, BaseChunker, HierarchicalChunker\n",
"from docling_core.types import DoclingDocument\n",
"from pydantic import PositiveInt\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"import lancedb\n",
"\n",
"from sentence_transformers import SentenceTransformer\n",
"from transformers import AutoTokenizer\n",
"from sentence_transformers import SentenceTransformer"
"\n",
"from docling.document_converter import DocumentConverter"
]
},
{
Expand Down Expand Up @@ -321,9 +317,7 @@
" return t1 + \"\\n\" + t2\n",
"\n",
"\n",
"def split_by_doc_items(\n",
" doc_chunk: DocChunk, tokenizer, chunk_size: int\n",
"):\n",
"def split_by_doc_items(doc_chunk: DocChunk, tokenizer, chunk_size: int):\n",
" if doc_chunk.meta.doc_items == None or len(doc_chunk.meta.doc_items) <= 1:\n",
" return [doc_chunk]\n",
" length = doc_chunk_length(doc_chunk, tokenizer)\n",
Expand Down Expand Up @@ -618,9 +612,7 @@
"def adjust_chunks_for_fixed_size(doc, original_chunks, tokenizer, splitter, chunk_size):\n",
" chunks_after_splitting_by_items = []\n",
" for chunk in original_chunks:\n",
" chunk_split_by_doc_items = split_by_doc_items(\n",
" chunk, tokenizer, chunk_size\n",
" )\n",
" chunk_split_by_doc_items = split_by_doc_items(chunk, tokenizer, chunk_size)\n",
" chunks_after_splitting_by_items.extend(chunk_split_by_doc_items)\n",
" chunks_after_splitting_recursively = []\n",
" for chunk in chunks_after_splitting_by_items:\n",
Expand Down Expand Up @@ -828,11 +820,11 @@
" output = \"\"\n",
" if chunk.meta.headings != None:\n",
" for h in chunk.meta.headings:\n",
" output += h + '\\n'\n",
" output += h + \"\\n\"\n",
" if chunk.meta.captions != None:\n",
" for c in chunk.meta.captions:\n",
" output += c + '\\n'\n",
" output += chunk.text \n",
" output += c + \"\\n\"\n",
" output += chunk.text\n",
" return output"
]
},
Expand All @@ -854,7 +846,6 @@
}
],
"source": [
"\n",
"print(make_text_for_embedding(chunks[19]))"
]
},
Expand All @@ -874,7 +865,7 @@
" \"vector\": embeddings,\n",
" \"text\": chunk.text,\n",
" \"headings\": chunk.meta.headings,\n",
" \"captions\": chunk.meta.captions\n",
" \"captions\": chunk.meta.captions,\n",
" }\n",
" data.append(data_item)\n",
" tbl = db.create_table(index_name, data=data, exist_ok=True)\n",
Expand Down
1 change: 1 addition & 0 deletions docs/examples/rag_langchain.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
"\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"\n",
"class DoclingPDFLoader(BaseLoader):\n",
"\n",
" def __init__(self, file_path: str | list[str]) -> None:\n",
Expand Down

0 comments on commit 98efb89

Please sign in to comment.