From e41b41306a2aa55f82787213395d6950d190f4b2 Mon Sep 17 00:00:00 2001
From: Bill Murdock
Date: Fri, 1 Nov 2024 08:41:37 -0400
Subject: [PATCH] Add files via upload

Signed-off-by: Bill Murdock
---
 .../advanced_chunking_with_merging.ipynb | 812 ++++++++++++++++++
 1 file changed, 812 insertions(+)
 create mode 100644 docs/examples/advanced_chunking_with_merging.ipynb

diff --git a/docs/examples/advanced_chunking_with_merging.ipynb b/docs/examples/advanced_chunking_with_merging.ipynb
new file mode 100644
index 00000000..0461a5e0
--- /dev/null
+++ b/docs/examples/advanced_chunking_with_merging.ipynb
@@ -0,0 +1,812 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/bmurdock/.pyenv/versions/bmurdock-pyenv-virtualenv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ "  from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "from docling.document_converter import DocumentConverter\n",
+ "from docling_core.transforms.chunker import HierarchicalChunker, BaseChunk, BaseMeta, BaseChunker\n",
+ "from docling_core.types.doc.document import DocItem\n",
+ "from docling_core.types import DoclingDocument\n",
+ "\n",
+ "import semchunk\n",
+ "\n",
+ "from pydantic import Field, PositiveInt\n",
+ "from typing import Optional, Iterator\n",
+ "\n",
+ "from transformers import AutoTokenizer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 44567.57it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "conv_res = DocumentConverter().convert(\"http://bill.murdocks.org/iccbr2011murdock_web.pdf\")\n",
+ "doc = conv_res.document\n",
+ "chunks = list(HierarchicalChunker().chunk(doc))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "19\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Find the chunk containing the big bulleted list starting with \"Local\",\n",
+ "# because that chunk is useful for testing the handling of lists.\n",
+ "for i, c in enumerate(chunks):\n",
+ "    if \"Local\" in c.meta.doc_items[0].text:\n",
+ "        print(i)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\\n\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\\n\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).\\n\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chunks[19].text" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'),\n", + " ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'),\n", + " ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-'),\n", + " ListItem(self_ref='#/texts/28', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.39203643798828, t=385.059814453125, r=473.0816345214844, b=312.2467041015625, coord_origin=), charspan=(0, 467))], orig='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. 
Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', enumerated=False, marker='-')]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chunks[19].meta.doc_items"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'iccbr2011murdock_web'"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "doc.name"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "EMBED_MODEL_ID = 'sentence-transformers/all-MiniLM-L6-v2'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "TOKENIZER = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['i', 'like', 'ike', '.', 'bob', 'likes', 'joe', '.']"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "TOKENIZER.tokenize('I like Ike.\\nBob likes Joe.')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(TOKENIZER.tokenize('I like Ike.\\nBob likes Joe.'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def count_tokens(text, tokenizer):\n",
+ "    if text is None:\n",
+ "        return 0\n",
+ "    elif isinstance(text, list):\n",
+ "        total = 0\n",
+ "        for t in text:\n",
+ "            total += count_tokens(t, tokenizer)\n",
+ "        return total\n",
+ "    return len(tokenizer.tokenize(text, max_length=None))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "count_tokens(['I like Ike.\\nBob likes Joe.'], TOKENIZER)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def make_splitter(tokenizer, chunk_size):\n",
+ "    return semchunk.chunkerify(tokenizer, chunk_size)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['I like', 'Ike.', 'Bob likes', 'Joe.']"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "s = make_splitter(TOKENIZER, 2)\n",
+ "s.chunk('I like Ike.\\nBob likes Joe.')"
+ ]
+ },
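+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick sanity check, every piece the splitter produces should fit within the token budget it was built with. The next cell is a small illustrative sketch (left unexecuted here) that verifies this using the `count_tokens` helper and the 2-token splitter `s` defined above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative sketch: every piece produced by the 2-token splitter above\n",
+ "# should contain at most 2 tokens according to the same tokenizer.\n",
+ "for piece in s.chunk('I like Ike.\\nBob likes Joe.'):\n",
+ "    assert count_tokens(piece, TOKENIZER) <= 2\n",
+ "print('All pieces fit the token budget.')"
+ ]
+ },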
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def doc_chunk_length(doc_chunk, title_length, tokenizer):\n",
+ "    text_length = count_tokens(doc_chunk.text, tokenizer)\n",
+ "    # Note that count_tokens handles None and lists, making this code simpler:\n",
+ "    headings_length = count_tokens(doc_chunk.meta.headings, tokenizer)\n",
+ "    captions_length = count_tokens(doc_chunk.meta.captions, tokenizer)\n",
+ "    total = title_length + text_length + headings_length + captions_length\n",
+ "    return {\n",
+ "        'total': total,\n",
+ "        'text': text_length,\n",
+ "        'other': total - text_length\n",
+ "    }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'total': 307, 'text': 304, 'other': 3}"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "doc_chunk_length(chunks[19], 1, TOKENIZER)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Simplified version of DocMeta from the hierarchical chunker. We can't reuse that\n",
+ "# structure directly because its attributes are pydantic private attributes.\n",
+ "\n",
+ "class DocumentMeta(BaseMeta):\n",
+ "    \"\"\"Data model for chunk metadata.\"\"\"\n",
+ "\n",
+ "    doc_items: list[DocItem] = Field(\n",
+ "        min_length=1\n",
+ "    )\n",
+ "    headings: Optional[list[str]] = Field(\n",
+ "        default=None,\n",
+ "        min_length=1\n",
+ "    )\n",
+ "    captions: Optional[list[str]] = Field(\n",
+ "        default=None,\n",
+ "        min_length=1\n",
+ "    )\n",
+ "\n",
+ "\n",
+ "class DocumentChunk(BaseChunk):\n",
+ "    \"\"\"Data model for chunks.\"\"\"\n",
+ "\n",
+ "    meta: BaseMeta"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def make_chunk_from_doc_items(doc_chunk, window_text, window_start, window_end):\n",
+ "    meta = DocumentMeta(doc_items=doc_chunk.meta.doc_items[window_start:window_end + 1],\n",
+ "                        headings=doc_chunk.meta.headings,\n",
+ "                        captions=doc_chunk.meta.captions)\n",
+ "    new_chunk = DocumentChunk(text=window_text, meta=meta)\n",
+ "    return new_chunk\n",
+ "\n",
+ "\n",
+ "def merge_text(t1, t2):\n",
+ "    if t1 == \"\":\n",
+ "        return t2\n",
+ "    elif t2 == \"\":\n",
+ "        return t1\n",
+ "    else:\n",
+ "        return t1 + \"\\n\" + t2\n",
+ "\n",
+ "\n",
+ "def split_by_doc_items(doc_chunk, title_length, tokenizer, chunk_size):\n",
+ "    if doc_chunk.meta.doc_items is None or len(doc_chunk.meta.doc_items) <= 1:\n",
+ "        return [doc_chunk]\n",
+ "    length = doc_chunk_length(doc_chunk, title_length, tokenizer)\n",
+ "    if length['total'] <= chunk_size:\n",
+ "        return [doc_chunk]\n",
+ "    else:\n",
+ "        chunks = []\n",
+ "        window_start = 0\n",
+ "        window_end = 0\n",
+ "        window_text = \"\"\n",
+ "        window_text_length = 0\n",
+ "        other_length = length['other']\n",
+ "        num_items = len(doc_chunk.meta.doc_items)\n",
+ "        while window_end < num_items:\n",
+ "            doc_item = doc_chunk.meta.doc_items[window_end]\n",
+ "            text = doc_item.text\n",
+ "            text_length = count_tokens(text, tokenizer)\n",
+ "            if text_length + window_text_length + other_length < chunk_size and window_end < num_items - 1:\n",
+ "                # Still room left to add more to this chunk AND still at least one item left\n",
+ "                window_end += 1\n",
+ "                window_text_length += text_length\n",
+ "                window_text = merge_text(window_text, text)\n",
+ "            elif text_length + window_text_length + other_length < chunk_size:\n",
+ "                # All the items in the window fit into the chunk and there are no other items left\n",
+ "                window_text = merge_text(window_text, text)\n",
+ "                new_chunk = make_chunk_from_doc_items(doc_chunk, window_text, window_start, window_end)\n",
+ "                chunks.append(new_chunk)\n",
+ "                window_end = num_items\n",
+ "            elif window_start == window_end:\n",
+ "                # Only one item in the window and it doesn't fit into the chunk. 
So we'll just make it a chunk for now and it will get split in the plain text splitter.\n", + " window_text = merge_text(window_text, text)\n", + " new_chunk = make_chunk_from_doc_items(doc_chunk, window_text, window_start, window_end)\n", + " chunks.append(new_chunk)\n", + " window_start = window_end+1\n", + " window_end = window_start\n", + " window_text = ''\n", + " window_text_length = 0\n", + " else:\n", + " # Multiple items in the window but they don't fit into the chunk. However, the existing items must have fit or we wouldn't have gotten here.\n", + " # So we put everything but the last item into the chunk and then start a new window INCLUDING the current window end.\n", + " new_chunk = make_chunk_from_doc_items(doc_chunk, window_text, window_start, window_end-1)\n", + " chunks.append(new_chunk)\n", + " window_start = window_end\n", + " window_text = ''\n", + " window_text_length = 0\n", + " return chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[DocumentChunk(text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\\n\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\\n\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)),\n", + " DocumentChunk(text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/28', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.39203643798828, t=385.059814453125, r=473.0816345214844, b=312.2467041015625, coord_origin=), charspan=(0, 467))], orig='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. 
Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None))]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "split_chunks = split_by_doc_items(chunks[19], 5, TOKENIZER, 300)\n", + "split_chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Item lengths\n", + " Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\n", + "84\n", + " Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\n", + "85\n", + " Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).\n", + "33\n", + " Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.\n", + "102\n", + "Chunk lengths\n", + " Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\n", + " Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\n",
+ " Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).\n",
+ "202\n",
+ " Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.\n",
+ "102\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Item lengths')\n",
+ "\n",
+ "for item in chunks[19].meta.doc_items:\n",
+ "    count = count_tokens(item.text, TOKENIZER)\n",
+ "    print(item.text)\n",
+ "    print(count)\n",
+ "\n",
+ "print('Chunk lengths')\n",
+ "\n",
+ "for c in split_chunks:\n",
+ "    count = count_tokens(c.text, TOKENIZER)\n",
+ "    print(c.text)\n",
+ "    print(count)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def split_using_plain_text(doc_chunk, title_length, tokenizer, plain_text_splitter, chunk_size):\n",
+ "    lengths = doc_chunk_length(doc_chunk, title_length, tokenizer)\n",
+ "    if lengths['total'] <= chunk_size:\n",
+ "        return [doc_chunk]\n",
+ "    else:\n",
+ "        # How much room is left for the text after subtracting out the title, headings,\n",
+ "        # and captions (all of which are already counted in lengths['other']):\n",
+ "        available_length = chunk_size - lengths['other']\n",
+ "        if available_length <= 0:\n",
+ "            raise ValueError(\"The title, headings, and captions for this chunk are longer than the chunk size. This case is not currently supported.\")\n",
+ "        text = doc_chunk.text\n",
+ "        segments = plain_text_splitter.chunk(text)\n",
+ "        chunks = []\n",
+ "        for s in segments:\n",
+ "            new_chunk = DocumentChunk(text=s, meta=doc_chunk.meta)\n",
+ "            chunks.append(new_chunk)\n",
+ "        return chunks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[DocumentChunk(text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)),\n", + " DocumentChunk(text='resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)),\n", + " DocumentChunk(text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)),\n", + " DocumentChunk(text='matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)),\n", + " DocumentChunk(text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None))]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Normally we'd have the same chunk_size for this step too, but for testing I am taking the first output from the previous step and splitting it into even smaller chunks.\n", + "\n", + "chunk_size = 50\n", + "plain_text_splitter = make_splitter(TOKENIZER, chunk_size)\n", + "resplit_chunks = split_using_plain_text(split_chunks[0], 5, TOKENIZER, plain_text_splitter, chunk_size)\n", + "resplit_chunks " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of\n", + "50\n", + "resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\n", + "34\n", + " Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local\n", + "50\n", + "matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\n",
+ "35\n",
+ " Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).\n",
+ "33\n"
+ ]
+ }
+ ],
+ "source": [
+ "for c in resplit_chunks:\n",
+ "    count = count_tokens(c.text, TOKENIZER)\n",
+ "    print(c.text)\n",
+ "    print(count)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def merge_chunks_with_matching_metadata(chunks, title_length, tokenizer, chunk_size):\n",
+ "    output_chunks = []\n",
+ "    window_start = 0\n",
+ "    window_end = 0\n",
+ "    num_chunks = len(chunks)\n",
+ "    while window_end < num_chunks:\n",
+ "        chunk = chunks[window_end]\n",
+ "        lengths = doc_chunk_length(chunk, title_length, tokenizer)\n",
+ "        headings_and_captions = (chunk.meta.headings, chunk.meta.captions)\n",
+ "        if window_start == window_end:\n",
+ "            # starting a new block of chunks to potentially merge\n",
+ "            current_headings_and_captions = headings_and_captions\n",
+ "            window_text = chunk.text\n",
+ "            window_other_length = lengths['other']\n",
+ "            window_text_length = lengths['text']\n",
+ "            window_items = chunk.meta.doc_items\n",
+ "            window_end += 1\n",
+ "            first_chunk_of_window = chunk\n",
+ "        elif headings_and_captions == current_headings_and_captions and window_text_length + window_other_length + lengths['text'] <= chunk_size:\n",
+ "            # there is room to include the new chunk, so add it to the window and continue\n",
+ "            window_text = merge_text(window_text, chunk.text)\n",
+ "            window_text_length += lengths['text']\n",
+ "            window_items = window_items + chunk.meta.doc_items\n",
+ "            window_end += 1\n",
+ "        else:\n",
+ "            # no more room OR the start of new metadata. Either way, end the block and use the current window_end as the start of a new block\n",
+ "            if window_start + 1 == window_end:\n",
+ "                # just one chunk, so use it as is\n",
+ "                output_chunks.append(first_chunk_of_window)\n",
+ "            else:\n",
+ "                # the merged chunk gets the metadata of the window, not of the chunk that ended it\n",
+ "                new_meta = DocumentMeta(doc_items=window_items, headings=current_headings_and_captions[0], captions=current_headings_and_captions[1])\n",
+ "                new_chunk = DocumentChunk(text=window_text, meta=new_meta)\n",
+ "                output_chunks.append(new_chunk)\n",
+ "            window_start = window_end  # no need to reset window_text, etc. because they are reset in the next iteration in the window_start == window_end branch\n",
+ "    # emit the final window, which the loop above leaves unflushed\n",
+ "    if window_start + 1 == window_end:\n",
+ "        output_chunks.append(first_chunk_of_window)\n",
+ "    elif window_start < window_end:\n",
+ "        new_meta = DocumentMeta(doc_items=window_items, headings=current_headings_and_captions[0], captions=current_headings_and_captions[1])\n",
+ "        new_chunk = DocumentChunk(text=window_text, meta=new_meta)\n",
+ "        output_chunks.append(new_chunk)\n",
+ "    return output_chunks\n",
+ "\n",
+ "\n",
+ "def merge_chunks_with_mismatching_metadata(chunks, *_):\n",
+ "    # Placeholder; for now we're not merging across text with different headings+captions.\n",
+ "    # In principle it seems like a good idea for cases where you can merge entire sections,\n",
+ "    # but it is not clear what to do about the metadata then, because some of it applies to\n",
+ "    # only part of the merged text.\n",
+ "    return chunks\n",
+ "\n",
+ "\n",
+ "def merge_chunks(chunks, title_length, tokenizer, chunk_size):\n",
+ "    # Merge as many chunks as possible that have the same headings+captions.\n",
+ "    initial_merged_chunks = merge_chunks_with_matching_metadata(chunks, title_length, tokenizer, chunk_size)\n",
+ "    # Then merge chunks with different headings+captions. This comes later so that merges\n",
+ "    # within a section or other grouping are preferred.\n",
+ "    final_merged_chunks = merge_chunks_with_mismatching_metadata(initial_merged_chunks, title_length, tokenizer, chunk_size)\n",
+ "    return final_merged_chunks"
+ ]
+ },
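+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Before wiring everything together, we can exercise `merge_chunks` on its own. The next cell is a small illustrative sketch (left unexecuted here): it re-merges the deliberately over-split `resplit_chunks` from above under a larger, arbitrarily chosen 200-token budget. All of those chunks share the same headings, so they are candidates for merging."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative sketch: re-merge the deliberately over-split chunks from the\n",
+ "# plain-text splitting demo under a larger (arbitrary) 200-token budget.\n",
+ "merged = merge_chunks(resplit_chunks, 5, TOKENIZER, 200)\n",
+ "for c in merged:\n",
+ "    print(count_tokens(c.text, TOKENIZER))"
+ ]
+ },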
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def adjust_chunks_for_fixed_size(doc, original_chunks, tokenizer, splitter, chunk_size):\n",
+ "    title = doc.name\n",
+ "    title_length = count_tokens(title, tokenizer)\n",
+ "    chunks_after_splitting_by_items = []\n",
+ "    for chunk in original_chunks:\n",
+ "        chunk_split_by_doc_items = split_by_doc_items(chunk, title_length, tokenizer, chunk_size)\n",
+ "        chunks_after_splitting_by_items.extend(chunk_split_by_doc_items)\n",
+ "    chunks_after_splitting_recursively = []\n",
+ "    for chunk in chunks_after_splitting_by_items:\n",
+ "        chunk_split_recursively = split_using_plain_text(chunk, title_length, tokenizer, splitter, chunk_size)\n",
+ "        chunks_after_splitting_recursively.extend(chunk_split_recursively)\n",
+ "    chunks_after_merging = merge_chunks(chunks_after_splitting_recursively, title_length, tokenizer, chunk_size)\n",
+ "    return chunks_after_merging"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[DocumentChunk(text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\\n\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\\n\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. 
Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None)), DocumentChunk(text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.\\nIn using this algorithm, we have encountered a wide variety of technical issues that are specific to natural-language. For example, some concepts can be expressed as either a verb or a noun (e.g., destroy-destruction ). 
We address those issues through some combination of graph preprocessing (e.g., adding edges to indicate the logical subject of destruction during relation detection) and specialized logic that is internal to the local match construction (e.g., allowing the destroy to match destruction ).', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/28', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.39203643798828, t=385.059814453125, r=473.0816345214844, b=312.2467041015625, coord_origin=), charspan=(0, 467))], orig='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', enumerated=False, marker='-'), TextItem(self_ref='#/texts/29', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.8291244506836, t=311.80438232421875, r=473.0190734863281, b=240.17425537109375, coord_origin=), charspan=(0, 508))], orig='In using this algorithm, we have encountered a wide variety of technical issues that are specific to natural-language. For example, some concepts can be expressed as either a verb or a noun (e.g., destroy-destruction ). We address those issues through some combination of graph preprocessing (e.g., adding edges to indicate the logical subject of destruction during relation detection) and specialized logic that is internal to the local match construction (e.g., allowing the destroy to match destruction ).', text='In using this algorithm, we have encountered a wide variety of technical issues that are specific to natural-language. For example, some concepts can be expressed as either a verb or a noun (e.g., destroy-destruction ). We address those issues through some combination of graph preprocessing (e.g., adding edges to indicate the logical subject of destruction during relation detection) and specialized logic that is internal to the local match construction (e.g., allowing the destroy to match destruction ).')], headings=['4 Algorithm'], captions=None)), DocChunk(text='Our approach to generating local match hypotheses mostly focuses on determining equivalence (or at least rough equivalence) between nodes. This focus reflects the fact that we are interested in similarity, but not analogy per se . If we were to try to address examples like the Charles de Gaul analogy in the introduction of this paper, we would need to relax those restrictions and adjust the confidence in our conclusions accordingly. 
This may be extremely important in domains where there is less direct evidence involving the candidate answers.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/30', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.81511688232422, t=239.7743682861328, r=473.023681640625, b=156.86865234375, coord_origin=), charspan=(0, 548))], orig='Our approach to generating local match hypotheses mostly focuses on determining equivalence (or at least rough equivalence) between nodes. This focus reflects the fact that we are interested in similarity, but not analogy per se . If we were to try to address examples like the Charles de Gaul analogy in the introduction of this paper, we would need to relax those restrictions and adjust the confidence in our conclusions accordingly. This may be extremely important in domains where there is less direct evidence involving the candidate answers.', text='Our approach to generating local match hypotheses mostly focuses on determining equivalence (or at least rough equivalence) between nodes. This focus reflects the fact that we are interested in similarity, but not analogy per se . If we were to try to address examples like the Charles de Gaul analogy in the introduction of this paper, we would need to relax those restrictions and adjust the confidence in our conclusions accordingly. This may be extremely important in domains where there is less direct evidence involving the candidate answers.')], headings=['4 Algorithm'], captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2576718022335104320, filename='iccbr2011murdock_web.pdf', uri=None))), DocChunk(text='Detailed evaluations of deep evidence scoring components will be presented in a future publication. LFACS has statistically significant impact on question answering accuracy when included in either a simple baseline DeepQA question answering system or to the complete Watson question answering system that competed with human grand champions. This impact, while significant, is small: less than half of one percent in the full system; the full system has an enormous number of answer scoring components and there is a great deal of overlap in the signal they provide. Other deep evidence scoring components in DeepQA (e.g., counting term matches, comparing word order) are more aggressive in what they consider to be a match. These aggressive components have the disadvantage that they do not draw on the full richness of the syntactic and semantic structure but the advantage that they can draw evidence from passages that have little structural similarity to the question.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/32', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=123.72936248779297, t=666.814453125, r=473.1099853515625, b=523.5120239257812, coord_origin=), charspan=(0, 974))], orig='Detailed evaluations of deep evidence scoring components will be presented in a future publication. LFACS has statistically significant impact on question answering accuracy when included in either a simple baseline DeepQA question answering system or to the complete Watson question answering system that competed with human grand champions. 
This impact, while significant, is small: less than half of one percent in the full system; the full system has an enormous number of answer scoring components and there is a great deal of overlap in the signal they provide. Other deep evidence scoring components in DeepQA (e.g., counting term matches, comparing word order) are more aggressive in what they consider to be a match. These aggressive components have the disadvantage that they do not draw on the full richness of the syntactic and semantic structure but the advantage that they can draw evidence from passages that have little structural similarity to the question.', text='Detailed evaluations of deep evidence scoring components will be presented in a future publication. LFACS has statistically significant impact on question answering accuracy when included in either a simple baseline DeepQA question answering system or to the complete Watson question answering system that competed with human grand champions. This impact, while significant, is small: less than half of one percent in the full system; the full system has an enormous number of answer scoring components and there is a great deal of overlap in the signal they provide. Other deep evidence scoring components in DeepQA (e.g., counting term matches, comparing word order) are more aggressive in what they consider to be a match. These aggressive components have the disadvantage that they do not draw on the full richness of the syntactic and semantic structure but the advantage that they can draw evidence from passages that have little structural similarity to the question.')], headings=['5 Evaluation and Conclusions'], captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2576718022335104320, filename='iccbr2011murdock_web.pdf', uri=None))), DocChunk(text='The impact of LFACS when added to the simple baseline was smaller than that of the more aggressive components. However, in the complete system (containing many more features), the impact of LFACS (while small in an absolute sense) is larger than the impact of those components. The effect of ablating all of the deep evidence scoring components in the full system is much bigger than the effects of ablating any of them. These results have important implications for developers of question answering (or similar) technology. Simple, aggressive approaches are well-suited to quickly and easily attaining moderate effectiveness. However, as a system becomes more sophisticated, the opportunities for components of that sort to have impact becomes very limitted. In those cases, more algorithms such as LFACS that make effective use of syntatic and/or semantic structure can further enhance the effectiveness of a question answering system. As a result, additional and improved algorithms of this sort that draw on the full richness of our deep syntatic and semantic analysis are an important area for future research.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/33', parent=RefItem(cref='#/body'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=123.72576141357422, t=522.784423828125, r=473.09423828125, b=355.3149108886719, coord_origin=), charspan=(0, 1115))], orig='The impact of LFACS when added to the simple baseline was smaller than that of the more aggressive components. 
However, in the complete system (containing many more features), the impact of LFACS (while small in an absolute sense) is larger than the impact of those components. The effect of ablating all of the deep evidence scoring components in the full system is much bigger than the effects of ablating any of them. These results have important implications for developers of question answering (or similar) technology. Simple, aggressive approaches are well-suited to quickly and easily attaining moderate effectiveness. However, as a system becomes more sophisticated, the opportunities for components of that sort to have impact becomes very limitted. In those cases, more algorithms such as LFACS that make effective use of syntatic and/or semantic structure can further enhance the effectiveness of a question answering system. As a result, additional and improved algorithms of this sort that draw on the full richness of our deep syntatic and semantic analysis are an important area for future research.', text='The impact of LFACS when added to the simple baseline was smaller than that of the more aggressive components. However, in the complete system (containing many more features), the impact of LFACS (while small in an absolute sense) is larger than the impact of those components. The effect of ablating all of the deep evidence scoring components in the full system is much bigger than the effects of ablating any of them. These results have important implications for developers of question answering (or similar) technology. Simple, aggressive approaches are well-suited to quickly and easily attaining moderate effectiveness. However, as a system becomes more sophisticated, the opportunities for components of that sort to have impact becomes very limitted. In those cases, more algorithms such as LFACS that make effective use of syntatic and/or semantic structure can further enhance the effectiveness of a question answering system. As a result, additional and improved algorithms of this sort that draw on the full richness of our deep syntatic and semantic analysis are an important area for future research.')], headings=['5 Evaluation and Conclusions'], captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2576718022335104320, filename='iccbr2011murdock_web.pdf', uri=None))), DocumentChunk(text='1. Falkenhainer, B., Forbus, K. and Gentner, D. (1989). The Structure Mapping Engine: Algorithm and examples. Artificial Intelligence , 41, 1-63.\\n2. Ferrucci, D., Brown, E., Chu-Carroll, J., Fan, J., Gondek, D., Kalyanpur, A., Lally, A., Murdock, J. W., Nyberg, E., Prager, J., Schlaefer, N., and Welty, C. (2010) Building Watson: An Overview of the DeepQA Project. AI Magazine 31(3):59-79.\\n3. Forbus, K. and Oblinger, D. (1990). Making SME greedy and pragmatic. Proceedings of the Cognitive Science Society .\\n4. McCord, M. C. (1990). Slot Grammar: A System for Simpler Construction of Practical Natural Language Grammars. Natural Language and Logic: International Scientific Symposium . Lecture Notes in Computer Science 459. Berlin: Springer Verlag.', meta=DocumentMeta(doc_items=[ListItem(self_ref='#/texts/35', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=129.25999450683594, t=309.7490539550781, r=472.53497314453125, b=288.49505615234375, coord_origin=), charspan=(0, 145))], orig='1. Falkenhainer, B., Forbus, K. and Gentner, D. (1989). The Structure Mapping Engine: Algorithm and examples. 
Artificial Intelligence , 41, 1-63.', text='1. Falkenhainer, B., Forbus, K. and Gentner, D. (1989). The Structure Mapping Engine: Algorithm and examples. Artificial Intelligence , 41, 1-63.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/36', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=128.63955688476562, t=287.6390380859375, r=472.5289001464844, b=255.7160186767578, coord_origin=), charspan=(0, 244))], orig='2. Ferrucci, D., Brown, E., Chu-Carroll, J., Fan, J., Gondek, D., Kalyanpur, A., Lally, A., Murdock, J. W., Nyberg, E., Prager, J., Schlaefer, N., and Welty, C. (2010) Building Watson: An Overview of the DeepQA Project. AI Magazine 31(3):59-79.', text='2. Ferrucci, D., Brown, E., Chu-Carroll, J., Fan, J., Gondek, D., Kalyanpur, A., Lally, A., Murdock, J. W., Nyberg, E., Prager, J., Schlaefer, N., and Welty, C. (2010) Building Watson: An Overview of the DeepQA Project. AI Magazine 31(3):59-79.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/37', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=128.71495056152344, t=254.63900756835938, r=472.8249816894531, b=233.75601196289062, coord_origin=), charspan=(0, 118))], orig='3. Forbus, K. and Oblinger, D. (1990). Making SME greedy and pragmatic. Proceedings of the Cognitive Science Society .', text='3. Forbus, K. and Oblinger, D. (1990). Making SME greedy and pragmatic. Proceedings of the Cognitive Science Society .', enumerated=False, marker='-'), ListItem(self_ref='#/texts/38', parent=RefItem(cref='#/groups/1'), children=[], label=, prov=[ProvenanceItem(page_no=5, bbox=BoundingBox(l=128.1455535888672, t=232.67901611328125, r=472.8609619140625, b=200.1046142578125, coord_origin=), charspan=(0, 241))], orig='4. McCord, M. C. (1990). Slot Grammar: A System for Simpler Construction of Practical Natural Language Grammars. Natural Language and Logic: International Scientific Symposium . Lecture Notes in Computer Science 459. Berlin: Springer Verlag.', text='4. McCord, M. C. (1990). Slot Grammar: A System for Simpler Construction of Practical Natural Language Grammars. Natural Language and Logic: International Scientific Symposium . Lecture Notes in Computer Science 459. Berlin: Springer Verlag.', enumerated=False, marker='-')], headings=['References'], captions=None))]\n" + ] + } + ], + "source": [ + "chunk_size = 256\n", + "test_chunks = chunks[19:25]\n", + "adjusted = adjust_chunks_for_fixed_size(doc, test_chunks, TOKENIZER, make_splitter(TOKENIZER, chunk_size), chunk_size)\n", + "print(adjusted)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original chunks\n", + " Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\n", + " Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\n", + " Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).\n", + " Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.\n", + "304\n", + "In using this algorithm, we have encountered a wide variety of technical issues that are specific to natural-language. For example, some concepts can be expressed as either a verb or a noun (e.g., destroy-destruction ). We address those issues through some combination of graph preprocessing (e.g., adding edges to indicate the logical subject of destruction during relation detection) and specialized logic that is internal to the local match construction (e.g., allowing the destroy to match destruction ).\n", + "105\n", + "Our approach to generating local match hypotheses mostly focuses on determining equivalence (or at least rough equivalence) between nodes. This focus reflects the fact that we are interested in similarity, but not analogy per se . If we were to try to address examples like the Charles de Gaul analogy in the introduction of this paper, we would need to relax those restrictions and adjust the confidence in our conclusions accordingly. This may be extremely important in domains where there is less direct evidence involving the candidate answers.\n", + "98\n", + "Detailed evaluations of deep evidence scoring components will be presented in a future publication. LFACS has statistically significant impact on question answering accuracy when included in either a simple baseline DeepQA question answering system or to the complete Watson question answering system that competed with human grand champions. This impact, while significant, is small: less than half of one percent in the full system; the full system has an enormous number of answer scoring components and there is a great deal of overlap in the signal they provide. Other deep evidence scoring components in DeepQA (e.g., counting term matches, comparing word order) are more aggressive in what they consider to be a match. These aggressive components have the disadvantage that they do not draw on the full richness of the syntactic and semantic structure but the advantage that they can draw evidence from passages that have little structural similarity to the question.\n", + "177\n", + "The impact of LFACS when added to the simple baseline was smaller than that of the more aggressive components. However, in the complete system (containing many more features), the impact of LFACS (while small in an absolute sense) is larger than the impact of those components. The effect of ablating all of the deep evidence scoring components in the full system is much bigger than the effects of ablating any of them. These results have important implications for developers of question answering (or similar) technology. Simple, aggressive approaches are well-suited to quickly and easily attaining moderate effectiveness. 
However, as a system becomes more sophisticated, the opportunities for components of that sort to have impact becomes very limitted. In those cases, more algorithms such as LFACS that make effective use of syntatic and/or semantic structure can further enhance the effectiveness of a question answering system. As a result, additional and improved algorithms of this sort that draw on the full richness of our deep syntatic and semantic analysis are an important area for future research.\n", + "213\n", + "1. Falkenhainer, B., Forbus, K. and Gentner, D. (1989). The Structure Mapping Engine: Algorithm and examples. Artificial Intelligence , 41, 1-63.\n", + "2. Ferrucci, D., Brown, E., Chu-Carroll, J., Fan, J., Gondek, D., Kalyanpur, A., Lally, A., Murdock, J. W., Nyberg, E., Prager, J., Schlaefer, N., and Welty, C. (2010) Building Watson: An Overview of the DeepQA Project. AI Magazine 31(3):59-79.\n", + "3. Forbus, K. and Oblinger, D. (1990). Making SME greedy and pragmatic. Proceedings of the Cognitive Science Society .\n", + "4. McCord, M. C. (1990). Slot Grammar: A System for Simpler Construction of Practical Natural Language Grammars. Natural Language and Logic: International Scientific Symposium . Lecture Notes in Computer Science 459. Berlin: Springer Verlag.\n", + "5. Miller, G. A. (1995). WordNet: A Lexical Database for English. Communications of the ACM Vol. 38, No. 11: 39-41.\n", + "273\n", + "Adjusted chunks\n", + " Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\n", + " Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\n", + " Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).\n", + "202\n", + " Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.\n", + "In using this algorithm, we have encountered a wide variety of technical issues that are specific to natural-language. For example, some concepts can be expressed as either a verb or a noun (e.g., destroy-destruction ). 
We address those issues through some combination of graph preprocessing (e.g., adding edges to indicate the logical subject of destruction during relation detection) and specialized logic that is internal to the local match construction (e.g., allowing the destroy to match destruction ).\n", + "207\n", + "Our approach to generating local match hypotheses mostly focuses on determining equivalence (or at least rough equivalence) between nodes. This focus reflects the fact that we are interested in similarity, but not analogy per se . If we were to try to address examples like the Charles de Gaul analogy in the introduction of this paper, we would need to relax those restrictions and adjust the confidence in our conclusions accordingly. This may be extremely important in domains where there is less direct evidence involving the candidate answers.\n", + "98\n", + "Detailed evaluations of deep evidence scoring components will be presented in a future publication. LFACS has statistically significant impact on question answering accuracy when included in either a simple baseline DeepQA question answering system or to the complete Watson question answering system that competed with human grand champions. This impact, while significant, is small: less than half of one percent in the full system; the full system has an enormous number of answer scoring components and there is a great deal of overlap in the signal they provide. Other deep evidence scoring components in DeepQA (e.g., counting term matches, comparing word order) are more aggressive in what they consider to be a match. These aggressive components have the disadvantage that they do not draw on the full richness of the syntactic and semantic structure but the advantage that they can draw evidence from passages that have little structural similarity to the question.\n", + "177\n", + "The impact of LFACS when added to the simple baseline was smaller than that of the more aggressive components. However, in the complete system (containing many more features), the impact of LFACS (while small in an absolute sense) is larger than the impact of those components. The effect of ablating all of the deep evidence scoring components in the full system is much bigger than the effects of ablating any of them. These results have important implications for developers of question answering (or similar) technology. Simple, aggressive approaches are well-suited to quickly and easily attaining moderate effectiveness. However, as a system becomes more sophisticated, the opportunities for components of that sort to have impact becomes very limitted. In those cases, more algorithms such as LFACS that make effective use of syntatic and/or semantic structure can further enhance the effectiveness of a question answering system. As a result, additional and improved algorithms of this sort that draw on the full richness of our deep syntatic and semantic analysis are an important area for future research.\n", + "213\n", + "1. Falkenhainer, B., Forbus, K. and Gentner, D. (1989). The Structure Mapping Engine: Algorithm and examples. Artificial Intelligence , 41, 1-63.\n", + "2. Ferrucci, D., Brown, E., Chu-Carroll, J., Fan, J., Gondek, D., Kalyanpur, A., Lally, A., Murdock, J. W., Nyberg, E., Prager, J., Schlaefer, N., and Welty, C. (2010) Building Watson: An Overview of the DeepQA Project. AI Magazine 31(3):59-79.\n", + "3. Forbus, K. and Oblinger, D. (1990). Making SME greedy and pragmatic. Proceedings of the Cognitive Science Society .\n", + "4. McCord, M. C. 
(1990). Slot Grammar: A System for Simpler Construction of Practical Natural Language Grammars. Natural Language and Logic: International Scientific Symposium . Lecture Notes in Computer Science 459. Berlin: Springer Verlag.\n",
     "234\n"
    ]
   }
  ],
  "source": [
   "print('Original chunks')\n",
   "\n",
   "for chunk in test_chunks:\n",
   "    count = count_tokens(chunk.text, TOKENIZER)\n",
   "    print(chunk.text)\n",
   "    print(count)\n",
   "\n",
   "print('Adjusted chunks')\n",
   "\n",
   "for c in adjusted:\n",
   "    count = count_tokens(c.text, TOKENIZER)\n",
   "    print(c.text)\n",
   "    print(count)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 28,
  "metadata": {},
  "outputs": [],
  "source": [
   "class MaxTokenLimitingChunkerWithMerging(BaseChunker):\n",
   "    # Wraps an inner chunker, then splits oversized chunks and merges\n",
   "    # undersized ones so that each emitted chunk fits within max_tokens\n",
   "    # under the embedding model's tokenizer.\n",
   "    inner_chunker: BaseChunker = HierarchicalChunker()\n",
   "    max_tokens: PositiveInt = 512\n",
   "    embedding_model_id: str\n",
   "\n",
   "    def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:\n",
   "        preliminary_chunks = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)\n",
   "        # The tokenizer is loaded on every call; callers chunking many\n",
   "        # documents may want to cache it on the instance instead.\n",
   "        tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_id)\n",
   "        splitter = make_splitter(tokenizer, self.max_tokens)\n",
   "        output_chunks = adjust_chunks_for_fixed_size(dl_doc, preliminary_chunks, tokenizer, splitter, self.max_tokens)\n",
   "        return iter(output_chunks)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 29,
  "metadata": {},
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "murdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598\n",
     "33\n",
     "Abstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct answer to the question. Recognizing\n",
     "64\n",
     "whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied to determine similarity between content in questions and passages. That algorithm\n",
     "64\n",
     "is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.\n",
     "26\n",
     "Watson is a question answering system built on a set of technologies known as DeepQA [2]. Watson has been customized and configured to compete at Jeopardy!, an American television quiz show. Watson takes in a question and produces a ranked list of answers with confidence scores attached to each of these answers.\n",
     "60\n",
     "One of the stages in the DeepQA question answering pipeline is deep evidence scoring. This stage receives as input a question and a candidate answer in the context of some supporting evidence (typically a passage containing that answer). Questions typically have a focus identified for them (i.e., the term in the question indicating the\n",
     "64\n",
     "answer being sought). For example, a deep evidence scorer could be given a question like \" He was the first U.S. President \" and a passage like \" George Washington was the first U.S. President .\" \" He \" in the question will be marked as the focus. 
If the candidate answer is\n",
     "64\n",
     "\" George Washington ,\" each of the deep evidence scorers will attempt to determine the extent to which what the passage says about the \" George Washington \" addresses what the question asks about the \" He \". In this example, there is a perfect match, and all of Watson's deep evidence scoring mechanisms will conclude that\n",
     "64\n",
     "this passage strongly supports the specified answer. However, other passages may answer the question less directly, or provide evidence for only a portion of what the question is asking for (e.g., that Washington was a president).\n",
     "46\n",
     "Source: A. Ram and N. Wiratunga (Eds.): ICCBR 2011 , LNAI 6880, pp. 6-10, 2011. © Springer-Verlag Berlin Heidelberg 2011. The original publication is available at www.springerlink.com\n",
     "54\n",
     "The examples above do not require any explicit analogy. One could envision passages that say (for example) that (a) Charles de Gaul was a great French general who fought for the liberation of France, (b) that Charles de Gaulle was the first president of the fifth republic of France, and (c) that\n",
     "64\n"
    ]
   }
  ],
  "source": [
   "chunker = MaxTokenLimitingChunkerWithMerging(max_tokens=64, embedding_model_id=EMBED_MODEL_ID)\n",
   "final_output_chunks = chunker.chunk(dl_doc=doc)\n",
   "\n",
   "# Preview the first several merged chunks along with their token counts.\n",
   "i = 0\n",
   "for chunk in final_output_chunks:\n",
   "    print(chunk.text)\n",
   "    print(count_tokens(chunk.text, TOKENIZER))\n",
   "    i += 1\n",
   "    if i > 10:\n",
   "        break"
  ]
 },
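 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "**Appendix: a sketch of the token-counting and splitting helpers.** The functions `count_tokens`, `make_splitter`, and `adjust_chunks_for_fixed_size` used throughout this notebook are defined in earlier cells. For quick reference, the cell below gives a minimal, *hypothetical* sketch of the first two, based only on how they are called here: the model id is an arbitrary example, and `semchunk.chunkerify` is one way to build a token-limited splitter with the `semchunk` library. This is an illustration of the general shape, not necessarily the exact implementations used above."
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "# Illustrative sketch only: standalone stand-ins for the helpers assumed above.\n",
   "import semchunk\n",
   "from transformers import AutoTokenizer\n",
   "\n",
   "def sketch_count_tokens(text, tokenizer):\n",
   "    # Number of tokens the embedding model's tokenizer produces for `text`.\n",
   "    return len(tokenizer.tokenize(text)) if text else 0\n",
   "\n",
   "def sketch_make_splitter(tokenizer, chunk_size):\n",
   "    # semchunk.chunkerify returns a callable that splits a string into\n",
   "    # pieces of at most chunk_size tokens each.\n",
   "    return semchunk.chunkerify(tokenizer, chunk_size)\n",
   "\n",
   "# Example usage with an arbitrary embedding model id (an assumption, not\n",
   "# necessarily the EMBED_MODEL_ID defined earlier in this notebook):\n",
   "sketch_tokenizer = AutoTokenizer.from_pretrained(\"sentence-transformers/all-MiniLM-L6-v2\")\n",
   "sketch_splitter = sketch_make_splitter(sketch_tokenizer, 64)\n",
   "sample = \"Structure mapping aligns the focus of a question to a candidate answer.\"\n",
   "print(sketch_splitter(sample))\n",
   "print(sketch_count_tokens(sample, sketch_tokenizer))"
  ]
 }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "bmurdock-pyenv-virtualenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}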