From 4cab7d63dbf2502f84f64be7f31dc8c8f508680f Mon Sep 17 00:00:00 2001 From: Bill Murdock Date: Mon, 4 Nov 2024 17:00:02 -0500 Subject: [PATCH] Removed the use of doc.name Earlier versions used the `doc.name` as the overall title of the document, but the discussion revealed that probably it is better to just trust the `doc_chunk.meta.headings` to have the title information sooner or later. So I've removed all the special title stuff and am just relying on the headers now. Signed-off-by: Bill Murdock --- .../advanced_chunking_with_merging.ipynb | 341 ++++++++++++------ 1 file changed, 233 insertions(+), 108 deletions(-) diff --git a/docs/examples/advanced_chunking_with_merging.ipynb b/docs/examples/advanced_chunking_with_merging.ipynb index b4ee9ddb..af778b58 100644 --- a/docs/examples/advanced_chunking_with_merging.ipynb +++ b/docs/examples/advanced_chunking_with_merging.ipynb @@ -2,9 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 32, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/bmurdock/.pyenv/versions/bmurdock-pyenv-virtualenv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "from typing import Iterator\n", "import semchunk\n", @@ -15,11 +24,12 @@ ")\n", "from docling_core.types import DoclingDocument\n", "from pydantic import PositiveInt\n", - "from transformers import AutoTokenizer\n", - "from sentence_transformers import SentenceTransformer\n", "\n", "from docling.document_converter import DocumentConverter\n", - "import lancedb" + "import lancedb\n", + "\n", + "from transformers import AutoTokenizer\n", + "from sentence_transformers import SentenceTransformer" ] }, { @@ -31,7 +41,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 83514.90it/s]\n" + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 63872.65it/s]\n" ] } ], @@ -112,26 +122,6 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'iccbr2011murdock_web'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "doc.name" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, "outputs": [], "source": [ "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"" @@ -139,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -148,47 +138,55 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (1600 > 512). Running this sequence through the model will result in indexing errors\n" + ] + }, { "data": { "text/plain": [ - "['i', 'like', 'ike', '.', 'bob', 'likes', 'joe', '.']" + "1600" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "TOKENIZER.tokenize(\"I like Ike.\\nBob likes Joe.\")" + "res = TOKENIZER.tokenize(\"I like Ike.\\nBob likes Joe. 
\" * 200)\n", + "len(res)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "8" + "10" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(TOKENIZER.tokenize(\"I like Ike.\\nBob likes Joe.\"))" + "len(TOKENIZER.encode(\"I like Ike.\\nBob likes Joe.\"))" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -205,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -214,7 +212,7 @@ "8" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -225,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -235,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -244,7 +242,7 @@ "['I like', 'Ike.', 'Bob likes', 'Joe.']" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -256,45 +254,45 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from docling_core.transforms.chunker.hierarchical_chunker import DocChunk\n", "\n", "\n", - "def doc_chunk_length(doc_chunk: DocChunk, title_length: int, tokenizer):\n", + "def doc_chunk_length(doc_chunk: DocChunk, tokenizer):\n", " text_length = count_tokens(doc_chunk.text, tokenizer)\n", " # Note that count_tokens handles None and lists, making this code simpler:\n", " headings_length = count_tokens(doc_chunk.meta.headings, tokenizer)\n", " captions_length = count_tokens(doc_chunk.meta.captions, tokenizer)\n", - " total = title_length + text_length + headings_length + captions_length\n", + " total = text_length + headings_length + captions_length\n", " return {\"total\": total, \"text\": text_length, \"other\": total - text_length}" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'total': 307, 'text': 304, 'other': 3}" + "{'total': 306, 'text': 304, 'other': 2}" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "doc_chunk_length(chunks[19], 1, TOKENIZER)" + "doc_chunk_length(chunks[19], TOKENIZER)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -324,11 +322,11 @@ "\n", "\n", "def split_by_doc_items(\n", - " doc_chunk: DocChunk, title_length: int, tokenizer, chunk_size: int\n", + " doc_chunk: DocChunk, tokenizer, chunk_size: int\n", "):\n", " if doc_chunk.meta.doc_items == None or len(doc_chunk.meta.doc_items) <= 1:\n", " return [doc_chunk]\n", - " length = doc_chunk_length(doc_chunk, title_length, tokenizer)\n", + " length = doc_chunk_length(doc_chunk, tokenizer)\n", " if length[\"total\"] <= chunk_size:\n", " return [doc_chunk]\n", " else:\n", @@ -385,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -395,19 +393,19 @@ " DocChunk(text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. 
Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/28', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.39203643798828, t=385.059814453125, r=473.0816345214844, b=312.2467041015625, coord_origin=), charspan=(0, 467))], orig='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', text='\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None, origin=None))]" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "split_chunks = split_by_doc_items(chunks[19], 5, TOKENIZER, 300)\n", + "split_chunks = split_by_doc_items(chunks[19], TOKENIZER, 300)\n", "split_chunks" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -451,26 +449,25 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "def split_using_plain_text(\n", " doc_chunk: DocChunk,\n", - " title_length: int,\n", " tokenizer,\n", " plain_text_splitter,\n", " chunk_size: int,\n", "):\n", - " lengths = doc_chunk_length(doc_chunk, title_length, tokenizer)\n", + " lengths = doc_chunk_length(doc_chunk, tokenizer)\n", " if lengths[\"total\"] <= chunk_size:\n", " return [doc_chunk]\n", " else:\n", - " # How much room is there for text after subtracting out the title, headers, and captions:\n", - " available_length = chunk_size - title_length - lengths[\"other\"]\n", + " # How much room is there for text after subtracting out the headers and captions:\n", + " available_length = chunk_size - lengths[\"other\"]\n", " if available_length <= 0:\n", " raise ValueError(\n", - " \"Title, headers, and captions for this chunk are longer than the total amount of size for the chunk. This is not supported now.\"\n", + " \"Headers and captions for this chunk are longer than the total amount of size for the chunk. 
This is not supported now.\"\n", " )\n", " text = doc_chunk.text\n", " segments = plain_text_splitter.chunk(text)\n", @@ -483,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -496,7 +493,7 @@ " DocChunk(text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[ListItem(self_ref='#/texts/25', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.41297912597656, t=541.7998657226562, r=473.1099853515625, b=481.2223815917969, coord_origin=), charspan=(0, 363))], orig='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', text='\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/26', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=123.96786499023438, t=481.559814453125, r=473.1015930175781, b=408.8705139160156, coord_origin=), charspan=(0, 451))], orig='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', text='\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.', enumerated=False, marker='-'), ListItem(self_ref='#/texts/27', parent=RefItem(cref='#/groups/0'), children=[], label=, prov=[ProvenanceItem(page_no=4, bbox=BoundingBox(l=124.11611938476562, t=409.2998352050781, r=472.8858947753906, b=384.54046630859375, coord_origin=), charspan=(0, 162))], orig='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', text='\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).', enumerated=False, marker='-')], headings=['4 Algorithm'], captions=None, origin=None))]" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -508,14 +505,14 @@ "chunk_size = 50\n", "plain_text_splitter = make_splitter(TOKENIZER, chunk_size)\n", "resplit_chunks = split_using_plain_text(\n", - " split_chunks[0], 5, TOKENIZER, plain_text_splitter, chunk_size\n", + " split_chunks[0], TOKENIZER, plain_text_splitter, chunk_size\n", ")\n", "resplit_chunks" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -544,18 +541,18 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "def merge_chunks_with_matching_metadata(chunks, title_length, tokenizer, chunk_size):\n", + "def merge_chunks_with_matching_metadata(chunks, tokenizer, chunk_size):\n", " output_chunks = []\n", " window_start = 0\n", " window_end = 0\n", " l = len(chunks)\n", " while window_end < l:\n", " chunk = chunks[window_end]\n", - " lengths = doc_chunk_length(chunk, title_length, tokenizer)\n", + " lengths = doc_chunk_length(chunk, tokenizer)\n", " headings_and_captions = (chunk.meta.headings, chunk.meta.captions)\n", " if window_start == window_end:\n", " # starting a new block of chunks to potentially merge\n", @@ -600,48 +597,46 @@ " return chunks\n", "\n", "\n", - "def merge_chunks(chunks, title_length, tokenizer, chunk_size):\n", + "def merge_chunks(chunks, tokenizer, chunk_size):\n", " # merges as many chunks as possible that have the same headings+captions.\n", " initial_merged_chunks = merge_chunks_with_matching_metadata(\n", - " chunks, title_length, tokenizer, chunk_size\n", + " chunks, tokenizer, chunk_size\n", " )\n", " # merges chunks with different headings+captions. 
This is later so that merges within a section or other grouping are preferred.\n", " final_merged_chunks = merge_chunks_with_mismatching_metadata(\n", - " initial_merged_chunks, title_length, tokenizer, chunk_size\n", + " initial_merged_chunks, tokenizer, chunk_size\n", " )\n", " return final_merged_chunks" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def adjust_chunks_for_fixed_size(doc, original_chunks, tokenizer, splitter, chunk_size):\n", - " title = doc.name\n", - " title_length = count_tokens(title, tokenizer)\n", " chunks_after_splitting_by_items = []\n", " for chunk in original_chunks:\n", " chunk_split_by_doc_items = split_by_doc_items(\n", - " chunk, title_length, tokenizer, chunk_size\n", + " chunk, tokenizer, chunk_size\n", " )\n", " chunks_after_splitting_by_items.extend(chunk_split_by_doc_items)\n", " chunks_after_splitting_recursively = []\n", " for chunk in chunks_after_splitting_by_items:\n", " chunk_split_recursively = split_using_plain_text(\n", - " chunk, title_length, tokenizer, splitter, chunk_size\n", + " chunk, tokenizer, splitter, chunk_size\n", " )\n", " chunks_after_splitting_recursively.extend(chunk_split_recursively)\n", " chunks_afer_merging = merge_chunks(\n", - " chunks_after_splitting_recursively, title_length, tokenizer, chunk_size\n", + " chunks_after_splitting_recursively, tokenizer, chunk_size\n", " )\n", " return chunks_afer_merging" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -663,7 +658,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -730,7 +725,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -751,7 +746,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -787,7 +782,7 @@ "chunker = MaxTokenLimitingChunkerWithMerging(\n", " max_tokens=64, embedding_model_id=EMBED_MODEL_ID\n", ")\n", - "final_output_chunks = chunker.chunk(dl_doc=doc)\n", + "final_output_chunks = list(chunker.chunk(dl_doc=doc))\n", "\n", "\n", "i = 0\n", @@ -801,7 +796,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -812,7 +807,7 @@ " dtype=float32)" ] }, - "execution_count": 42, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -825,34 +820,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['\\uf0b7 Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\\n\\uf0b7 Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. 
As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\\n\\uf0b7 Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).\\n\\uf0b7 Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.',\n", - " '4 Algorithm']" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "def make_text_for_embedding(chunk):\n", - " output = [chunk.text]\n", + " output = \"\"\n", " if chunk.meta.headings != None:\n", - " output.extend(chunk.meta.headings)\n", + " for h in chunk.meta.headings:\n", + " output += h + '\\n'\n", " if chunk.meta.captions != None:\n", - " output.extend(chunk.meta.captions)\n", + " for c in chunk.meta.captions:\n", + " output += c + '\\n'\n", + " output += chunk.text \n", " return output" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4 Algorithm\n", + " Local Match Construction: LFACS matches both edges and nodes. Edges are matched using a formal ontology, e.g., the authorOf relation is a subrelation of the creatorOfWork relation. Nodes are matched using a variety of resources for determining equivalent terms, e.g., WordNet [5], Wikipedia redirects, and has specialized logic for matching dates, numbers, etc.\n", + " Global Map Construction: Unlike [1], LFACS is only concerned with global matches that align the focus to the specified candidate answer. Thus global map construction begins with the focus and candidate answer and search outward from those nodes through the space of local matches. As in [1], the global match construction process ensures consistency of global maps, requiring that no single node in the question map to multiple nodes in the passage.\n", + " Candidate Inference Construction: LFACS omits this step because the inference to be drawn is implied by its inputs (aligning the focus to the candidate answer).\n", + " Match Evaluation: As in [1], the total score for a match in LFACS is the sum of the match scores for the local match hypotheses included in the maximal consistent global map. Local match scores in LFACS are computed using inverse-document frequency (IDF) from our text corpus. 
Terms with high IDF scores occur rarely in the corpus so the fact that they align with the clue is less likely to be a coincidence and thus more likely to imply that the answer is correct.\n" + ] + } + ], + "source": [ + "\n", + "print(make_text_for_embedding(chunks[19]))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -869,29 +877,146 @@ " \"captions\": chunk.meta.captions\n", " }\n", " data.append(data_item)\n", - "\n", - " tbl = db.create_table(index_name, data=data)\n", + " tbl = db.create_table(index_name, data=data, exist_ok=True)\n", " return tbl" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ - "index = make_lancedb_index(\"data/lancedb\", doc.name, chunks, embedding_model)" + "index = make_lancedb_index(\"data/lancedb\", doc.name, final_output_chunks, EMBED_MODEL)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "sample_query = \"Making SME greedy and pragmatic\"\n", - "sample_embedding = EMBED_MODEL.encode(sample_query)" + "sample_embedding = EMBED_MODEL.encode(sample_query)\n", + "results = index.search(sample_embedding).limit(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
vectortextheadingscaptions_distance
0[-0.025746465, 0.038881335, 0.003366889, -0.03...3. Forbus, K. and Oblinger, D. (1990). Making ...[References]None0.332435
1[0.034203574, 0.10181023, 0.003722381, 0.00506...consider to be a match. These aggressive compo...[5 Evaluation and Conclusions]None1.469304
2[0.044002376, -0.034766, -0.00025529932, 0.004...4. McCord, M. C. (1990). Slot Grammar: A Syste...[References]None1.525625
3[0.112926856, -0.010892127, 0.007714555, -0.06...play , about , Utopia , author . There are sti...[3 Syntactic-Semantic Graphs]None1.540549
4[0.025994683, 0.08402824, 0.03268827, -0.03727...In using this algorithm, we have encountered a...[4 Algorithm]None1.576837
\n", + "
" + ], + "text/plain": [ + " vector \\\n", + "0 [-0.025746465, 0.038881335, 0.003366889, -0.03... \n", + "1 [0.034203574, 0.10181023, 0.003722381, 0.00506... \n", + "2 [0.044002376, -0.034766, -0.00025529932, 0.004... \n", + "3 [0.112926856, -0.010892127, 0.007714555, -0.06... \n", + "4 [0.025994683, 0.08402824, 0.03268827, -0.03727... \n", + "\n", + " text \\\n", + "0 3. Forbus, K. and Oblinger, D. (1990). Making ... \n", + "1 consider to be a match. These aggressive compo... \n", + "2 4. McCord, M. C. (1990). Slot Grammar: A Syste... \n", + "3 play , about , Utopia , author . There are sti... \n", + "4 In using this algorithm, we have encountered a... \n", + "\n", + " headings captions _distance \n", + "0 [References] None 0.332435 \n", + "1 [5 Evaluation and Conclusions] None 1.469304 \n", + "2 [References] None 1.525625 \n", + "3 [3 Syntactic-Semantic Graphs] None 1.540549 \n", + "4 [4 Algorithm] None 1.576837 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results.to_pandas()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {