From ce9a2fa5279ff99441c2c2332078b2ba7b6b0e01 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Mon, 27 May 2024 16:54:18 +0200
Subject: [PATCH] chore: apply black formatting and enable CI on pre-commit hooks (#66)

---
 .github/actions/setup-poetry/action.yml       |   19 +
 .github/workflows/checks.yml                  |   18 +
 .pre-commit-config.yaml                       |    2 +-
 .../manage_attachments.ipynb                  |   20 +-
 .../upload_converted_documents.ipynb          |   41 +-
 examples/data_query_chemistry/chemistry.ipynb |   83 +-
 .../chemistry_patcid.ipynb                    |   55 +-
 .../data_query_quick_start/quick_start.ipynb  |   75 +-
 examples/data_query_snippets/snippets.ipynb   |  160 +-
 .../document_bulk_upload/run_batch_upload.py  |   33 +-
 .../convert_documents_custom.ipynb            |   18 +-
 .../extract_tables.ipynb                      |   65 +-
 .../convert_documents.ipynb                   |   32 +-
 .../visualize_bbox.ipynb                      |  117 +-
 .../integration_argilla/argilla_upload.ipynb  |    4 +-
 .../kg_download_quick_start.ipynb             |  120 +-
 .../nlp_for_materials/nlp_for_materials.ipynb |   68 +-
 .../nlp_for_references.ipynb                  |   52 +-
 .../nlp_on_documents/nlp_on_documents.ipynb   |   91 +-
 .../qa_doc_collection/doc_collection_qa.ipynb | 1975 +++++++++--------
 examples/qa_single_doc/single_doc_qa.ipynb    |   56 +-
 21 files changed, 1678 insertions(+), 1426 deletions(-)
 create mode 100644 .github/actions/setup-poetry/action.yml
 create mode 100644 .github/workflows/checks.yml

diff --git a/.github/actions/setup-poetry/action.yml b/.github/actions/setup-poetry/action.yml
new file mode 100644
index 0000000..8fe1b14
--- /dev/null
+++ b/.github/actions/setup-poetry/action.yml
@@ -0,0 +1,19 @@
+name: 'Set up Poetry and install'
+description: 'Set up a specific version of Poetry and install dependencies using caching.'
+inputs:
+  python-version:
+    description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax."
+ default: '3.10' +runs: + using: 'composite' + steps: + - name: Install poetry + run: pipx install poetry==1.8.3 + shell: bash + - uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + cache: 'poetry' + - name: Install dependencies + run: poetry install --all-extras + shell: bash \ No newline at end of file diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml new file mode 100644 index 0000000..971e1e2 --- /dev/null +++ b/.github/workflows/checks.yml @@ -0,0 +1,18 @@ +on: + push: + branches: + - "**" + +jobs: + run-checks: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10'] + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-poetry + with: + python-version: ${{ matrix.python-version }} + - name: Run styling check + run: poetry run pre-commit run --all-files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22b45ba..47a8da8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: system name: Black - entry: poetry run black nbrunner dsnotebooks examples + entry: poetry run black --include '(\.py|\.ipynb)$' nbrunner dsnotebooks examples pass_filenames: false language: system files: '(\.py|\.ipynb)$' diff --git a/examples/attachment_management/manage_attachments.ipynb b/examples/attachment_management/manage_attachments.ipynb index ea311ef..5a25a2f 100644 --- a/examples/attachment_management/manage_attachments.ipynb +++ b/examples/attachment_management/manage_attachments.ipynb @@ -42,10 +42,10 @@ "# notebook settings auto-loaded from .env / env vars\n", "notebook_settings = ProjectNotebookSettings()\n", "\n", - "PROFILE_NAME = notebook_settings.profile # profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # project to use\n", + "PROFILE_NAME = notebook_settings.profile # profile to use\n", + "PROJ_KEY = notebook_settings.proj_key # project to use\n", "INDEX_NAME = notebook_settings.new_idx_name # index to create\n", - "CLEANUP = notebook_settings.cleanup # whether to clean up\n", + "CLEANUP = notebook_settings.cleanup # whether to clean up\n", "ATTACHMENT_KEY = \"usr_attachments\" # format must be: \"usr_\"\n", "FILES_TO_ATTACH = [\n", " \"../../data/samples/2206.00785.pdf\",\n", @@ -100,8 +100,10 @@ "metadata": {}, "outputs": [], "source": [ - "def find_index_item(api, coordinates, search_query=\"*\", source=None, page_size=50, pred=None):\n", - " \"\"\" Find first index item that satisfies the criteria \"\"\"\n", + "def find_index_item(\n", + " api, coordinates, search_query=\"*\", source=None, page_size=50, pred=None\n", + "):\n", + " \"\"\"Find first index item that satisfies the criteria\"\"\"\n", " source_to_use = [\"_id\", \"_name\", \"_s3_data\"] if source is None else source\n", " query = DataQuery(\n", " search_query=search_query,\n", @@ -116,6 +118,7 @@ " return item\n", " return None\n", "\n", + "\n", "def list_item_attachments(api, coordinates, index_item_id, attch_key):\n", " pred = lambda x: x[\"_id\"] == index_item_id\n", " item = find_index_item(api, coordinates, pred=pred)\n", @@ -420,9 +423,7 @@ " m.update(json.dumps(row, sort_keys=True).encode())\n", " h = m.hexdigest()\n", " row[\"_name\"] = f\"row-{i:06d}-{h[:5]}\"\n", - " row[\"file-info\"] = {\n", - " \"document-hash\": h\n", - " }" + " row[\"file-info\"] = {\"document-hash\": h}" ] }, { @@ -609,8 +610,7 @@ " filename = Path(attachment[\"path\"]).name\n", " download_url = attachment[\"url\"]\n", " display(HTML(f'👉 Download 
{filename}'))\n", - " print()\n", - " " + " print()" ] }, { diff --git a/examples/bring_your_own_converted_documents/upload_converted_documents.ipynb b/examples/bring_your_own_converted_documents/upload_converted_documents.ipynb index ee2d339..8460404 100644 --- a/examples/bring_your_own_converted_documents/upload_converted_documents.ipynb +++ b/examples/bring_your_own_converted_documents/upload_converted_documents.ipynb @@ -46,10 +46,10 @@ "# notebook settings auto-loaded from .env / env vars\n", "notebook_settings = ProjectNotebookSettings()\n", "\n", - "PROFILE_NAME = notebook_settings.profile # profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # project to use\n", + "PROFILE_NAME = notebook_settings.profile # profile to use\n", + "PROJ_KEY = notebook_settings.proj_key # project to use\n", "INDEX_NAME = notebook_settings.new_idx_name # index to create\n", - "CLEANUP = notebook_settings.cleanup # whether to clean up\n", + "CLEANUP = notebook_settings.cleanup # whether to clean up\n", "INPUT_FILES_FOLDER = Path(\"../../data/converted/\")\n", "TMP_DIR = tempfile.TemporaryDirectory()" ] @@ -212,7 +212,11 @@ } ], "source": [ - "display(Markdown(f\"The data is now available. You can query it programmatically (see next section) or access it via the Deep Search UI at
{api.client.config.host}/projects/{PROJ_KEY}/library/private/{data_index.source.index_key}\"))"
+    "display(\n",
+    "    Markdown(\n",
+    "        f\"The data is now available. You can query it programmatically (see next section) or access it via the Deep Search UI at
{api.client.config.host}/projects/{PROJ_KEY}/library/private/{data_index.source.index_key}\"\n", + " )\n", + ")" ] }, { @@ -249,7 +253,7 @@ "# Count the documents in the data index\n", "query = DataQuery(\"*\", source=[\"\"], limit=0, coordinates=data_index.source)\n", "query_results = api.queries.run(query)\n", - "num_results = query_results.outputs['data_count']\n", + "num_results = query_results.outputs[\"data_count\"]\n", "print(f\"The data index contains {num_results} entries.\")" ] }, @@ -286,23 +290,34 @@ "source": [ "# Find documents matching query\n", "search_query = \"speedup\"\n", - "query = DataQuery(search_query, source=[\"description.title\", \"description.authors\"], coordinates=data_index.source)\n", + "query = DataQuery(\n", + " search_query,\n", + " source=[\"description.title\", \"description.authors\"],\n", + " coordinates=data_index.source,\n", + ")\n", "query_results = api.queries.run(query)\n", "\n", "all_results = []\n", "cursor = api.queries.run_paginated_query(query)\n", "for result_page in tqdm(cursor):\n", " # Iterate through the results of a single page, and add to the total list\n", - " for row in result_page.outputs[\"data_outputs\"]: \n", + " for row in result_page.outputs[\"data_outputs\"]:\n", " print()\n", " # Add row to results table\n", - " all_results.append({\n", - " \"Title\": row[\"_source\"][\"description\"][\"title\"],\n", - " \"Authors\": \", \".join([author[\"name\"] for author in row[\"_source\"][\"description\"].get(\"authors\", [])]),\n", - " }) \n", + " all_results.append(\n", + " {\n", + " \"Title\": row[\"_source\"][\"description\"][\"title\"],\n", + " \"Authors\": \", \".join(\n", + " [\n", + " author[\"name\"]\n", + " for author in row[\"_source\"][\"description\"].get(\"authors\", [])\n", + " ]\n", + " ),\n", + " }\n", + " )\n", "\n", "num_results = len(all_results)\n", - "print(f'Finished fetching all data. Total is {num_results} records.')" + "print(f\"Finished fetching all data. 
Total is {num_results} records.\")" ] }, { @@ -388,7 +403,7 @@ " api.data_indices.delete(data_index.source)\n", " print(\"Data index deleted\")\n", " TMP_DIR.cleanup()\n", - " print(\"Temporary directory deleted\")\n" + " print(\"Temporary directory deleted\")" ] } ], diff --git a/examples/data_query_chemistry/chemistry.ipynb b/examples/data_query_chemistry/chemistry.ipynb index 50464a8..395fc18 100644 --- a/examples/data_query_chemistry/chemistry.ipynb +++ b/examples/data_query_chemistry/chemistry.ipynb @@ -64,6 +64,7 @@ "from numerize.numerize import numerize\n", "import mols2grid\n", "from tqdm.notebook import tqdm\n", + "\n", "%matplotlib inline\n", "\n", "# IPython utilities\n", @@ -259,10 +260,14 @@ "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " source=[\"subject\", \"attributes\", \"identifiers\"], # Which fields of documents we want to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queries\n", + " search_query, # The search query to be executed\n", + " source=[\n", + " \"subject\",\n", + " \"attributes\",\n", + " \"identifiers\",\n", + " ], # Which fields of documents we want to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queries\n", ")\n", "\n", "\n", @@ -271,7 +276,9 @@ "count_query.paginated_task.parameters[\"limit\"] = 0\n", "count_results = api.queries.run(count_query)\n", "expected_total = count_results.outputs[\"data_count\"]\n", - "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n", + "expected_pages = (\n", + " expected_total + page_size - 1\n", + ") // page_size # this is simply a ceiling formula\n", "\n", "\n", "# Iterate through all results by fetching `page_size` results at the same time\n", @@ -280,7 +287,7 @@ "for result_page in tqdm(cursor, total=expected_pages):\n", " all_results.extend(result_page.outputs[\"data_outputs\"])\n", "\n", - "print(f'Finished fetching all data. Total is {len(all_results)} records.')" + "print(f\"Finished fetching all data. 
Total is {len(all_results)} records.\")" ] }, { @@ -702,7 +709,7 @@ " for ref in row[\"_source\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"cid\":\n", " result[\"cid\"] = ref[\"value\"]\n", - " \n", + "\n", " for ref in row[\"_source\"][\"subject\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"smiles\":\n", " result[\"SMILES\"] = ref[\"value\"]\n", @@ -714,7 +721,7 @@ " for ref in row[\"_source\"][\"subject\"][\"names\"]:\n", " if ref[\"type\"] == \"chemical_name\":\n", " result[\"chemical_name\"] = ref[\"value\"]\n", - " \n", + "\n", " for attribute in row[\"_source\"][\"attributes\"]:\n", " for predicate in attribute[\"predicates\"]:\n", " value = predicate[\"value\"][\"name\"]\n", @@ -723,9 +730,7 @@ " elif \"numerical_value\" in predicate:\n", " value = predicate[\"numerical_value\"][\"val\"]\n", " result[predicate[\"key\"][\"name\"]] = value\n", - " \n", - " \n", - " \n", + "\n", " results_table.append(result)\n", "\n", "df = pd.DataFrame(results_table)\n", @@ -1480,10 +1485,14 @@ "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " source=[\"subject\", \"attributes\", \"identifiers\"], # Which fields of documents we want to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queries\n", + " search_query, # The search query to be executed\n", + " source=[\n", + " \"subject\",\n", + " \"attributes\",\n", + " \"identifiers\",\n", + " ], # Which fields of documents we want to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queries\n", ")\n", "\n", "\n", @@ -1492,7 +1501,9 @@ "count_query.paginated_task.parameters[\"limit\"] = 0\n", "count_results = api.queries.run(count_query)\n", "expected_total = count_results.outputs[\"data_count\"]\n", - "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n", + "expected_pages = (\n", + " expected_total + page_size - 1\n", + ") // page_size # this is simply a ceiling formula\n", "\n", "\n", "# Iterate through all results by fetching `page_size` results at the same time\n", @@ -1501,7 +1512,7 @@ "for result_page in tqdm(cursor, total=expected_pages):\n", " all_results.extend(result_page.outputs[\"data_outputs\"])\n", "\n", - "print(f'Finished fetching all data. Total is {len(all_results)} records.')" + "print(f\"Finished fetching all data. 
Total is {len(all_results)} records.\")" ] }, { @@ -2286,7 +2297,7 @@ " for ref in row[\"_source\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"cid\":\n", " result[\"cid\"] = ref[\"value\"]\n", - " \n", + "\n", " for ref in row[\"_source\"][\"subject\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"smiles\":\n", " result[\"SMILES\"] = ref[\"value\"]\n", @@ -2298,7 +2309,7 @@ " for ref in row[\"_source\"][\"subject\"][\"names\"]:\n", " if ref[\"type\"] == \"chemical_name\":\n", " result[\"chemical_name\"] = ref[\"value\"]\n", - " \n", + "\n", " for attribute in row[\"_source\"][\"attributes\"]:\n", " for predicate in attribute[\"predicates\"]:\n", " value = predicate[\"value\"][\"name\"]\n", @@ -2307,12 +2318,10 @@ " elif \"numerical_value\" in predicate:\n", " value = predicate[\"numerical_value\"][\"val\"]\n", " result[predicate[\"key\"][\"name\"]] = value\n", - " \n", - " \n", - " \n", + "\n", " results_table.append(result)\n", "\n", - " \n", + "\n", "# Display the results table\n", "df = pd.DataFrame(results_table)\n", "display(df)\n", @@ -2361,17 +2370,21 @@ "source": [ "# Search by name\n", "search_smiles = \"C1=CC=C2C(=C1)C(=CN2)CCO\"\n", - "search_query = f\"subject.identifiers._name:\\\"smiles#{search_smiles.lower()}\\\"\"\n", + "search_query = f'subject.identifiers._name:\"smiles#{search_smiles.lower()}\"'\n", "\n", "data_collection = ElasticDataCollectionSource(elastic_id=\"default\", index_key=\"pubchem\")\n", "page_size = 50\n", "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " source=[\"subject\", \"attributes\", \"identifiers\"], # Which fields of documents we want to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queries\n", + " search_query, # The search query to be executed\n", + " source=[\n", + " \"subject\",\n", + " \"attributes\",\n", + " \"identifiers\",\n", + " ], # Which fields of documents we want to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queries\n", ")\n", "\n", "\n", @@ -2380,7 +2393,9 @@ "count_query.paginated_task.parameters[\"limit\"] = 0\n", "count_results = api.queries.run(count_query)\n", "expected_total = count_results.outputs[\"data_count\"]\n", - "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n", + "expected_pages = (\n", + " expected_total + page_size - 1\n", + ") // page_size # this is simply a ceiling formula\n", "\n", "\n", "# Iterate through all results by fetching `page_size` results at the same time\n", @@ -2389,7 +2404,7 @@ "for result_page in tqdm(cursor, total=expected_pages):\n", " all_results.extend(result_page.outputs[\"data_outputs\"])\n", "\n", - "print(f'Finished fetching all data. Total is {len(all_results)} records.')" + "print(f\"Finished fetching all data. 
Total is {len(all_results)} records.\")" ] }, { @@ -3151,7 +3166,7 @@ " for ref in row[\"_source\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"cid\":\n", " result[\"cid\"] = ref[\"value\"]\n", - " \n", + "\n", " for ref in row[\"_source\"][\"subject\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"smiles\":\n", " result[\"SMILES\"] = ref[\"value\"]\n", @@ -3163,7 +3178,7 @@ " for ref in row[\"_source\"][\"subject\"][\"names\"]:\n", " if ref[\"type\"] == \"chemical_name\":\n", " result[\"chemical_name\"] = ref[\"value\"]\n", - " \n", + "\n", " for attribute in row[\"_source\"][\"attributes\"]:\n", " for predicate in attribute[\"predicates\"]:\n", " value = predicate[\"value\"][\"name\"]\n", @@ -3172,9 +3187,7 @@ " elif \"numerical_value\" in predicate:\n", " value = predicate[\"numerical_value\"][\"val\"]\n", " result[predicate[\"key\"][\"name\"]] = value\n", - " \n", - " \n", - " \n", + "\n", " results_table.append(result)\n", "\n", "# Display the results table\n", diff --git a/examples/data_query_chemistry_patcid/chemistry_patcid.ipynb b/examples/data_query_chemistry_patcid/chemistry_patcid.ipynb index c635c0f..9fe09ea 100644 --- a/examples/data_query_chemistry_patcid/chemistry_patcid.ipynb +++ b/examples/data_query_chemistry_patcid/chemistry_patcid.ipynb @@ -86,14 +86,18 @@ "import deepsearch as ds\n", "from deepsearch.cps.client.api import CpsApi\n", "from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource\n", - "from deepsearch.chemistry.queries.molecules import MoleculeQuery, MoleculesInPatentsQuery, PatentsWithMoleculesQuery\n", + "from deepsearch.chemistry.queries.molecules import (\n", + " MoleculeQuery,\n", + " MoleculesInPatentsQuery,\n", + " PatentsWithMoleculesQuery,\n", + ")\n", "from deepsearch.chemistry.queries.molecules import MolId, MolIdType, MolQueryType\n", "from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource\n", "from deepsearch.documents.core.lookup import EntitiesLookup\n", "from deepsearch.documents.core.render import get_page_svg_with_item\n", "from deepsearch.cps.queries import DataQuery\n", "\n", - "from deepsearch.cps.client.components.queries import RunQueryError\n" + "from deepsearch.cps.client.components.queries import RunQueryError" ] }, { @@ -111,7 +115,7 @@ "metadata": {}, "outputs": [], "source": [ - "api = CpsApi.from_env(profile_name=PROFILE_NAME)\n" + "api = CpsApi.from_env(profile_name=PROFILE_NAME)" ] }, { @@ -180,7 +184,11 @@ "input_smiles = \"C1(C(=C)C([O-])=C1C)=O\"\n", "\n", "display(Markdown(\"### Substructure\"))\n", - "display(Markdown(f\"We will list molecules containing the Squarilium (`{input_smiles}`) substructure\"))\n", + "display(\n", + " Markdown(\n", + " f\"We will list molecules containing the Squarilium (`{input_smiles}`) substructure\"\n", + " )\n", + ")\n", "\n", "smiles_mol = Chem.MolFromSmiles(input_smiles)\n", "display(smiles_mol)" @@ -2277,7 +2285,7 @@ "\n", "df = pd.DataFrame(results_table)\n", "display(df)\n", - "mols2grid.display(df, smiles_col=\"SMILES\")\n" + "mols2grid.display(df, smiles_col=\"SMILES\")" ] }, { @@ -6893,9 +6901,11 @@ ], "source": [ "# Load the full document\n", - "patent_smiles_coords = ElasticDataCollectionSource(elastic_id=\"default\", index_key=\"patent-smiles\")\n", + "patent_smiles_coords = ElasticDataCollectionSource(\n", + " elastic_id=\"default\", index_key=\"patent-smiles\"\n", + ")\n", "query = DataQuery(\n", - " f\"identifiers.value:\\\"{patent_id}\\\"\",\n", + " f'identifiers.value:\"{patent_id}\"',\n", " 
coordinates=patent_smiles_coords,\n", ")\n", "\n", @@ -6926,8 +6936,10 @@ " }\n", " )\n", "molecules_location_table.sort(key=lambda row: (row[\"Patent\"], row[\"Page\"]))\n", - "df_molecules_location = pd.DataFrame(molecules_location_table, columns=[\"SMILES\", \"Patent\", \"Page\", \"Url\"])\n", - "display(HTML(df_molecules_location.to_html(escape=False)))\n" + "df_molecules_location = pd.DataFrame(\n", + " molecules_location_table, columns=[\"SMILES\", \"Patent\", \"Page\", \"Url\"]\n", + ")\n", + "display(HTML(df_molecules_location.to_html(escape=False)))" ] }, { @@ -7481,8 +7493,9 @@ "source": [ "input_smiles = \"CN1C(=CC2=C([O-])C(=Cc3[se]c4ccccc4[n+]3C)C(=O)C2=O)[Se]c2ccccc21\"\n", "\n", - "display(Markdown(\n", - " f\"\"\"\n", + "display(\n", + " Markdown(\n", + " f\"\"\"\n", "For example, looking for a SMILES is done with\n", "```python\n", "query = PatentsWithMoleculesQuery(\n", @@ -7492,12 +7505,17 @@ " num_items=20,\n", ")\n", "```\n", - "\"\"\"))\n", - "\n", - "display(Markdown(\n", "\"\"\"\n", + " )\n", + ")\n", + "\n", + "display(\n", + " Markdown(\n", + " \"\"\"\n", "This will look for patents containing\n", - "\"\"\"))\n", + "\"\"\"\n", + " )\n", + ")\n", "\n", "smiles_mol = Chem.MolFromSmiles(input_smiles)\n", "display(smiles_mol)" @@ -7521,7 +7539,12 @@ "source": [ "# Search by SMILES\n", "query = PatentsWithMoleculesQuery(\n", - " molecules=[MolId(type=MolIdType.SMILES, value=\"CN1C(=CC2=C([O-])C(=Cc3[se]c4ccccc4[n+]3C)C(=O)C2=O)[Se]c2ccccc21\")],\n", + " molecules=[\n", + " MolId(\n", + " type=MolIdType.SMILES,\n", + " value=\"CN1C(=CC2=C([O-])C(=Cc3[se]c4ccccc4[n+]3C)C(=O)C2=O)[Se]c2ccccc21\",\n", + " )\n", + " ],\n", " num_items=20,\n", ")\n", "\n", diff --git a/examples/data_query_quick_start/quick_start.ipynb b/examples/data_query_quick_start/quick_start.ipynb index 6f3e334..ffa1c71 100644 --- a/examples/data_query_quick_start/quick_start.ipynb +++ b/examples/data_query_quick_start/quick_start.ipynb @@ -39,7 +39,7 @@ "# notebook settings auto-loaded from .env / env vars\n", "notebook_settings = NotebookSettings()\n", "\n", - "PROFILE_NAME = notebook_settings.profile # the profile to use\n" + "PROFILE_NAME = notebook_settings.profile # the profile to use" ] }, { @@ -63,6 +63,7 @@ "from numerize.numerize import numerize\n", "from tqdm.notebook import tqdm\n", "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "\n", "# IPython utilities\n", @@ -71,7 +72,7 @@ "# Import the deepsearch-toolkit\n", "import deepsearch as ds\n", "from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource\n", - "from deepsearch.cps.queries import DataQuery\n" + "from deepsearch.cps.queries import DataQuery" ] }, { @@ -89,7 +90,7 @@ "metadata": {}, "outputs": [], "source": [ - "api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)\n" + "api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)" ] }, { @@ -125,7 +126,7 @@ "source": [ "# Fetch list of all data collections\n", "collections = api.elastic.list()\n", - "collections.sort(key=lambda c: c.name.lower())\n" + "collections.sort(key=lambda c: c.name.lower())" ] }, { @@ -351,7 +352,7 @@ " }\n", " for c in collections\n", "]\n", - "display(pd.DataFrame(results))\n" + "display(pd.DataFrame(results))" ] }, { @@ -512,7 +513,7 @@ ], "source": [ "# Input query\n", - "search_query = \"main-text.text:((\\\"power conversion efficiency\\\" OR PCE) AND organ*)\"\n", + "search_query = 'main-text.text:((\"power conversion efficiency\" OR PCE) AND organ*)'\n", "\n", "# Iterate through the data collections\n", "results = 
[]\n", @@ -526,14 +527,11 @@ " # Execute the query\n", " query = DataQuery(search_query, source=[], limit=0, coordinates=c.source)\n", " query_results = api.queries.run(query)\n", - " results.append({\n", - " \"name\": c.name,\n", - " \"matches\": query_results.outputs[\"data_count\"]\n", - " })\n", + " results.append({\"name\": c.name, \"matches\": query_results.outputs[\"data_count\"]})\n", "\n", "# Sort and display results\n", "results.sort(reverse=True, key=lambda r: r[\"matches\"])\n", - "display(pd.DataFrame(results))\n" + "display(pd.DataFrame(results))" ] }, { @@ -569,7 +567,7 @@ "x = [r[\"name\"] for r in results]\n", "y = [r[\"matches\"] for r in results]\n", "plt.pie(y, labels=x, labeldistance=None)\n", - "plt.legend(loc=\"upper center\", ncols=3, bbox_to_anchor=(0.5, 0))\n" + "plt.legend(loc=\"upper center\", ncols=3, bbox_to_anchor=(0.5, 0))" ] }, { @@ -621,16 +619,22 @@ ], "source": [ "# Input query\n", - "search_query = \"main-text.text:((\\\"power conversion efficiency\\\" OR PCE) AND organ*)\"\n", - "data_collection = ElasticDataCollectionSource(elastic_id=\"default\", index_key=\"arxiv-abstract\")\n", + "search_query = 'main-text.text:((\"power conversion efficiency\" OR PCE) AND organ*)'\n", + "data_collection = ElasticDataCollectionSource(\n", + " elastic_id=\"default\", index_key=\"arxiv-abstract\"\n", + ")\n", "page_size = 50\n", "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " source=[\"description.title\", \"description.authors\", \"identifiers\"], # Which fields of documents we want to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queries\n", + " search_query, # The search query to be executed\n", + " source=[\n", + " \"description.title\",\n", + " \"description.authors\",\n", + " \"identifiers\",\n", + " ], # Which fields of documents we want to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queries\n", ")\n", "\n", "\n", @@ -639,7 +643,9 @@ "count_query.paginated_task.parameters[\"limit\"] = 0\n", "count_results = api.queries.run(count_query)\n", "expected_total = count_results.outputs[\"data_count\"]\n", - "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n", + "expected_pages = (\n", + " expected_total + page_size - 1\n", + ") // page_size # this is simply a ceiling formula\n", "\n", "\n", "# Iterate through all results by fetching `page_size` results at the same time\n", @@ -660,16 +666,23 @@ " links[\"doi\"] = f'https://doi.org/{ids[\"value\"]}'\n", "\n", " # Add row to results table\n", - " all_results.append({\n", - " \"Title\": row[\"_source\"][\"description\"][\"title\"],\n", - " \"Authors\": \", \".join([author[\"name\"] for author in row[\"_source\"][\"description\"][\"authors\"]]),\n", - " \"arXiv\": identifiers[\"arxiv\"],\n", - " \"arXiv URL\": links[\"arxiv\"],\n", - " \"DOI\": identifiers[\"doi\"],\n", - " \"DOI URL\": links[\"doi\"],\n", - " })\n", - "\n", - "print(f'Finished fetching all data. 
Total is {len(all_results)} records.')\n" + " all_results.append(\n", + " {\n", + " \"Title\": row[\"_source\"][\"description\"][\"title\"],\n", + " \"Authors\": \", \".join(\n", + " [\n", + " author[\"name\"]\n", + " for author in row[\"_source\"][\"description\"][\"authors\"]\n", + " ]\n", + " ),\n", + " \"arXiv\": identifiers[\"arxiv\"],\n", + " \"arXiv URL\": links[\"arxiv\"],\n", + " \"DOI\": identifiers[\"doi\"],\n", + " \"DOI URL\": links[\"doi\"],\n", + " }\n", + " )\n", + "\n", + "print(f\"Finished fetching all data. Total is {len(all_results)} records.\")" ] }, { @@ -755,7 +768,7 @@ "source": [ "# Visualize the table with all results\n", "df = pd.json_normalize(all_results)\n", - "display(HTML(df.head().to_html(render_links=True)))\n" + "display(HTML(df.head().to_html(render_links=True)))" ] }, { @@ -766,7 +779,7 @@ "outputs": [], "source": [ "# Save the results to an Excel table\n", - "df.to_excel(\"quick_start_results.xlsx\")\n" + "df.to_excel(\"quick_start_results.xlsx\")" ] } ], diff --git a/examples/data_query_snippets/snippets.ipynb b/examples/data_query_snippets/snippets.ipynb index d4cb8a0..a1c9710 100644 --- a/examples/data_query_snippets/snippets.ipynb +++ b/examples/data_query_snippets/snippets.ipynb @@ -228,17 +228,21 @@ ], "source": [ "# Run a proximity query using some keywords with maximum edit distance 5\n", - "search_query = \"\\\"climate change mitigation\\\"~5\"\n", + "search_query = '\"climate change mitigation\"~5'\n", "\n", "data_collection = ElasticDataCollectionSource(elastic_id=\"default\", index_key=\"ipcc\")\n", "page_size = 50\n", "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " source=[\"description.title\", \"description.publication_date\", \"file-info.#-pages\"], # Fields to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queried\n", + " search_query, # The search query to be executed\n", + " source=[\n", + " \"description.title\",\n", + " \"description.publication_date\",\n", + " \"file-info.#-pages\",\n", + " ], # Fields to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queried\n", ")\n", "\n", "# Iterate through all results by fetching `page_size` results at the same time\n", @@ -351,12 +355,21 @@ "source": [ "# Visualize summary table\n", "df = pd.json_normalize(all_results).loc[\n", - " :,[\"_source.description.title\", \"_source.description.publication_date\", \"_source.file-info.#-pages\"]]\n", + " :,\n", + " [\n", + " \"_source.description.title\",\n", + " \"_source.description.publication_date\",\n", + " \"_source.file-info.#-pages\",\n", + " ],\n", + "]\n", "df.columns = [\"Title\", \"Publication Date\", \"Number of Pages\"]\n", "df[\"Publication Year\"] = df[\"Publication Date\"].str[:4]\n", "\n", - "df.loc[:,[\"Title\", \"Publication Year\", \"Number of Pages\"]].head(10).style.set_properties(\n", - " subset=[\"Title\"], **{\"text-align\": \"left\"}).set_table_styles([dict(selector=\"th\", props=[(\"text-align\", \"left\")])])" + "df.loc[:, [\"Title\", \"Publication Year\", \"Number of Pages\"]].head(\n", + " 10\n", + ").style.set_properties(subset=[\"Title\"], **{\"text-align\": \"left\"}).set_table_styles(\n", + " [dict(selector=\"th\", props=[(\"text-align\", \"left\")])]\n", + ")" ] }, { @@ -384,16 +397,16 @@ ], "source": [ "# Prepare the data query\n", - "search_query = \"\\\"climate change carbon 
sequestration\\\"~10\"\n", + "search_query = '\"climate change carbon sequestration\"~10'\n", "\n", "highlight = {\"fields\": {\"*\": {}}}\n", "\n", "query = DataQuery(\n", " search_query,\n", - " source=[\"file-info.filename\"], # Fetch only the report file name\n", - " limit=page_size, # Fetch maximum `page_size` reports\n", + " source=[\"file-info.filename\"], # Fetch only the report file name\n", + " limit=page_size, # Fetch maximum `page_size` reports\n", " highlight=highlight,\n", - " coordinates=data_collection\n", + " coordinates=data_collection,\n", ")\n", "\n", "all_results = []\n", @@ -505,21 +518,24 @@ "# Format and visualize the first 10 snippets\n", "def format_highlight_results(ds_results):\n", " results_table = []\n", - " \n", + "\n", " for row in ds_results:\n", " for field in row.get(\"highlight\", {}).keys():\n", " for snippet in row[\"highlight\"][field]:\n", " result = {\n", " \"Report\": row[\"_source\"][\"file-info\"][\"filename\"],\n", " \"Field\": field,\n", - " \"Snippet\": snippet\n", + " \"Snippet\": snippet,\n", " }\n", " results_table.append(result)\n", - " \n", + "\n", " return pd.DataFrame(results_table)\n", "\n", + "\n", "df = format_highlight_results(all_results)\n", - "df_style = df.head(10).style.set_table_styles([dict(selector=\"th\", props=[(\"text-align\", \"left\")])])\n", + "df_style = df.head(10).style.set_table_styles(\n", + " [dict(selector=\"th\", props=[(\"text-align\", \"left\")])]\n", + ")\n", "df_style.set_properties(**{\"text-align\": \"left\"})" ] }, @@ -642,7 +658,12 @@ "source": [ "highlight[\"fields\"] = {\"main-text.text\": {}}\n", "query = DataQuery(\n", - " search_query, source=[\"file-info.filename\"], limit=page_size, highlight=highlight, coordinates=data_collection)\n", + " search_query,\n", + " source=[\"file-info.filename\"],\n", + " limit=page_size,\n", + " highlight=highlight,\n", + " coordinates=data_collection,\n", + ")\n", "\n", "all_results = api.queries.run(query).outputs[\"data_outputs\"]\n", "format_highlight_results(all_results).head(10).style.use(df_style.export())" @@ -759,7 +780,12 @@ "highlight[\"fragment_size\"] = 0\n", "\n", "query = DataQuery(\n", - " search_query, source=[\"file-info.filename\"], limit=page_size, highlight=highlight, coordinates=data_collection)\n", + " search_query,\n", + " source=[\"file-info.filename\"],\n", + " limit=page_size,\n", + " highlight=highlight,\n", + " coordinates=data_collection,\n", + ")\n", "\n", "all_results = api.queries.run(query).outputs[\"data_outputs\"]\n", "format_highlight_results(all_results).head(10).style.use(df_style.export())" @@ -877,7 +903,12 @@ "highlight[\"post_tags\"] = [\"\"]\n", "\n", "query = DataQuery(\n", - " search_query, source=[\"file-info.filename\"], limit=page_size, highlight=highlight, coordinates=data_collection)\n", + " search_query,\n", + " source=[\"file-info.filename\"],\n", + " limit=page_size,\n", + " highlight=highlight,\n", + " coordinates=data_collection,\n", + ")\n", "\n", "all_results = api.queries.run(query).outputs[\"data_outputs\"]\n", "format_highlight_results(all_results).head(10).style.use(df_style.export())" @@ -995,22 +1026,25 @@ } ], "source": [ - "search_query = \"\\\"climate change\\\" AND mitigation AND (city cities urban)\"\n", + "search_query = '\"climate change\" AND mitigation AND (city cities urban)'\n", "\n", "highlight = {\n", " \"order\": \"score\",\n", " \"fragment_size\": 150,\n", " \"fields\": {\n", " \"description.title\": {\"number_of_fragments\": 0},\n", - " \"main-text.text\": {\"number_of_fragments\": 
7}\n", - " }\n", + " \"main-text.text\": {\"number_of_fragments\": 7},\n", + " },\n", "}\n", "\n", "query = DataQuery(\n", " search_query,\n", " source=[\"file-info.filename\"],\n", " sort=[{\"_score\": \"desc\", \"file-info.document-hash\": \"asc\"}],\n", - " limit=page_size, highlight=highlight, coordinates=data_collection)\n", + " limit=page_size,\n", + " highlight=highlight,\n", + " coordinates=data_collection,\n", + ")\n", "\n", "all_results = api.queries.run(query).outputs[\"data_outputs\"]\n", "format_highlight_results(all_results).head(10).style.use(df_style.export())" @@ -1062,20 +1096,14 @@ } ], "source": [ - "aggs = {\n", - " \"language_count\": {\n", - " \"cardinality\": {\n", - " \"field\": \"description.languages\"\n", - " }\n", - " }\n", - "}\n", + "aggs = {\"language_count\": {\"cardinality\": {\"field\": \"description.languages\"}}}\n", "\n", "query = DataQuery(\n", - " search_query = \"*:*\", # Match-all query\n", + " search_query=\"*:*\", # Match-all query\n", " source=[], # No document data will be returned\n", " limit=0,\n", " aggregations=aggs,\n", - " coordinates=data_collection\n", + " coordinates=data_collection,\n", ")\n", "\n", "summary = api.queries.run(query).outputs[\"data_aggs\"]\n", @@ -1126,16 +1154,31 @@ " \"field\": \"description.languages\",\n", " \"order\": {\"_key\": \"asc\"},\n", " \"size\": 50,\n", - " \"exclude\": \"en\"\n", + " \"exclude\": \"en\",\n", " }\n", " }\n", "}\n", "\n", - "query = DataQuery(search_query = \"*:*\", source=[], limit=0, aggregations=aggs, coordinates=data_collection)\n", + "query = DataQuery(\n", + " search_query=\"*:*\",\n", + " source=[],\n", + " limit=0,\n", + " aggregations=aggs,\n", + " coordinates=data_collection,\n", + ")\n", "summary = api.queries.run(query).outputs[\"data_aggs\"]\n", "\n", "df = pd.json_normalize(summary[\"languages\"][\"buckets\"])\n", - "df.plot.bar(y=\"doc_count\", x=\"key\", figsize=(15, 5), xlabel=\"Language\", ylabel=\"Number of reports\", rot=0, legend=False, title=\"Number of non-English reports by language\")" + "df.plot.bar(\n", + " y=\"doc_count\",\n", + " x=\"key\",\n", + " figsize=(15, 5),\n", + " xlabel=\"Language\",\n", + " ylabel=\"Number of reports\",\n", + " rot=0,\n", + " legend=False,\n", + " title=\"Number of non-English reports by language\",\n", + ")" ] }, { @@ -1178,18 +1221,32 @@ " \"by_year\": {\n", " \"date_histogram\": {\n", " \"field\": \"description.publication_date\",\n", - " \"calendar_interval\": \"year\",\n", - " \"format\": \"yyyy\",\n", - " \"min_doc_count\": 0\n", + " \"calendar_interval\": \"year\",\n", + " \"format\": \"yyyy\",\n", + " \"min_doc_count\": 0,\n", " }\n", " }\n", "}\n", "\n", - "query = DataQuery(search_query = search_query, source=[], limit=0, aggregations=aggs, coordinates=data_collection)\n", + "query = DataQuery(\n", + " search_query=search_query,\n", + " source=[],\n", + " limit=0,\n", + " aggregations=aggs,\n", + " coordinates=data_collection,\n", + ")\n", "summary = api.queries.run(query).outputs[\"data_aggs\"]\n", "\n", "df = pd.json_normalize(summary[\"by_year\"][\"buckets\"])\n", - "df.plot.bar(y=\"doc_count\", x=\"key_as_string\", figsize=(15, 5), xlabel=\"Publication year\", ylabel=\"Number of reports\", legend=False, title=\"Number of IPCC reports by year\")\n" + "df.plot.bar(\n", + " y=\"doc_count\",\n", + " x=\"key_as_string\",\n", + " figsize=(15, 5),\n", + " xlabel=\"Publication year\",\n", + " ylabel=\"Number of reports\",\n", + " legend=False,\n", + " title=\"Number of IPCC reports by year\",\n", + ")" ] }, { @@ -1216,23 
+1273,26 @@ } ], "source": [ - "search_query = \"tables.data.text:\\\"net-zero emissions\\\"\"\n", + "search_query = 'tables.data.text:\"net-zero emissions\"'\n", "\n", "aggs = {\n", " \"num_tables\": {\n", - " \"sum\": {\n", - " \"script\": {\n", - " \"source\": \"doc['tables.#-cols'].length\",\n", - " \"lang\": \"painless\"\n", - " }\n", - " }\n", - " } \n", + " \"sum\": {\"script\": {\"source\": \"doc['tables.#-cols'].length\", \"lang\": \"painless\"}}\n", + " }\n", "}\n", "\n", - "query = DataQuery(search_query = search_query, source=[], limit=0, aggregations=aggs, coordinates=data_collection)\n", + "query = DataQuery(\n", + " search_query=search_query,\n", + " source=[],\n", + " limit=0,\n", + " aggregations=aggs,\n", + " coordinates=data_collection,\n", + ")\n", "summary = api.queries.run(query).outputs[\"data_aggs\"]\n", "\n", - "print(f\"We found {int(summary['num_tables']['value'])} tables containing 'net-zero emissions'.\")" + "print(\n", + " f\"We found {int(summary['num_tables']['value'])} tables containing 'net-zero emissions'.\"\n", + ")" ] } ], diff --git a/examples/document_bulk_upload/run_batch_upload.py b/examples/document_bulk_upload/run_batch_upload.py index 046d3e5..b1e4868 100644 --- a/examples/document_bulk_upload/run_batch_upload.py +++ b/examples/document_bulk_upload/run_batch_upload.py @@ -30,10 +30,10 @@ import argparse import asyncio -import sys import logging import os.path import signal +import sys import uuid from copy import deepcopy from enum import Enum @@ -70,7 +70,12 @@ def __str__(self): async def upload_for_key_prefix( - api, coords, s3_credentials, key_prefix, raw_pages: bool, semaphore: asyncio.Semaphore + api, + coords, + s3_credentials, + key_prefix, + raw_pages: bool, + semaphore: asyncio.Semaphore, ): async with semaphore: # This will limit the number of concurrent uploads task_id = None @@ -80,7 +85,7 @@ async def upload_for_key_prefix( payload = { "s3_source": {"coordinates": cos_coordinates_sub.dict()}, - "target_settings": {"add_raw_pages": raw_pages} + "target_settings": {"add_raw_pages": raw_pages}, } task_id = api.data_indices.upload_file( coords=coords, @@ -93,7 +98,9 @@ async def upload_for_key_prefix( request_status = await wait_for_task(api, coords, task_id) - logging.info(f"Report for {key_prefix} with task_id {task_id}: {request_status}") + logging.info( + f"Report for {key_prefix} with task_id {task_id}: {request_status}" + ) return [key_prefix], request_status except Exception as e: logging.error( @@ -102,13 +109,15 @@ async def upload_for_key_prefix( return [key_prefix], None -async def upload_for_urls(api, coords, url_batch, raw_pages: bool, semaphore: asyncio.Semaphore): +async def upload_for_urls( + api, coords, url_batch, raw_pages: bool, semaphore: asyncio.Semaphore +): async with semaphore: # This will limit the number of concurrent uploads task_id = None try: payload = { "file_url": url_batch, - "target_settings": {"add_raw_pages": raw_pages} + "target_settings": {"add_raw_pages": raw_pages}, } task_id = api.data_indices.upload_file(coords=coords, body=payload) @@ -193,7 +202,7 @@ async def main(): "-w", action=argparse.BooleanOptionalAction, default=False, - required=False + required=False, ) # Parse the command-line arguments @@ -219,7 +228,7 @@ async def main(): raise argparse.ArgumentTypeError( "you must provide s3-credentials with input-type S3." 
) - + save_file = args.resume_point if args.resume_point else args.input_file with open(save_file) as f: logging.info(f"Reading elements from {save_file}") @@ -233,7 +242,9 @@ async def main(): pending_items = elements save_elements(RESUME_FILENAME, pending_items) - logging.info(f"To resume this job later, provide --resume-point {RESUME_FILENAME} to the command line.") + logging.info( + f"To resume this job later, provide --resume-point {RESUME_FILENAME} to the command line." + ) semaphore = asyncio.Semaphore(args.concurrency) signal.signal(signal.SIGTERM, handle_exit_signal) @@ -249,7 +260,9 @@ async def main(): if args.input_type == InputSource.S3: tasks = [ loop.create_task( - upload_for_key_prefix(api, coords, s3_cred, prefix, args.raw_pages, semaphore) + upload_for_key_prefix( + api, coords, s3_cred, prefix, args.raw_pages, semaphore + ) ) for prefix in pending_items ] diff --git a/examples/document_conversion_custom_settings/convert_documents_custom.ipynb b/examples/document_conversion_custom_settings/convert_documents_custom.ipynb index 0d6aeda..387b556 100644 --- a/examples/document_conversion_custom_settings/convert_documents_custom.ipynb +++ b/examples/document_conversion_custom_settings/convert_documents_custom.ipynb @@ -41,7 +41,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use" + "PROJ_KEY = notebook_settings.proj_key # the project to use" ] }, { @@ -65,8 +65,14 @@ "outputs": [], "source": [ "import deepsearch as ds\n", - "from deepsearch.documents.core.models import ConversionSettings, DefaultConversionModel, ProjectConversionModel, \\\n", - " OCRSettings, AlpineOcrEngine, AlpineOcrLanguage" + "from deepsearch.documents.core.models import (\n", + " ConversionSettings,\n", + " DefaultConversionModel,\n", + " ProjectConversionModel,\n", + " OCRSettings,\n", + " AlpineOcrEngine,\n", + " AlpineOcrLanguage,\n", + ")" ] }, { @@ -142,11 +148,11 @@ " proj_key=PROJ_KEY,\n", " source_path=\"../../data/samples/2206.01062.pdf\",\n", " conversion_settings=cs,\n", - " progress_bar=True\n", - ") \n", + " progress_bar=True,\n", + ")\n", "documents.download_all(result_dir=\"./converted_docs\")\n", "info = documents.generate_report(result_dir=\"./converted_docs\")\n", - "print(info) " + "print(info)" ] } ], diff --git a/examples/document_conversion_extract_tables/extract_tables.ipynb b/examples/document_conversion_extract_tables/extract_tables.ipynb index 2be4304..08bd481 100644 --- a/examples/document_conversion_extract_tables/extract_tables.ipynb +++ b/examples/document_conversion_extract_tables/extract_tables.ipynb @@ -35,7 +35,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "INPUT_FILE = Path(\"../../data/samples/2206.00785.pdf\")" ] }, @@ -76,17 +76,16 @@ "outputs": [], "source": [ "def get_tablecell_span(cell, ix):\n", - " span = set([s[ix] for s in cell['spans']])\n", + " span = set([s[ix] for s in cell[\"spans\"]])\n", " if len(span) == 0:\n", " return 1, None, None\n", " return len(span), min(span), max(span)\n", "\n", "\n", - "\n", "def write_table(item):\n", " \"\"\"\n", " Convert the JSON table representation to HTML, including column and row spans.\n", - " \n", + "\n", " Parameters\n", " ----------\n", " item :\n", @@ -96,44 +95,45 
@@ " ncols : int, Default=3\n", " Number of columns in the display table.\n", " \"\"\"\n", - " \n", + "\n", " table = item\n", " body = \"\"\n", "\n", - " nrows = table['#-rows']\n", - " ncols = table['#-cols']\n", + " nrows = table[\"#-rows\"]\n", + " ncols = table[\"#-cols\"]\n", "\n", " body += \"\\n\"\n", " for i in range(nrows):\n", " body += \" \\n\"\n", " for j in range(ncols):\n", - " cell = table['data'][i][j]\n", + " cell = table[\"data\"][i][j]\n", "\n", - " rowspan,rowstart,rowend = get_tablecell_span(cell, 0)\n", - " colspan,colstart,colend = get_tablecell_span(cell, 1)\n", + " rowspan, rowstart, rowend = get_tablecell_span(cell, 0)\n", + " colspan, colstart, colend = get_tablecell_span(cell, 1)\n", "\n", - " if rowstart is not None and rowstart != i: continue\n", - " if colstart is not None and colstart != j: continue\n", + " if rowstart is not None and rowstart != i:\n", + " continue\n", + " if colstart is not None and colstart != j:\n", + " continue\n", "\n", " if rowstart is None:\n", " rowstart = i\n", " if colstart is None:\n", " colstart = j\n", "\n", - " content = cell['text']\n", - " if content == '':\n", - " content = ' '\n", + " content = cell[\"text\"]\n", + " if content == \"\":\n", + " content = \" \"\n", "\n", - " label = cell['type']\n", - " label_class = 'body'\n", - " if label in ['row_header', 'row_multi_header', 'row_title']:\n", - " label_class = 'header'\n", - " elif label in ['col_header', 'col_multi_header']:\n", - " label_class = 'header'\n", - " \n", - " \n", - " celltag = 'th' if label_class == 'header' else 'td'\n", - " style = 'style=\"text-align: center;\"' if label_class == 'header' else ''\n", + " label = cell[\"type\"]\n", + " label_class = \"body\"\n", + " if label in [\"row_header\", \"row_multi_header\", \"row_title\"]:\n", + " label_class = \"header\"\n", + " elif label in [\"col_header\", \"col_multi_header\"]:\n", + " label_class = \"header\"\n", + "\n", + " celltag = \"th\" if label_class == \"header\" else \"td\"\n", + " style = 'style=\"text-align: center;\"' if label_class == \"header\" else \"\"\n", "\n", " body += f' <{celltag} rowstart=\"{rowstart}\" colstart=\"{colstart}\" rowspan=\"{rowspan}\" colspan=\"{colspan}\" {style}>{content}\\n'\n", "\n", @@ -156,14 +156,13 @@ "def visualize_document_tables(doc_jsondata):\n", " \"\"\"\n", " Visualize the tables idenfitied in the converted document.\n", - " \n", + "\n", " Parameters\n", " ----------\n", " doc_jsondata :\n", " Converted document\n", " \"\"\"\n", "\n", - " \n", " page_counters = {}\n", " # Iterate through all the tables identified in the converted document\n", " for table in doc_jsondata.get(\"tables\", []):\n", @@ -171,10 +170,10 @@ " page = prov[\"page\"]\n", " page_counters.setdefault(page, 0)\n", " page_counters[page] += 1\n", - " \n", + "\n", " output_html = write_table(table)\n", " display(Markdown(f\"## Table {page_counters[page]} on page {page}\"))\n", - " display(HTML(output_html)) \n" + " display(HTML(output_html))" ] }, { @@ -326,7 +325,9 @@ } ], "source": [ - "output_dir = tempfile.mkdtemp() # TODO: switch to tempfile.TemporaryDirectory() and use `with`\n", + "output_dir = (\n", + " tempfile.mkdtemp()\n", + ") # TODO: switch to tempfile.TemporaryDirectory() and use `with`\n", "\n", "documents.download_all(result_dir=output_dir, progress_bar=True)\n", "\n", @@ -336,8 +337,8 @@ " for name in all_files:\n", " if not name.endswith(\".json\"):\n", " continue\n", - " \n", - " basename = name.rstrip('.json')\n", + "\n", + " basename = name.rstrip(\".json\")\n", " 
doc_jsondata = json.loads(archive.read(f\"{basename}.json\"))\n", "\n", " visualize_document_tables(doc_jsondata)" diff --git a/examples/document_conversion_quick_start/convert_documents.ipynb b/examples/document_conversion_quick_start/convert_documents.ipynb index ae4447a..79810c7 100644 --- a/examples/document_conversion_quick_start/convert_documents.ipynb +++ b/examples/document_conversion_quick_start/convert_documents.ipynb @@ -41,7 +41,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456" ] @@ -112,7 +112,7 @@ " api=api,\n", " proj_key=PROJ_KEY,\n", " source_path=\"../../data/samples/2206.01062.pdf\",\n", - " progress_bar=True\n", + " progress_bar=True,\n", ")\n", "documents.download_all(result_dir=output_dir)\n", "info = documents.generate_report(result_dir=output_dir)\n", @@ -134,7 +134,7 @@ " if not name.endswith(\".json\"):\n", " continue\n", "\n", - " basename = name.rstrip('.json')\n", + " basename = name.rstrip(\".json\")\n", " doc_json = json.loads(archive.read(f\"{basename}.json\"))\n", " doc_md = export_to_markdown(doc_json)\n", "\n", @@ -217,10 +217,12 @@ }, "outputs": [], "source": [ - "documents = ds.convert_documents(api=api,\n", - " proj_key=PROJ_KEY,\n", - " urls=\"https://arxiv.org/pdf/2206.00785.pdf\",\n", - " progress_bar=True)" + "documents = ds.convert_documents(\n", + " api=api,\n", + " proj_key=PROJ_KEY,\n", + " urls=\"https://arxiv.org/pdf/2206.00785.pdf\",\n", + " progress_bar=True,\n", + ")" ] }, { @@ -237,7 +239,7 @@ "source": [ "# let's check what happened.\n", "# we generate a csv report about the conversion task and store it locally\n", - "result_dir = './converted_docs/'\n", + "result_dir = \"./converted_docs/\"\n", "info = documents.generate_report(result_dir=result_dir)\n", "print(info)" ] @@ -320,10 +322,7 @@ "source": [ "# Process multiple urls\n", "documents = ds.convert_documents(\n", - " api=api,\n", - " proj_key=PROJ_KEY,\n", - " urls= urls,\n", - " progress_bar=True\n", + " api=api, proj_key=PROJ_KEY, urls=urls, progress_bar=True\n", ")" ] }, @@ -368,7 +367,7 @@ " api=api,\n", " proj_key=PROJ_KEY,\n", " source_path=\"../../data/samples/2206.01062.pdf\",\n", - " progress_bar=True\n", + " progress_bar=True,\n", ")" ] }, @@ -393,10 +392,7 @@ "outputs": [], "source": [ "documents = ds.convert_documents(\n", - " api=api,\n", - " proj_key=PROJ_KEY,\n", - " source_path=\"../../data/samples\",\n", - " progress_bar=True\n", + " api=api, proj_key=PROJ_KEY, source_path=\"../../data/samples\", progress_bar=True\n", ")" ] }, @@ -429,7 +425,7 @@ "outputs": [], "source": [ "# let's download all the converted documents:\n", - "documents.download_all(result_dir=result_dir,progress_bar=True)" + "documents.download_all(result_dir=result_dir, progress_bar=True)" ] }, { diff --git a/examples/document_conversion_visualize_bbox/visualize_bbox.ipynb b/examples/document_conversion_visualize_bbox/visualize_bbox.ipynb index 7d68d3c..08d6b2e 100644 --- a/examples/document_conversion_visualize_bbox/visualize_bbox.ipynb +++ b/examples/document_conversion_visualize_bbox/visualize_bbox.ipynb @@ -58,7 +58,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + 
"PROJ_KEY = notebook_settings.proj_key # the project to use\n", "# INPUT_FILE = Path(\"../../data/samples/2206.01062.pdf\")\n", "INPUT_FILE = Path(\"../../data/samples/2206.00785.pdf\")\n", "\n", @@ -122,7 +122,7 @@ " if isinstance(item, typing.Mapping) and not k in item:\n", " print(f\"k={k} not found\")\n", " return {}\n", - " \n", + "\n", " if isinstance(item, typing.List):\n", " try:\n", " k = int(k)\n", @@ -147,7 +147,7 @@ "\n", " Parameters\n", " ----------\n", - " doc_jsondata : \n", + " doc_jsondata :\n", " Converted document\n", "\n", " Returns\n", @@ -166,11 +166,13 @@ " continue\n", " page = item[\"prov\"][0][\"page\"]\n", " item_type = item[\"type\"]\n", - " clusters.setdefault(page, []).append({\n", - " \"page\": page,\n", - " \"type\": item_type,\n", - " \"bbox\": item[\"prov\"][0][\"bbox\"],\n", - " })\n", + " clusters.setdefault(page, []).append(\n", + " {\n", + " \"page\": page,\n", + " \"type\": item_type,\n", + " \"bbox\": item[\"prov\"][0][\"bbox\"],\n", + " }\n", + " )\n", " return clusters" ] }, @@ -188,7 +190,7 @@ "\n", " Parameters\n", " ----------\n", - " doc_cellsdata : \n", + " doc_cellsdata :\n", " Cells document provided by the Deep Search conversion\n", "\n", " Returns\n", @@ -201,13 +203,15 @@ "\n", " cells = {}\n", " for item in doc_cellsdata[\"cells\"][\"data\"]:\n", - " page = item[0]+1\n", + " page = item[0] + 1\n", " item_type = item[5]\n", - " cells.setdefault(page, []).append({\n", - " \"page\": page,\n", - " \"type\": item_type,\n", - " \"bbox\": item[1:5],\n", - " })\n", + " cells.setdefault(page, []).append(\n", + " {\n", + " \"page\": page,\n", + " \"type\": item_type,\n", + " \"bbox\": item[1:5],\n", + " }\n", + " )\n", "\n", " return cells" ] @@ -219,31 +223,32 @@ "metadata": {}, "outputs": [], "source": [ - "# The \n", + "# The\n", "labels_colors_clusters = {\n", - " \"table\": ((240, 128, 128, 100), (255,0,0)),\n", - " \"caption\": ((243, 156, 18, 100), (255,0,0)),\n", - " \"citation\": ((14, 210, 234, 100), (255,0,0)),\n", - " \"picture\": ((255, 236, 204, 100), (255,0,0)),\n", - " \"formula\": ((128, 139, 150, 100), (255,0,0)),\n", - " \"subtitle-level-1\": ((204, 51, 102, 100), (255,0,0)),\n", - " \"paragraph\": ((234, 234, 43, 100), (255,0,0)),\n", + " \"table\": ((240, 128, 128, 100), (255, 0, 0)),\n", + " \"caption\": ((243, 156, 18, 100), (255, 0, 0)),\n", + " \"citation\": ((14, 210, 234, 100), (255, 0, 0)),\n", + " \"picture\": ((255, 236, 204, 100), (255, 0, 0)),\n", + " \"formula\": ((128, 139, 150, 100), (255, 0, 0)),\n", + " \"subtitle-level-1\": ((204, 51, 102, 100), (255, 0, 0)),\n", + " \"paragraph\": ((234, 234, 43, 100), (255, 0, 0)),\n", "}\n", "\n", "labels_colors_cells = {\n", - " \"table\": ((240, 128, 128, 100), (0,0,0,0)),\n", - " \"caption\": ((243, 156, 18, 100), (0,0,0,0)),\n", - " \"citation\": ((14, 210, 234, 100), (0,0,0,0)),\n", - " \"picture\": ((255, 236, 204, 100), (0,0,0,0)),\n", - " \"formula\": ((128, 139, 150, 100), (0,0,0,0)),\n", - " \"subtitle-level-1\": ((204, 51, 102, 100), (0,0,0,0)),\n", - " \"paragraph\": ((234, 234, 43, 100), (0,0,0,0)),\n", + " \"table\": ((240, 128, 128, 100), (0, 0, 0, 0)),\n", + " \"caption\": ((243, 156, 18, 100), (0, 0, 0, 0)),\n", + " \"citation\": ((14, 210, 234, 100), (0, 0, 0, 0)),\n", + " \"picture\": ((255, 236, 204, 100), (0, 0, 0, 0)),\n", + " \"formula\": ((128, 139, 150, 100), (0, 0, 0, 0)),\n", + " \"subtitle-level-1\": ((204, 51, 102, 100), (0, 0, 0, 0)),\n", + " \"paragraph\": ((234, 234, 43, 100), (0, 0, 0, 0)),\n", "}\n", "\n", + "\n", "def draw_boxes(img, 
dims, boxes, colors_map={}):\n", " \"\"\"\n", " Draw bounding boxes on the input PIL Image `img`\n", - " \n", + "\n", " Parameters\n", " ----------\n", " img : Image\n", @@ -264,12 +269,14 @@ " bbox = cluster[\"bbox\"]\n", " rect = [\n", " round(bbox[0]),\n", - " round(dims[1]-bbox[3]),\n", + " round(dims[1] - bbox[3]),\n", " round(bbox[2]),\n", - " round(dims[1]-bbox[1])\n", + " round(dims[1] - bbox[1]),\n", " ]\n", - " \n", - " c_fill, c_outline = colors_map.get(cluster[\"type\"].lower(), ((128,128,128,100), (0,0,0,0)))\n", + "\n", + " c_fill, c_outline = colors_map.get(\n", + " cluster[\"type\"].lower(), ((128, 128, 128, 100), (0, 0, 0, 0))\n", + " )\n", " drw.rectangle(rect, outline=c_outline, fill=c_fill)" ] }, @@ -283,7 +290,7 @@ "def pdf_to_page_image(pdf_filename, page, resolution=72):\n", " \"\"\"\n", " Convert the page number `page` of the PDF document to an image\n", - " \n", + "\n", " Parameters\n", " ----------\n", " pdf_filename : Path\n", @@ -305,9 +312,13 @@ " \"pdftoppm\",\n", " \"-png\",\n", " \"-singlefile\",\n", - " \"-f\", str(page), \"-l\", str(page),\n", + " \"-f\",\n", + " str(page),\n", + " \"-l\",\n", + " str(page),\n", " \"-cropbox\",\n", - " \"-r\", str(resolution),\n", + " \"-r\",\n", + " str(resolution),\n", " pdf_filename,\n", " output_filename,\n", " ]\n", @@ -319,7 +330,7 @@ " ) from cpe\n", " png_file = output_filename + \".png\"\n", " img = Image.open(png_file)\n", - " return img\n" + " return img" ] }, { @@ -331,9 +342,9 @@ "source": [ "def visualize_document_bboxes(doc_jsondata, doc_cellsdata, ncols=3):\n", " \"\"\"\n", - " Visualize the document pages overlaying the PDF image with the \n", + " Visualize the document pages overlaying the PDF image with the\n", " bounding boxes of the text cells and the segmentation clusters.\n", - " \n", + "\n", " Parameters\n", " ----------\n", " doc_jsondata :\n", @@ -343,11 +354,11 @@ " ncols : int, Default=3\n", " Number of columns in the display table.\n", " \"\"\"\n", - " \n", + "\n", " clusters = page_elements_from_json_document(doc_jsondata)\n", " cells = page_elements_from_text_cells(doc_cellsdata)\n", " pages_to_dims = {dims[\"page\"]: dims for dims in doc_jsondata[\"page-dimensions\"]}\n", - " \n", + "\n", " output_html = \"
\"\n", " for i, page in enumerate(sorted(clusters.keys())):\n", " dims = pages_to_dims[page][\"width\"], pages_to_dims[page][\"height\"]\n", @@ -358,12 +369,14 @@ " img = pdf_to_page_image(INPUT_FILE, page=page)\n", " img = img.resize((math.ceil(dims[0]), math.ceil(dims[1])))\n", " else:\n", - " img = Image.new(\"RGB\", (math.ceil(dims[0]), math.ceil(dims[1])), (255, 255, 255))\n", + " img = Image.new(\n", + " \"RGB\", (math.ceil(dims[0]), math.ceil(dims[1])), (255, 255, 255)\n", + " )\n", " img = img.resize((math.ceil(dims[0]), math.ceil(dims[1])))\n", "\n", " # Draw page rectangle\n", " drw = ImageDraw.Draw(img)\n", - " drw.rectangle([0,0,dims[0]-1, dims[1]-1], outline=(0,0,0))\n", + " drw.rectangle([0, 0, dims[0] - 1, dims[1] - 1], outline=(0, 0, 0))\n", "\n", " # Draw bounding boxes\n", " if SHOW_TEXT_CELLS_BOXES:\n", @@ -376,7 +389,6 @@ " elif i % ncols == 0:\n", " output_html += \"\"\n", "\n", - "\n", " buffer = io.BytesIO()\n", " img.save(buffer, format=\"PNG\")\n", " img_str = base64.b64encode(buffer.getvalue()).decode(\"utf8\")\n", @@ -386,7 +398,6 @@ " output_html += f\"\"\n", " output_html += \"\"\n", "\n", - "\n", " output_html += \"
\"\n", " display(HTML(output_html))" ] @@ -430,7 +441,7 @@ "# Launch the document conversion\n", "documents = ds.convert_documents(\n", " api=api, proj_key=PROJ_KEY, source_path=INPUT_FILE, progress_bar=True\n", - ")\n" + ")" ] }, { @@ -449,7 +460,9 @@ ], "source": [ "# Download results\n", - "output_dir = tempfile.mkdtemp() # TODO: switch to tempfile.TemporaryDirectory() and use `with`\n", + "output_dir = (\n", + " tempfile.mkdtemp()\n", + ") # TODO: switch to tempfile.TemporaryDirectory() and use `with`\n", "documents.download_all(result_dir=output_dir, progress_bar=True)" ] }, @@ -480,10 +493,10 @@ " for name in all_files:\n", " if not name.endswith(\".json\"):\n", " continue\n", - " \n", - " basename = name.rstrip('.json')\n", + "\n", + " basename = name.rstrip(\".json\")\n", " doc_jsondata = json.loads(archive.read(f\"{basename}.json\"))\n", - " doc_cellsdata = json.loads(archive.read(f\"{basename}.cells\")) \n", + " doc_cellsdata = json.loads(archive.read(f\"{basename}.cells\"))\n", "\n", " visualize_document_bboxes(doc_jsondata, doc_cellsdata)" ] diff --git a/examples/integration_argilla/argilla_upload.ipynb b/examples/integration_argilla/argilla_upload.ipynb index 8998b8b..8497f08 100644 --- a/examples/integration_argilla/argilla_upload.ipynb +++ b/examples/integration_argilla/argilla_upload.ipynb @@ -56,7 +56,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "INPUT_FILE = Path(\"../../data/samples/2206.00785.pdf\")\n", "\n", @@ -65,7 +65,7 @@ "ARGILLA_API_KEY = os.environ[\"ARGILLA_API_KEY\"] # required env var\n", "ARGILLA_DATASET = \"deepsearch-documents\"\n", "# Tokenization\n", - "SPACY_MODEL = \"en_core_web_sm\"\n" + "SPACY_MODEL = \"en_core_web_sm\"" ] }, { diff --git a/examples/kg_download_quick_start/kg_download_quick_start.ipynb b/examples/kg_download_quick_start/kg_download_quick_start.ipynb index 6c69173..bcdc525 100644 --- a/examples/kg_download_quick_start/kg_download_quick_start.ipynb +++ b/examples/kg_download_quick_start/kg_download_quick_start.ipynb @@ -47,7 +47,7 @@ "PROFILE_NAME = notebook_settings.profile # the profile to use\n", "PROJECT_KEY = notebook_settings.proj_key\n", "KG_KEY = notebook_settings.kg_key\n", - "BASE_DIR = './KG-data'" + "BASE_DIR = \"./KG-data\"" ] }, { @@ -98,10 +98,12 @@ "if not os.path.exists(BASE_DIR):\n", " os.mkdir(BASE_DIR)\n", "\n", - " \n", + "\n", "# Raise an error if the base directory is not empty\n", "if len(os.listdir(BASE_DIR)) > 0:\n", - " raise ValueError(f'BASE_DIR must be empty but found the following contents: {os.listdir(BASE_DIR)}')" + " raise ValueError(\n", + " f\"BASE_DIR must be empty but found the following contents: {os.listdir(BASE_DIR)}\"\n", + " )" ] }, { @@ -180,13 +182,15 @@ ], "source": [ "# Download the knowledge graph using urlopen\n", - "zipped_file_path = os.path.join(BASE_DIR, 'kg_data.tar.gz')\n", + "zipped_file_path = os.path.join(BASE_DIR, \"kg_data.tar.gz\")\n", "context = ssl.create_default_context()\n", "context.check_hostname = False\n", - "context.verify_mode=ssl.CERT_NONE\n", + "context.verify_mode = ssl.CERT_NONE\n", "\n", - "with open(zipped_file_path, 'wb+') as download_file, urlopen(download_url, context=context) as response:\n", - " content_length = int(response.getheader('content-length'))\n", + "with open(zipped_file_path, \"wb+\") as download_file, urlopen(\n", + " 
download_url, context=context\n", + ") as response:\n", + " content_length = int(response.getheader(\"content-length\"))\n", " with tqdm(total=100, position=0) as pbar:\n", " for line in response:\n", " download_file.write(line)\n", @@ -210,9 +214,9 @@ "outputs": [], "source": [ "# Save the unzipped KG\n", - "unzipped_dir = os.path.join(BASE_DIR, 'unzipped_data')\n", + "unzipped_dir = os.path.join(BASE_DIR, \"unzipped_data\")\n", "os.mkdir(unzipped_dir)\n", - "with tarfile.open(zipped_file_path, 'r') as f:\n", + "with tarfile.open(zipped_file_path, \"r\") as f:\n", " f.extractall(path=unzipped_dir)" ] }, @@ -273,7 +277,7 @@ ], "source": [ "# Get a list of all the files in the unzipped data\n", - "files = list(os.walk(os.path.join(BASE_DIR, 'unzipped_data')))[0][2]\n", + "files = list(os.walk(os.path.join(BASE_DIR, \"unzipped_data\")))[0][2]\n", "display(sorted(files))" ] }, @@ -299,7 +303,7 @@ " :param filepath: Path to the jsonl file\n", " :return dataframe: A pandas DataFrame corresponding to the data stored in the file\n", " \"\"\"\n", - " with open(filepath, 'r') as f:\n", + " with open(filepath, \"r\") as f:\n", " data = pd.DataFrame([json.loads(line) for line in f])\n", " return data" ] @@ -336,7 +340,7 @@ ], "source": [ "# Show the first record in the materials file\n", - "materials = jsonl2df(os.path.join(BASE_DIR, 'unzipped_data', 'material.jsonl'))\n", + "materials = jsonl2df(os.path.join(BASE_DIR, \"unzipped_data\", \"material.jsonl\"))\n", "display(materials.iloc[0])" ] }, @@ -454,8 +458,16 @@ ], "source": [ "# Show the first few edges\n", - "edges = jsonl2df(os.path.join(BASE_DIR, 'unzipped_data', '_edges.jsonl'))\n", - "edges = edges[['source_collection', 'target_collection', 'source_hash', 'target_hash', 'symmetric']]\n", + "edges = jsonl2df(os.path.join(BASE_DIR, \"unzipped_data\", \"_edges.jsonl\"))\n", + "edges = edges[\n", + " [\n", + " \"source_collection\",\n", + " \"target_collection\",\n", + " \"source_hash\",\n", + " \"target_hash\",\n", + " \"symmetric\",\n", + " ]\n", + "]\n", "display(edges.head())" ] }, @@ -507,15 +519,17 @@ "outputs": [], "source": [ "nodetypes = {\n", - " 'material': os.path.join(BASE_DIR, 'unzipped_data', 'material.jsonl'),\n", - " 'property': os.path.join(BASE_DIR, 'unzipped_data', 'property.jsonl')\n", + " \"material\": os.path.join(BASE_DIR, \"unzipped_data\", \"material.jsonl\"),\n", + " \"property\": os.path.join(BASE_DIR, \"unzipped_data\", \"property.jsonl\"),\n", "}\n", "\n", "for nodetype in nodetypes:\n", " data = jsonl2df(nodetypes[nodetype])\n", " hetero_kg[nodetype].x = torch.eye(data.shape[0])\n", - " hetero_kg[nodetype]['_hash'] = dict((_hash, _idx) for _idx, _hash in enumerate(data['_hash'].to_list()))\n", - " hetero_kg[nodetype]['_name'] = data['_name'].to_list()" + " hetero_kg[nodetype][\"_hash\"] = dict(\n", + " (_hash, _idx) for _idx, _hash in enumerate(data[\"_hash\"].to_list())\n", + " )\n", + " hetero_kg[nodetype][\"_name\"] = data[\"_name\"].to_list()" ] }, { @@ -536,17 +550,26 @@ "outputs": [], "source": [ "# Find the relevant edges\n", - "edges = jsonl2df(os.path.join(BASE_DIR, 'unzipped_data', '_edges.jsonl'))\n", - "edges = edges[(edges.source_collection == 'material') & (edges.target_collection == 'property')]\n", - "edges = [edges['source_hash'].to_list(), edges['target_hash'].to_list()]\n", + "edges = jsonl2df(os.path.join(BASE_DIR, \"unzipped_data\", \"_edges.jsonl\"))\n", + "edges = edges[\n", + " (edges.source_collection == \"material\") & (edges.target_collection == \"property\")\n", + "]\n", + "edges = 
[edges[\"source_hash\"].to_list(), edges[\"target_hash\"].to_list()]\n", "\n", "# Create the edge index\n", "edge_index = []\n", "for hash_mat, hash_prop in zip(*edges):\n", - " edge_index.append([hetero_kg['material']['_hash'][hash_mat], hetero_kg['property']['_hash'][hash_prop]])\n", + " edge_index.append(\n", + " [\n", + " hetero_kg[\"material\"][\"_hash\"][hash_mat],\n", + " hetero_kg[\"property\"][\"_hash\"][hash_prop],\n", + " ]\n", + " )\n", "\n", "# Add edge index to the KG\n", - "hetero_kg['material', 'mat2prop', 'property'].edge_index = torch.tensor(edge_index).long().t()\n", + "hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index = (\n", + " torch.tensor(edge_index).long().t()\n", + ")\n", "\n", "# Make the graph undirected\n", "hetero_kg = ToUndirected()(hetero_kg)" @@ -584,15 +607,15 @@ ], "source": [ "# Summarize the KG\n", - "print('Number of nodes')\n", + "print(\"Number of nodes\")\n", "for node_type in hetero_kg.node_types:\n", - " print(f'\\t{node_type} -> {hetero_kg[node_type].num_nodes}')\n", - "print(f'Total number of nodes: {hetero_kg.num_nodes}')\n", + " print(f\"\\t{node_type} -> {hetero_kg[node_type].num_nodes}\")\n", + "print(f\"Total number of nodes: {hetero_kg.num_nodes}\")\n", "\n", - "print('\\nNumber of edges')\n", + "print(\"\\nNumber of edges\")\n", "for edge_type in hetero_kg.edge_types:\n", - " print(f'\\t{edge_type} -> {hetero_kg[edge_type].num_edges}')\n", - "print(f'Total number of edges: {hetero_kg.num_edges}')" + " print(f\"\\t{edge_type} -> {hetero_kg[edge_type].num_edges}\")\n", + "print(f\"Total number of edges: {hetero_kg.num_edges}\")" ] }, { @@ -704,25 +727,34 @@ ], "source": [ "# Select materials to display\n", - "materials = ['perovskite/Si', 'O(2) Ti(1)', 'A(1) I(3) M(1) Pb(1)', 'O(1) Zn(1)']\n", - "mat_idx = [hetero_kg['material']['_name'].index(mat) for mat in materials]\n", + "materials = [\"perovskite/Si\", \"O(2) Ti(1)\", \"A(1) I(3) M(1) Pb(1)\", \"O(1) Zn(1)\"]\n", + "mat_idx = [hetero_kg[\"material\"][\"_name\"].index(mat) for mat in materials]\n", "\n", "# Get properties corresponding to each material\n", "properties = dict()\n", "for m_idx, material in zip(mat_idx, materials):\n", - " current_edges = hetero_kg['material', 'mat2prop', 'property'].edge_index[0, :] == m_idx\n", - " prop_idx = hetero_kg['material', 'mat2prop', 'property'].edge_index[1, current_edges]\n", - " properties[material] = [hetero_kg['property']['_name'][idx] for idx in prop_idx.tolist()]\n", - " \n", + " current_edges = (\n", + " hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index[0, :] == m_idx\n", + " )\n", + " prop_idx = hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index[\n", + " 1, current_edges\n", + " ]\n", + " properties[material] = [\n", + " hetero_kg[\"property\"][\"_name\"][idx] for idx in prop_idx.tolist()\n", + " ]\n", + "\n", "# Show up to four randomly chosen properties for each material\n", "df = pd.DataFrame()\n", "for mat, prop in properties.items():\n", " # Restrict to four properties\n", " if len(prop) > 4:\n", " prop = [prop[idx] for idx in torch.randperm(len(prop)).tolist()[:4]]\n", - " \n", + "\n", " # Add the row to the dataframe\n", - " curr_dict = dict([('material', [mat])] + [(f'Property{p_idx}', [p]) for p_idx, p in enumerate(prop)])\n", + " curr_dict = dict(\n", + " [(\"material\", [mat])]\n", + " + [(f\"Property{p_idx}\", [p]) for p_idx, p in enumerate(prop)]\n", + " )\n", " curr_df = pd.DataFrame(curr_dict)\n", " df = pd.concat([df, curr_df]).reset_index(drop=True)\n", "\n", @@ -754,13 +786,17 @@ 
], "source": [ "# Find properties linked to perovskite/Si\n", - "m_idx = hetero_kg['material']['_name'].index('perovskite/Si')\n", - "perovskite_edges = hetero_kg['material', 'mat2prop', 'property'].edge_index[0, :] == m_idx\n", - "prop_idx = hetero_kg['material', 'mat2prop', 'property'].edge_index[1, perovskite_edges]\n", - "properties = [hetero_kg['property']['_name'][idx] for idx in prop_idx.tolist()]\n", + "m_idx = hetero_kg[\"material\"][\"_name\"].index(\"perovskite/Si\")\n", + "perovskite_edges = (\n", + " hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index[0, :] == m_idx\n", + ")\n", + "prop_idx = hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index[1, perovskite_edges]\n", + "properties = [hetero_kg[\"property\"][\"_name\"][idx] for idx in prop_idx.tolist()]\n", "\n", "# Check if the desired properties are linked\n", - "print(f'Is perovskite/Si linked to power conversion efficiency: {\"power conversion efficiency\" in properties}')\n", + "print(\n", + " f'Is perovskite/Si linked to power conversion efficiency: {\"power conversion efficiency\" in properties}'\n", + ")\n", "print(f'Is perovskite/Si linked to band gap: {\"band gap\" in properties}')" ] }, diff --git a/examples/nlp_for_materials/nlp_for_materials.ipynb b/examples/nlp_for_materials/nlp_for_materials.ipynb index 5d57cef..189d0cf 100644 --- a/examples/nlp_for_materials/nlp_for_materials.ipynb +++ b/examples/nlp_for_materials/nlp_for_materials.ipynb @@ -49,7 +49,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456" ] @@ -151,11 +151,11 @@ " api=api,\n", " proj_key=PROJ_KEY,\n", " source_path=f\"../../data/samples/{fname}\",\n", - " progress_bar=True\n", - ") \n", + " progress_bar=True,\n", + ")\n", "documents.download_all(result_dir=output_dir)\n", "info = documents.generate_report(result_dir=output_dir)\n", - "print(info) " + "print(info)" ] }, { @@ -181,23 +181,21 @@ " for name in all_files:\n", " if not name.endswith(\".json\"):\n", " continue\n", - " \n", - " #basename = name.rstrip('.json')\n", + "\n", + " # basename = name.rstrip('.json')\n", " doc_json = json.loads(archive.read(name))\n", - " \n", + "\n", " ofile = output_dir / name\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", " fw.write(json.dumps(doc_json, indent=2))\n", - " \n", + "\n", " doc_md = export_to_markdown(doc_json)\n", "\n", " ofile = output_dir / name.replace(\".json\", \".md\")\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", - " fw.write(doc_md)\n", - "\n", - " " + " fw.write(doc_md)" ] }, { @@ -297,7 +295,7 @@ "\n", "res = model.apply_on_doc(doc)\n", "\n", - "insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])\n" + "insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])" ] }, { @@ -321,9 +319,11 @@ } ], "source": [ - "#print(insts.columns)\n", + "# print(insts.columns)\n", "\n", - "materials = insts[(insts[\"type\"]==\"material\") & (insts[\"subtype\"]==\"complex_chemical\")][[\"type\", \"subtype\", \"name\", \"subj_path\"]]\n", + "materials = insts[\n", + " (insts[\"type\"] == \"material\") & (insts[\"subtype\"] == \"complex_chemical\")\n", + "][[\"type\", \"subtype\", \"name\", \"subj_path\"]]\n", 
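As a quick sanity check before narrowing the results down to complex chemicals, it can help to survey which entity types and subtypes the model actually returned. The snippet below is an illustrative aside (not part of the original notebook) and assumes the `insts` DataFrame built in the cell above.

# Illustrative aside: survey the entity types/subtypes produced by the NLP model
# before filtering on subtype == "complex_chemical".
print(insts[["type", "subtype"]].value_counts())
print(insts["name"].nunique(), "distinct entity names found")
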
"print(materials.to_string())" ] }, @@ -549,16 +549,18 @@ "\n", "\n", "# Input query\n", - "search_query = \"\\\"SUBSTITUTED 6-PHENYLNICOTINIC ACIDS AND THEIR USE\\\"\"\n", - "data_collection = ElasticDataCollectionSource(elastic_id=\"default\", index_key=\"patent-uspto\")\n", + "search_query = '\"SUBSTITUTED 6-PHENYLNICOTINIC ACIDS AND THEIR USE\"'\n", + "data_collection = ElasticDataCollectionSource(\n", + " elastic_id=\"default\", index_key=\"patent-uspto\"\n", + ")\n", "page_size = 50\n", "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " #source=[\"description.title\", \"description.authors\", \"identifiers\"], # Which fields of documents we want to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queries\n", + " search_query, # The search query to be executed\n", + " # source=[\"description.title\", \"description.authors\", \"identifiers\"], # Which fields of documents we want to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queries\n", ")\n", "\n", "\n", @@ -567,7 +569,9 @@ "count_query.paginated_task.parameters[\"limit\"] = 0\n", "count_results = api.queries.run(count_query)\n", "expected_total = count_results.outputs[\"data_count\"]\n", - "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n", + "expected_pages = (\n", + " expected_total + page_size - 1\n", + ") // page_size # this is simply a ceiling formula\n", "\n", "print(f\"#-found documents: \", count_results)\n", "\n", @@ -579,7 +583,7 @@ " for row in result_page.outputs[\"data_outputs\"]:\n", " documents.append(row[\"_source\"])\n", "\n", - "print(f'Finished fetching all data. Total is {len(documents)} records.')" + "print(f\"Finished fetching all data. 
Total is {len(documents)} records.\")" ] }, { @@ -694,27 +698,31 @@ "model = init_nlp_model(\"language;term;material\")\n", "model.set_loglevel(\"INFO\")\n", "\n", - "max_items=5\n", + "max_items = 5\n", "\n", "for doc in documents:\n", "\n", " dname = doc[\"file-info\"][\"filename\"]\n", - " \n", - " for i,item in enumerate(doc[\"main-text\"]):\n", + "\n", + " for i, item in enumerate(doc[\"main-text\"]):\n", "\n", " if \"text\" not in item:\n", " continue\n", "\n", - " if i>max_items:\n", + " if i > max_items:\n", " break\n", - " \n", + "\n", " res = model.apply_on_text(item[\"text\"])\n", "\n", - " insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])\n", + " insts = pd.DataFrame(\n", + " res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"]\n", + " )\n", "\n", - " materials = insts[insts[\"type\"]==\"material\"][[\"type\", \"subtype\", \"name\", \"subj_path\"]]\n", + " materials = insts[insts[\"type\"] == \"material\"][\n", + " [\"type\", \"subtype\", \"name\", \"subj_path\"]\n", + " ]\n", "\n", - " if len(materials)>0:\n", + " if len(materials) > 0:\n", " lines = wrapper.wrap(item[\"text\"])\n", " print(f\"\\n {dname}: text-{i}\\n\")\n", " print(\"\\n\".join(lines), \"\\n\")\n", diff --git a/examples/nlp_for_references/nlp_for_references.ipynb b/examples/nlp_for_references/nlp_for_references.ipynb index f6a5773..4f923f6 100644 --- a/examples/nlp_for_references/nlp_for_references.ipynb +++ b/examples/nlp_for_references/nlp_for_references.ipynb @@ -37,7 +37,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456" ] @@ -164,11 +164,11 @@ " api=api,\n", " proj_key=PROJ_KEY,\n", " source_path=f\"../../data/samples/{fname}\",\n", - " progress_bar=True\n", - ") \n", + " progress_bar=True,\n", + ")\n", "documents.download_all(result_dir=output_dir)\n", "info = documents.generate_report(result_dir=output_dir)\n", - "print(info) " + "print(info)" ] }, { @@ -194,23 +194,21 @@ " for name in all_files:\n", " if not name.endswith(\".json\"):\n", " continue\n", - " \n", - " #basename = name.rstrip('.json')\n", + "\n", + " # basename = name.rstrip('.json')\n", " doc_json = json.loads(archive.read(name))\n", - " \n", + "\n", " ofile = output_dir / name\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", " fw.write(json.dumps(doc_json, indent=2))\n", - " \n", + "\n", " doc_md = export_to_markdown(doc_json)\n", "\n", " ofile = output_dir / name.replace(\".json\", \".md\")\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", - " fw.write(doc_md)\n", - "\n", - " " + " fw.write(doc_md)" ] }, { @@ -241,26 +239,25 @@ "source": [ "def resolve(path, doc):\n", "\n", - " if len(path)>1 and path[0]==\"#\":\n", + " if len(path) > 1 and path[0] == \"#\":\n", " return resolve(path[1:], doc)\n", - " \n", - " if len(path)==1 and isinstance(doc, dict):\n", + "\n", + " if len(path) == 1 and isinstance(doc, dict):\n", " return doc[path[0]]\n", "\n", - " elif len(path)==1 and isinstance(doc, list):\n", + " elif len(path) == 1 and isinstance(doc, list):\n", " ind = int(path[0])\n", " return doc[ind]\n", - " \n", - " elif len(path)>1 and isinstance(doc, dict):\n", + "\n", + " elif len(path) > 1 and isinstance(doc, dict):\n", " return resolve(path[1:], 
doc[path[0]])\n", "\n", - " elif len(path)>1 and isinstance(doc, list):\n", + " elif len(path) > 1 and isinstance(doc, list):\n", " ind = int(path[0])\n", " return resolve(path[1:], doc[ind])\n", "\n", " else:\n", - " return None\n", - " " + " return None" ] }, { @@ -380,22 +377,21 @@ "props = pd.DataFrame(res[\"properties\"][\"data\"], columns=res[\"properties\"][\"headers\"])\n", "insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])\n", "\n", - "refs = props[(props[\"label\"]==\"reference\") & (props[\"confidence\"]>0.8)]\n", + "refs = props[(props[\"label\"] == \"reference\") & (props[\"confidence\"] > 0.8)]\n", "\n", "cnt = 0\n", - "for i,ref in refs.iterrows():\n", - " #print(ref)\n", + "for i, ref in refs.iterrows():\n", + " # print(ref)\n", "\n", " item = resolve(ref[\"subj_path\"].split(\"/\"), res)\n", " print(\"\\n\".join(textwrap.wrap(item[\"text\"], 70)))\n", "\n", - " ents = insts[insts[\"subj_hash\"]==item[\"subj_hash\"]][[\"subtype\", \"name\"]]\n", + " ents = insts[insts[\"subj_hash\"] == item[\"subj_hash\"]][[\"subtype\", \"name\"]]\n", " print(\"\\nentities:\\n\", ents, \"\\n\\n\")\n", "\n", - " \n", - " cnt+=1\n", - " if cnt>5:\n", - " break\n" + " cnt += 1\n", + " if cnt > 5:\n", + " break" ] }, { diff --git a/examples/nlp_on_documents/nlp_on_documents.ipynb b/examples/nlp_on_documents/nlp_on_documents.ipynb index c52e660..507f193 100644 --- a/examples/nlp_on_documents/nlp_on_documents.ipynb +++ b/examples/nlp_on_documents/nlp_on_documents.ipynb @@ -49,7 +49,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456" ] @@ -151,11 +151,11 @@ " api=api,\n", " proj_key=PROJ_KEY,\n", " source_path=f\"../../data/samples/{fname}\",\n", - " progress_bar=True\n", - ") \n", + " progress_bar=True,\n", + ")\n", "documents.download_all(result_dir=output_dir)\n", "info = documents.generate_report(result_dir=output_dir)\n", - "print(info) " + "print(info)" ] }, { @@ -181,23 +181,21 @@ " for name in all_files:\n", " if not name.endswith(\".json\"):\n", " continue\n", - " \n", - " #basename = name.rstrip('.json')\n", + "\n", + " # basename = name.rstrip('.json')\n", " doc_json = json.loads(archive.read(name))\n", - " \n", + "\n", " ofile = output_dir / name\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", " fw.write(json.dumps(doc_json, indent=2))\n", - " \n", + "\n", " doc_md = export_to_markdown(doc_json)\n", "\n", " ofile = output_dir / name.replace(\".json\", \".md\")\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", - " fw.write(doc_md)\n", - "\n", - " " + " fw.write(doc_md)" ] }, { @@ -266,33 +264,36 @@ "with open(ifile) as fr:\n", " doc = json.load(fr)\n", "\n", - "terms = [\"METAL\",\n", - " \"COPPER\",\n", - " \"COBALT\",\n", - " \"TUNGSTEN\",\n", - " \"MOLYBDENUM\",\n", - " \"RUTHENIUM\",\n", - " \"Self-assembly material\", \n", - " \"Self-assembly molecular layer\",\n", - " \"surface modification\", \n", - " \"inhibitor\", \n", - " \"corrosion inhibitor\", \n", - " \"adsorption\", \"selectivity\", \n", - " \"Anti-corrosion\", \n", - " \"contact angle\",\n", - " \"Area selective deposition\",\n", - " \"Advanced interconnect metallization\",\n", - " \"Integrated circuits\",\n", - " \"Atomic layer deposition\"]\n", - "\n", 
- "term_hist = [ {\"key\":term, \"count\":0} for term in terms]\n", - "\n", - "for i,item in enumerate(doc[\"main-text\"]):\n", + "terms = [\n", + " \"METAL\",\n", + " \"COPPER\",\n", + " \"COBALT\",\n", + " \"TUNGSTEN\",\n", + " \"MOLYBDENUM\",\n", + " \"RUTHENIUM\",\n", + " \"Self-assembly material\",\n", + " \"Self-assembly molecular layer\",\n", + " \"surface modification\",\n", + " \"inhibitor\",\n", + " \"corrosion inhibitor\",\n", + " \"adsorption\",\n", + " \"selectivity\",\n", + " \"Anti-corrosion\",\n", + " \"contact angle\",\n", + " \"Area selective deposition\",\n", + " \"Advanced interconnect metallization\",\n", + " \"Integrated circuits\",\n", + " \"Atomic layer deposition\",\n", + "]\n", + "\n", + "term_hist = [{\"key\": term, \"count\": 0} for term in terms]\n", + "\n", + "for i, item in enumerate(doc[\"main-text\"]):\n", "\n", " if \"text\" not in item:\n", " continue\n", - " \n", - " for j,term in enumerate(terms):\n", + "\n", + " for j, term in enumerate(terms):\n", " term_hist[j][\"count\"] += item[\"text\"].count(term.lower())\n", "\n", "df = pd.DataFrame(term_hist)\n", @@ -336,7 +337,7 @@ "from tabulate import tabulate\n", "\n", "models = load_pretrained_nlp_models()\n", - "#print(f\"models: {models}\")" + "# print(f\"models: {models}\")" ] }, { @@ -372,17 +373,17 @@ "\n", "model = init_nlp_model(\"language;term\")\n", "\n", - "for i,item in enumerate(doc[\"main-text\"]):\n", + "for i, item in enumerate(doc[\"main-text\"]):\n", "\n", " if \"text\" in item:\n", " res = model.apply_on_text(item[\"text\"])\n", - " #print(res.keys())\n", + " # print(res.keys())\n", "\n", - " #print(item[\"text\"])\n", - " #print(tabulate(res[\"instances\"][\"data\"], \n", + " # print(item[\"text\"])\n", + " # print(tabulate(res[\"instances\"][\"data\"],\n", " # headers=res[\"instances\"][\"headers\"]))\n", "\n", - " if i>10:\n", + " if i > 10:\n", " break\n", "\n", "\n", @@ -390,7 +391,7 @@ "\n", "df = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])\n", "\n", - "terms = df[df[\"type\"]==\"term\"][[\"type\", \"name\", \"subj_path\"]]\n", + "terms = df[df[\"type\"] == \"term\"][[\"type\", \"name\", \"subj_path\"]]\n", "print(terms)" ] }, @@ -625,15 +626,15 @@ ], "source": [ "nodes = read_nodes_in_dataframe(\"./glm/nodes.csv\")\n", - "#print(nodes)\n", + "# print(nodes)\n", "\n", "# Get all terms of the document\n", - "terms = nodes[nodes[\"name\"]==\"term\"][[\"total-count\", \"nodes-text\"]]\n", + "terms = nodes[nodes[\"name\"] == \"term\"][[\"total-count\", \"nodes-text\"]]\n", "print(terms)\n", "\n", "# Get all terms of the document with `composition`\n", "res = expand_terms(glm, \"composition\")\n", - "#show_query_result(res)\n", + "# show_query_result(res)\n", "\n", "last_result = res[\"result\"][-1][\"nodes\"]\n", "expanded_terms = pd.DataFrame(last_result[\"data\"], columns=last_result[\"headers\"])\n", diff --git a/examples/qa_doc_collection/doc_collection_qa.ipynb b/examples/qa_doc_collection/doc_collection_qa.ipynb index 193fde3..c1150b6 100644 --- a/examples/qa_doc_collection/doc_collection_qa.ipynb +++ b/examples/qa_doc_collection/doc_collection_qa.ipynb @@ -1,983 +1,998 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "99f717ef-4cba-4300-b258-0b1c248cb873", - "metadata": {}, - "source": [ - "# RAG and Semantic Retrieval on a Document Collection\n", - "\n", - "Deep Search allows users to interact with the documents using conversational AI, i.e. 
you interact with a virtual assistant which answer your questions using the information in the corpus.\n", - "\n", - "In this example we demonstrate how achive the same interaction programmatically.\n", - "\n", - "### Access required\n", - "\n", - "The content of this notebook requires access to Deep Search capabilities which are not\n", - "available on the public access system.\n", - "\n", - "[Contact us](https://ds4sd.github.io) if you are interested in exploring\n", - "these Deep Search capabilities.\n", - "\n", - "\n", - "### GenAI Integration required\n", - "\n", - "When interacting with the virtual assistant, Deep Search requires a connection to a Generative AI API. Currently, we support connections to [watsonx.ai](https://www.ibm.com/products/watsonx-ai) or the IBM-internal GenAI platform BAM.\n", - "\n", - "Deep Search allows custom GenAI configurations for each project.\n", - "In the following example you will require to work in a project which has such GenAI capabilities activated." - ] - }, - { - "cell_type": "markdown", - "id": "256aef50-71a1-4278-9b22-17cb99a6566e", - "metadata": {}, - "source": [ - "### Set notebook parameters\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "5b244bdd-1b52-41ff-b63e-9a203570d210", - "metadata": {}, - "outputs": [], - "source": [ - "from dsnotebooks.settings import CollQANotebookSettings\n", - "\n", - "# notebooks settings auto-loaded from .env / env vars\n", - "notebook_settings = CollQANotebookSettings()\n", - "\n", - "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", - "INDEX_KEY = notebook_settings.sem_on_idx_key # the collection to use\n", - "\n", - "SKIP_INGESTED_DOCS = notebook_settings.skip_ingested_docs # whether to skip any already semantically ingested docs\n", - "\n", - "RETR_K = notebook_settings.retr_k # the number of search results to retrieve\n", - "TEXT_WEIGHT = notebook_settings.text_weight # the weight of lexical search (0.0: semantic-only, 1.0: lexical-only, anything in between: hybrid search)\n", - "RERANK = notebook_settings.rerank # whether to rerank the search results\n", - "RAISE = notebook_settings.raise_on_sem_err # whether semantic operation errors should raise an exception or be reflected in response fields" - ] - }, - { - "cell_type": "markdown", - "id": "a5269060-bb5f-4fe3-9b64-547202db6714", - "metadata": {}, - "source": [ - "### Import example dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5d236ea0-db1c-4171-8e11-cdd0bad69d66", - "metadata": {}, - "outputs": [], - "source": [ - "# Import standard dependenices\n", - "import pandas as pd\n", - "import rich\n", - "\n", - "# IPython utilities\n", - "from IPython.display import display, Markdown\n", - "\n", - "# Import the deepsearch-toolkit\n", - "from deepsearch.cps.client.api import CpsApi\n", - "from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource\n", - "from deepsearch.cps.queries import DataQuery, RAGQuery, SemanticQuery\n", - "from deepsearch.cps.queries.results import RAGResult, SearchResult, SearchResultItem\n" - ] - }, - { - "cell_type": "markdown", - "id": "293c249b-6018-46f2-b4d8-795f994d4729", - "metadata": {}, - "source": [ - "### Connect to Deep Search" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "9c108432-a285-4c7b-a996-008ac3ff3d7a", - "metadata": {}, - "outputs": [], - "source": [ - "api = CpsApi.from_env(profile_name=PROFILE_NAME)" - ] - }, - { - 
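The `TEXT_WEIGHT` setting above controls how lexical and semantic retrieval are blended. As a rough mental model only (an illustrative sketch of the weighting idea, not the actual Deep Search scoring code), a hybrid score can be thought of as a convex combination of the two scores:

# Illustrative sketch of hybrid-search weighting (assumption, not the service implementation):
# text_weight = 0.0 -> purely semantic, 1.0 -> purely lexical, anything in between -> hybrid.
def hybrid_score(lexical_score: float, semantic_score: float, text_weight: float) -> float:
    return text_weight * lexical_score + (1.0 - text_weight) * semantic_score

print(hybrid_score(lexical_score=0.2, semantic_score=0.9, text_weight=0.0))  # 0.9  (semantic only)
print(hybrid_score(lexical_score=0.2, semantic_score=0.9, text_weight=0.5))  # 0.55 (hybrid)
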
"cell_type": "markdown", - "metadata": {}, - "source": [ - "### Utils" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def render_provenance_url(\n", - " api: CpsApi,\n", - " coords: ElasticProjectDataCollectionSource,\n", - " retr_item: SearchResultItem,\n", - "):\n", - " ## compute URL to the document in the Deep Search UI\n", - " item_index = int(retr_item.main_path[retr_item.main_path.rfind(\".\")+1:])\n", - " doc_url = api.documents.generate_url(\n", - " document_hash=retr_item.doc_hash,\n", - " data_source=coords,\n", - " item_index=item_index,\n", - " )\n", - " display(Markdown(f\"The provenance of the answer can be inspected on the [source document]({doc_url}).\"))" - ] - }, - { - "cell_type": "markdown", - "id": "38cde869-46d1-4833-8eb3-2381b5e5fb68", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Prepare the collection coordinates:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "coll_coords = ElasticProjectDataCollectionSource(\n", - " proj_key=PROJ_KEY,\n", - " index_key=INDEX_KEY,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are using a small collection, so we can just list its documents to get an idea of its contents (for more details on querying, check the [Data Query Quick Start](https://github.com/DS4SD/deepsearch-examples/tree/main/examples/data_query_quick_start))." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "2b38875e-f39c-4dd5-9d42-3ffca5d0bdac", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Finished fetching all data. Total is 10 records.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
<table border=\"1\" class=\"dataframe\">\n",
- "  <thead>\n",
- "    <tr style=\"text-align: right;\">\n",
- "      <th></th>\n",
- "      <th>Filename</th>\n",
- "      <th>DocHash</th>\n",
- "    </tr>\n",
- "  </thead>\n",
- "  <tbody>\n",
- "    <tr><th>0</th><td>natural-language-processing.pdf</td><td>000f892ddcc67f165797a96e94f44fb9e0697c7912a383...</td></tr>\n",
- "    <tr><th>1</th><td>ibm-z.pdf</td><td>07e56eb5a10f725fccad9386d126b7b05bec1fa71b9b3d...</td></tr>\n",
- "    <tr><th>2</th><td>ibm.pdf</td><td>234bc5cf2c860d49574b0ff7191c354b7bbc11472a0997...</td></tr>\n",
- "    <tr><th>3</th><td>ibm-the-great-mind-challenge.pdf</td><td>335120a57b418655196e3315b562a2f9e89cedeaef9318...</td></tr>\n",
- "    <tr><th>4</th><td>turing-award.pdf</td><td>8a7c91a269abc3063df9f4e19f7961ddb8e2393fa0f272...</td></tr>\n",
- "    <tr><th>5</th><td>ibm-research.pdf</td><td>b30bc667a324ae111d025526563b674a8d3fd869bc07c8...</td></tr>\n",
- "    <tr><th>6</th><td>artificial-intelligence.pdf</td><td>b60a87c1d62a59d517f2fd6f2d3ea1a96c58b651332a8b...</td></tr>\n",
- "    <tr><th>7</th><td>machine-learning.pdf</td><td>e470e7b42a92c8e5f25094362361947b9203e0074c2223...</td></tr>\n",
- "    <tr><th>8</th><td>deep-blue-chess-computer.pdf</td><td>fa7ce2f66a7a5e061813d36348425f81d9e7ebc23454d8...</td></tr>\n",
- "    <tr><th>9</th><td>red-hat.pdf</td><td>fb53bb607f2e9642d7fe044585d1dcdb052c57febe1b87...</td></tr>\n",
- "  </tbody>\n",
- "</table>\n",
- "</div>
" - ], - "text/plain": [ - " Filename \\\n", - "0 natural-language-processing.pdf \n", - "1 ibm-z.pdf \n", - "2 ibm.pdf \n", - "3 ibm-the-great-mind-challenge.pdf \n", - "4 turing-award.pdf \n", - "5 ibm-research.pdf \n", - "6 artificial-intelligence.pdf \n", - "7 machine-learning.pdf \n", - "8 deep-blue-chess-computer.pdf \n", - "9 red-hat.pdf \n", - "\n", - " DocHash \n", - "0 000f892ddcc67f165797a96e94f44fb9e0697c7912a383... \n", - "1 07e56eb5a10f725fccad9386d126b7b05bec1fa71b9b3d... \n", - "2 234bc5cf2c860d49574b0ff7191c354b7bbc11472a0997... \n", - "3 335120a57b418655196e3315b562a2f9e89cedeaef9318... \n", - "4 8a7c91a269abc3063df9f4e19f7961ddb8e2393fa0f272... \n", - "5 b30bc667a324ae111d025526563b674a8d3fd869bc07c8... \n", - "6 b60a87c1d62a59d517f2fd6f2d3ea1a96c58b651332a8b... \n", - "7 e470e7b42a92c8e5f25094362361947b9203e0074c2223... \n", - "8 fa7ce2f66a7a5e061813d36348425f81d9e7ebc23454d8... \n", - "9 fb53bb607f2e9642d7fe044585d1dcdb052c57febe1b87... " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Prepare the data query\n", - "query = DataQuery(\n", - " search_query=\"*\", # The search query to be executed\n", - " source=[ # Which fields of documents we want to fetch\n", - " \"file-info.document-hash\",\n", - " \"file-info.filename\",\n", - " # \"description.title\",\n", - " ],\n", - " coordinates=coll_coords, # The data collection to be queries\n", - ")\n", - "\n", - "# Query Deep Search for the documents matching the query\n", - "results = []\n", - "query_results = api.queries.run(query)\n", - "for row in query_results.outputs[\"data_outputs\"]:\n", - " # Add row to results table\n", - " results.append({\n", - " \"Filename\": row[\"_source\"][\"file-info\"][\"filename\"],\n", - " \"DocHash\": row[\"_source\"][\"file-info\"][\"document-hash\"],\n", - " # \"Title\": row[\"_source\"].get(\"description\", {}).get(\"title\"),\n", - " })\n", - "\n", - "print(f'Finished fetching all data. 
Total is {len(results)} records.')\n", - "\n", - "# Visualize the table with all results\n", - "df = pd.json_normalize(results)\n", - "display(df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare source" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from deepsearch.cps.client.components.documents import PrivateDataCollectionSource, PrivateDataDocumentSource, PublicDataDocumentSource\n", - "\n", - "data_source = PrivateDataCollectionSource(\n", - " source=coll_coords,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Ingestion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the cell below we show how to semantically index your collection (indexing of already indexed docs is controlled via param `skip_ingested_docs`):" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/pva/work/github.com/DS4SD/deepsearch-examples/.venv/lib/python3.10/site-packages/pydantic/main.py:314: UserWarning: Pydantic serializer warnings:\n", - " Expected `list[str]` but got `_LiteralGenericAlias` - serialized value may not be as expected\n", - " return self.__pydantic_serializer__.to_python(\n" - ] - }, - { - "data": { - "text/plain": [ - "{'ing_out': {}}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# launch the ingestion of the collection for DocumentQA\n", - "task = api.documents.semantic_ingest(\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " skip_ingested_docs=SKIP_INGESTED_DOCS,\n", - ")\n", - "\n", - "# wait for the ingestion task to finish\n", - "api.tasks.wait_for(PROJ_KEY, task.task_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## RAG" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "ee573e76-98ea-43ce-a2ba-a81f64b3adf3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RAGResult(\n",
-                            "    answers=[\n",
-                            "        RAGAnswerItem(\n",
-                            "            answer='The IBM lab in Zurich is located in Rüschlikon, Switzerland.',\n",
-                            "            grounding=RAGGroundingInfo(\n",
-                            "                retr_items=[\n",
-                            "                    SearchResultItem(\n",
-                            "                        doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "                        chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, \n",
-                            "ZRL) is the European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, \n",
-                            "Switzerland.',\n",
-                            "                        main_path='main-text.70',\n",
-                            "                        path_group=['main-text.69', 'main-text.70'],\n",
-                            "                        source_is_text=True\n",
-                            "                    )\n",
-                            "                ],\n",
-                            "                gen_ctx_paths=[\n",
-                            "                    'main-text.58',\n",
-                            "                    'main-text.59',\n",
-                            "                    'main-text.60',\n",
-                            "                    'main-text.61',\n",
-                            "                    'main-text.62',\n",
-                            "                    'main-text.63',\n",
-                            "                    'main-text.64',\n",
-                            "                    'main-text.65',\n",
-                            "                    'main-text.66',\n",
-                            "                    'main-text.67',\n",
-                            "                    'main-text.68',\n",
-                            "                    'main-text.69',\n",
-                            "                    'main-text.70',\n",
-                            "                    'main-text.71',\n",
-                            "                    'main-text.72',\n",
-                            "                    'main-text.73',\n",
-                            "                    'main-text.74',\n",
-                            "                    'main-text.75',\n",
-                            "                    'main-text.76'\n",
-                            "                ]\n",
-                            "            ),\n",
-                            "            prompt=None\n",
-                            "        )\n",
-                            "    ],\n",
-                            "    search_result_items=[\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
-                            "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
-                            "            main_path='main-text.70',\n",
-                            "            path_group=['main-text.69', 'main-text.70'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
-                            "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
-                            "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
-                            "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
-                            "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
-                            "solutions.',\n",
-                            "            main_path='main-text.71',\n",
-                            "            path_group=['main-text.69', 'main-text.71'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \n",
-                            "scientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \n",
-                            "Industry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.',\n",
-                            "            main_path='main-text.74',\n",
-                            "            path_group=['main-text.69', 'main-text.74'],\n",
-                            "            source_is_text=True\n",
-                            "        )\n",
-                            "    ]\n",
-                            ")\n",
-                            "
\n" - ], - "text/plain": [ - "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswer\u001b[0m=\u001b[32m'The IBM lab in Zurich is located in Rüschlikon, Switzerland.'\u001b[0m,\n", - " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, \u001b[0m\n", - "\u001b[32mZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, \u001b[0m\n", - "\u001b[32mSwitzerland.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.58'\u001b[0m,\n", - " \u001b[32m'main-text.59'\u001b[0m,\n", - " \u001b[32m'main-text.60'\u001b[0m,\n", - " \u001b[32m'main-text.61'\u001b[0m,\n", - " \u001b[32m'main-text.62'\u001b[0m,\n", - " \u001b[32m'main-text.63'\u001b[0m,\n", - " \u001b[32m'main-text.64'\u001b[0m,\n", - " \u001b[32m'main-text.65'\u001b[0m,\n", - " \u001b[32m'main-text.66'\u001b[0m,\n", - " \u001b[32m'main-text.67'\u001b[0m,\n", - " \u001b[32m'main-text.68'\u001b[0m,\n", - " \u001b[32m'main-text.69'\u001b[0m,\n", - " \u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[32m'main-text.72'\u001b[0m,\n", - " \u001b[32m'main-text.73'\u001b[0m,\n", - " \u001b[32m'main-text.74'\u001b[0m,\n", - " \u001b[32m'main-text.75'\u001b[0m,\n", - " \u001b[32m'main-text.76'\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", - "\u001b[32mEuropean branch of IBM Research. 
It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", - "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", - "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", - "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", - "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", - "\u001b[32msolutions.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \u001b[0m\n", - "\u001b[32mscientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \u001b[0m\n", - "\u001b[32mIndustry Solutions and Security Research. 
The lab is currently managed by Alessandro Curioni.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.74'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.74'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"Where is the IBM lab in Zurich?\"\n", - "\n", - "# submit natural-language query on collection\n", - "question_query = RAGQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - "\n", - " ## optional retrieval params\n", - " retr_k=RETR_K,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Additionally, we can generate a provenance URL to the document in the Deep Search UI:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyYjMwYmM2NjdhMzI0YWUxMTFkMDI1NTI2NTYzYjY3NGE4ZDNmZDg2OWJjMDdjOGZkMjA0YWE5NWIwNWQ0MWYwYyU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmIzMGJjNjY3YTMyNGFlMTExZDAyNTUyNjU2M2I2NzRhOGQzZmQ4NjliYzA3YzhmZDIwNGFhOTViMDVkNDFmMGMlMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E3MCU3RCU3RA%3D%3D)." 
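The provenance link above is produced by the `render_provenance_url` helper, which derives the item index from the `main_path` of the retrieved chunk. The snippet below simply traces that derivation for the answer shown above.

# Tracing the index derivation used by render_provenance_url (value taken from the output above):
main_path = "main-text.70"
item_index = int(main_path[main_path.rfind(".") + 1 :])
print(item_index)  # 70 -> the paragraph position passed to api.documents.generate_url(...)
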
- ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "render_provenance_url(api=api, coords=coll_coords, retr_item=rag_result.answers[0].grounding.retr_items[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us try out a different question on our document corpus.\n", - "Here we also include (commented out) various additional parameters the user can optionally set:\n", - "- `retr_k`: number of items to retrieve\n", - "- `text_weight`: weight of lexical search (`0.0`: fully semantic search, `1.0`: fully lexical search, anything in-between: hybrid search)\n", - "- `rerank`: whether to rerank the retrieval results\n", - "- `gen_ctx_extr_method` (Literal[\"window\", \"page\"], optional): method for gen context extraction from document; defaults to \"window\"\n", - "- `gen_ctx_window_size` (int, optional): (relevant only if `gen_ctx_extr_method` is \"window\") max chars to use for extracted gen context (actual extraction quantized on doc item level); defaults to 5000\n", - "- `gen_ctx_window_lead_weight` (float, optional): (relevant only if `gen_ctx_extr_method` is \"window\") weight of leading text for distributing remaining window size after extracting the `main_path`; defaults to 0.5 (centered around `main_path`)\n", - "- `return_prompt` (bool, optional): whether to return the instantiated prompt; defaults to False\n", - "- `gen_timeout` (float, optional): timeout for LLM generation; defaults to None, i.e. determined by system\n", - "\n", - "For more details refer to `deepsearch.cps.queries.RAGQuery`." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "89d95a17-1569-4c90-a983-8ca437b7569d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RAGResult(\n",
-                            "    answers=[\n",
-                            "        RAGAnswerItem(\n",
-                            "            answer='The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in \n",
-                            "the field of computer gaming and artificial intelligence.',\n",
-                            "            grounding=RAGGroundingInfo(\n",
-                            "                retr_items=[\n",
-                            "                    SearchResultItem(\n",
-                            "                        doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-                            "                        chunk='History and relationships to other fields\\nThe term machine learning was coined in \n",
-                            "1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \n",
-                            "$^{[10][11]}$ The synonym self-teaching computers was also used in this time period. [12][13]',\n",
-                            "                        main_path='main-text.6',\n",
-                            "                        path_group=['main-text.5', 'main-text.6'],\n",
-                            "                        source_is_text=True\n",
-                            "                    )\n",
-                            "                ],\n",
-                            "                gen_ctx_paths=[\n",
-                            "                    'main-text.1',\n",
-                            "                    'main-text.2',\n",
-                            "                    'main-text.3',\n",
-                            "                    'main-text.4',\n",
-                            "                    'main-text.5',\n",
-                            "                    'main-text.6',\n",
-                            "                    'main-text.7',\n",
-                            "                    'main-text.8',\n",
-                            "                    'main-text.9',\n",
-                            "                    'main-text.10'\n",
-                            "                ]\n",
-                            "            ),\n",
-                            "            prompt=None\n",
-                            "        )\n",
-                            "    ],\n",
-                            "    search_result_items=[\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-                            "            chunk='History and relationships to other fields\\nThe term machine learning was coined in 1959 by \n",
-                            "Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \n",
-                            "$^{[10][11]}$ The synonym self-teaching computers was also used in this time period. [12][13]',\n",
-                            "            main_path='main-text.6',\n",
-                            "            path_group=['main-text.5', 'main-text.6'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-                            "            chunk=\"Machine learning\\nMachine learning (ML) is an umbrella term for solving problems for which \n",
-                            "development of algorithms by human programmers would be cost-prohibitive, and instead the problems are solved by \n",
-                            "helping machines 'discover ' their 'own ' algorithms, $^{[1]}$ without needing to be explicitly told what to do by \n",
-                            "any human-developed algorithms. $^{[2]}$ Recently, generative artificial neural networks have been able to surpass \n",
-                            "results of many previous approaches. $^{[3][4]}$ Machine-learning approaches have been applied to large language \n",
-                            "models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to \n",
-                            "develop algorithms to perform the needed tasks. [5][6]\",\n",
-                            "            main_path='main-text.2',\n",
-                            "            path_group=['main-text.1', 'main-text.2'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-                            "            chunk='Artificial intelligence\\nMachine learning (ML), reorganized and recognized as its own field, \n",
-                            "started to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling \n",
-                            "solvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from \n",
-                            "AI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory. [24]',\n",
-                            "            main_path='main-text.15',\n",
-                            "            path_group=['main-text.10', 'main-text.15'],\n",
-                            "            source_is_text=True\n",
-                            "        )\n",
-                            "    ]\n",
-                            ")\n",
-                            "
\n" - ], - "text/plain": [ - "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswer\u001b[0m=\u001b[32m'The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in \u001b[0m\n", - "\u001b[32mthe field of computer gaming and artificial intelligence.'\u001b[0m,\n", - " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'History and relationships to other fields\\nThe term machine learning was coined in \u001b[0m\n", - "\u001b[32m1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \u001b[0m\n", - "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ The synonym self-teaching computers was also used in this time period. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m12\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.6'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.6'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.1'\u001b[0m,\n", - " \u001b[32m'main-text.2'\u001b[0m,\n", - " \u001b[32m'main-text.3'\u001b[0m,\n", - " \u001b[32m'main-text.4'\u001b[0m,\n", - " \u001b[32m'main-text.5'\u001b[0m,\n", - " \u001b[32m'main-text.6'\u001b[0m,\n", - " \u001b[32m'main-text.7'\u001b[0m,\n", - " \u001b[32m'main-text.8'\u001b[0m,\n", - " \u001b[32m'main-text.9'\u001b[0m,\n", - " \u001b[32m'main-text.10'\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'History and relationships to other fields\\nThe term machine learning was coined in 1959 by \u001b[0m\n", - "\u001b[32mArthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \u001b[0m\n", - "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ The synonym self-teaching computers was also used in this time period. 
\u001b[0m\u001b[32m[\u001b[0m\u001b[32m12\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.6'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.6'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m\"Machine\u001b[0m\u001b[32m learning\\nMachine learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mML\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is an umbrella term for solving problems for which \u001b[0m\n", - "\u001b[32mdevelopment of algorithms by human programmers would be cost-prohibitive, and instead the problems are solved by \u001b[0m\n", - "\u001b[32mhelping machines 'discover ' their 'own ' algorithms, $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ without needing to be explicitly told what to do by \u001b[0m\n", - "\u001b[32many human-developed algorithms. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Recently, generative artificial neural networks have been able to surpass \u001b[0m\n", - "\u001b[32mresults of many previous approaches. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m3\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Machine-learning approaches have been applied to large language \u001b[0m\n", - "\u001b[32mmodels, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to \u001b[0m\n", - "\u001b[32mdevelop algorithms to perform the needed tasks. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m5\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m6\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\"\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.2'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.1'\u001b[0m, \u001b[32m'main-text.2'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Artificial intelligence\\nMachine learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mML\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, reorganized and recognized as its own field, \u001b[0m\n", - "\u001b[32mstarted to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling \u001b[0m\n", - "\u001b[32msolvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from \u001b[0m\n", - "\u001b[32mAI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory. 
\u001b[0m\u001b[32m[\u001b[0m\u001b[32m24\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.15'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.10'\u001b[0m, \u001b[32m'main-text.15'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"Who came up with the term 'machine learning'?\"\n", - "\n", - "# submit natural-language query on collection\n", - "question_query = RAGQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - "\n", - " ## optional retrieval params\n", - " retr_k=RETR_K,\n", - " # text_weight=TEXT_WEIGHT,\n", - " # rerank=RERANK,\n", - "\n", - " ## optional generation params\n", - " # model_id=\"ibm-mistralai/mixtral-8x7b-instruct-v01-q\",\n", - " # gen_params={\"random_seed\": 42, \"max_new_tokens\": 1024},\n", - " # prompt_template=\"Answer the query based on the context.\\n\\nContext: {{ context }}\\n\\nQuery: {{ query }}\",\n", - "\n", - " # gen_ctx_extr_method=\"window\",\n", - " # gen_ctx_window_size=5000,\n", - " # gen_ctx_window_lead_weight=0.5\n", - " # return_prompt=True,\n", - " # gen_timeout=10.0,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As seen by the returned `doc_hash`, this answer came from a different document than the previous one." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyZTQ3MGU3YjQyYTkyYzhlNWYyNTA5NDM2MjM2MTk0N2I5MjAzZTAwNzRjMjIyMzUwNWI0OTIxOTQwZWMwNzVhMSU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmU0NzBlN2I0MmE5MmM4ZTVmMjUwOTQzNjIzNjE5NDdiOTIwM2UwMDc0YzIyMjM1MDViNDkyMTk0MGVjMDc1YTElMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E2JTdEJTdE)." 
- ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "render_provenance_url(api=api, coords=coll_coords, retr_item=rag_result.answers[0].grounding.retr_items[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Semantic retrieval\n", - "\n", - "Besides RAG, which includes natural language generation, a user may only be interested in\n", - "the semantic retrieval part.\n", - "\n", - "This can be obtained very similarly to RAG, as shown below:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SearchResult(\n",
-                            "    search_result_items=[\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
-                            "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
-                            "            main_path='main-text.70',\n",
-                            "            path_group=['main-text.69', 'main-text.70'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
-                            "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
-                            "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
-                            "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
-                            "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
-                            "solutions.',\n",
-                            "            main_path='main-text.71',\n",
-                            "            path_group=['main-text.69', 'main-text.71'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \n",
-                            "scientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \n",
-                            "Industry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.',\n",
-                            "            main_path='main-text.74',\n",
-                            "            path_group=['main-text.69', 'main-text.74'],\n",
-                            "            source_is_text=True\n",
-                            "        )\n",
-                            "    ]\n",
-                            ")\n",
-                            "
\n" - ], - "text/plain": [ - "\u001b[1;35mSearchResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", - "\u001b[32mEuropean branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", - "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", - "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", - "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", - "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", - "\u001b[32msolutions.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \u001b[0m\n", - "\u001b[32mscientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \u001b[0m\n", - "\u001b[32mIndustry Solutions and Security Research. 
The lab is currently managed by Alessandro Curioni.'\u001b[0m,\n",
-                            "            \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.74'\u001b[0m,\n",
-                            "            \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.74'\u001b[0m\u001b[1m]\u001b[0m,\n",
-                            "            \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n",
-                            "        \u001b[1m)\u001b[0m\n",
-                            "    \u001b[1m]\u001b[0m\n",
-                            "\u001b[1m)\u001b[0m\n"
-                        ]
-                    },
-                    "metadata": {},
-                    "output_type": "display_data"
-                }
-            ],
-            "source": [
-                "question = \"Where is the IBM lab in Zurich?\"\n",
-                "\n",
-                "# submit natural-language query on collection\n",
-                "question_query = SemanticQuery(\n",
-                "    question=question,\n",
-                "    project=PROJ_KEY,\n",
-                "    data_source=data_source,\n",
-                "\n",
-                "    ## optional params\n",
-                "    retr_k=RETR_K,\n",
-                "    # text_weight=TEXT_WEIGHT,\n",
-                "    # rerank=RERANK,\n",
-                ")\n",
-                "api_output = api.queries.run(question_query)\n",
-                "search_result = SearchResult.from_api_output(api_output, raise_on_error=RAISE)\n",
-                "\n",
-                "rich.print(search_result)"
-            ]
-        }
-    ],
-    "metadata": {
-        "kernelspec": {
-            "display_name": "Python 3 (ipykernel)",
-            "language": "python",
-            "name": "python3"
-        },
-        "language_info": {
-            "codemirror_mode": {
-                "name": "ipython",
-                "version": 3
-            },
-            "file_extension": ".py",
-            "mimetype": "text/x-python",
-            "name": "python",
-            "nbconvert_exporter": "python",
-            "pygments_lexer": "ipython3",
-            "version": "3.10.4"
-        }
-    },
-    "nbformat": 4,
-    "nbformat_minor": 5
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "99f717ef-4cba-4300-b258-0b1c248cb873",
+   "metadata": {},
+   "source": [
+    "# RAG and Semantic Retrieval on a Document Collection\n",
+    "\n",
+    "Deep Search allows users to interact with the documents using conversational AI, i.e. you interact with a virtual assistant which answers your questions using the information in the corpus.\n",
+    "\n",
+    "In this example we demonstrate how to achieve the same interaction programmatically.\n",
+    "\n",
+    "### Access required\n",
+    "\n",
+    "The content of this notebook requires access to Deep Search capabilities which are not\n",
+    "available on the public access system.\n",
+    "\n",
+    "[Contact us](https://ds4sd.github.io) if you are interested in exploring\n",
+    "these Deep Search capabilities.\n",
+    "\n",
+    "\n",
+    "### GenAI Integration required\n",
+    "\n",
+    "When interacting with the virtual assistant, Deep Search requires a connection to a Generative AI API. Currently, we support connections to [watsonx.ai](https://www.ibm.com/products/watsonx-ai) or the IBM-internal GenAI platform BAM.\n",
+    "\n",
+    "Deep Search allows custom GenAI configurations for each project.\n",
+    "In the following example you will need to work in a project which has such GenAI capabilities activated."
+ ] + }, + { + "cell_type": "markdown", + "id": "256aef50-71a1-4278-9b22-17cb99a6566e", + "metadata": {}, + "source": [ + "### Set notebook parameters\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5b244bdd-1b52-41ff-b63e-9a203570d210", + "metadata": {}, + "outputs": [], + "source": [ + "from dsnotebooks.settings import CollQANotebookSettings\n", + "\n", + "# notebooks settings auto-loaded from .env / env vars\n", + "notebook_settings = CollQANotebookSettings()\n", + "\n", + "PROFILE_NAME = notebook_settings.profile # the profile to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "INDEX_KEY = notebook_settings.sem_on_idx_key # the collection to use\n", + "\n", + "SKIP_INGESTED_DOCS = (\n", + " notebook_settings.skip_ingested_docs\n", + ") # whether to skip any already semantically ingested docs\n", + "\n", + "RETR_K = notebook_settings.retr_k # the number of search results to retrieve\n", + "TEXT_WEIGHT = (\n", + " notebook_settings.text_weight\n", + ") # the weight of lexical search (0.0: semantic-only, 1.0: lexical-only, anything in between: hybrid search)\n", + "RERANK = notebook_settings.rerank # whether to rerank the search results\n", + "RAISE = (\n", + " notebook_settings.raise_on_sem_err\n", + ") # whether semantic operation errors should raise an exception or be reflected in response fields" + ] + }, + { + "cell_type": "markdown", + "id": "a5269060-bb5f-4fe3-9b64-547202db6714", + "metadata": {}, + "source": [ + "### Import example dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5d236ea0-db1c-4171-8e11-cdd0bad69d66", + "metadata": {}, + "outputs": [], + "source": [ + "# Import standard dependenices\n", + "import pandas as pd\n", + "import rich\n", + "\n", + "# IPython utilities\n", + "from IPython.display import display, Markdown\n", + "\n", + "# Import the deepsearch-toolkit\n", + "from deepsearch.cps.client.api import CpsApi\n", + "from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource\n", + "from deepsearch.cps.queries import DataQuery, RAGQuery, SemanticQuery\n", + "from deepsearch.cps.queries.results import RAGResult, SearchResult, SearchResultItem" + ] + }, + { + "cell_type": "markdown", + "id": "293c249b-6018-46f2-b4d8-795f994d4729", + "metadata": {}, + "source": [ + "### Connect to Deep Search" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9c108432-a285-4c7b-a996-008ac3ff3d7a", + "metadata": {}, + "outputs": [], + "source": [ + "api = CpsApi.from_env(profile_name=PROFILE_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Utils" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def render_provenance_url(\n", + " api: CpsApi,\n", + " coords: ElasticProjectDataCollectionSource,\n", + " retr_item: SearchResultItem,\n", + "):\n", + " ## compute URL to the document in the Deep Search UI\n", + " item_index = int(retr_item.main_path[retr_item.main_path.rfind(\".\") + 1 :])\n", + " doc_url = api.documents.generate_url(\n", + " document_hash=retr_item.doc_hash,\n", + " data_source=coords,\n", + " item_index=item_index,\n", + " )\n", + " display(\n", + " Markdown(\n", + " f\"The provenance of the answer can be inspected on the [source document]({doc_url}).\"\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "38cde869-46d1-4833-8eb3-2381b5e5fb68", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "Prepare the collection coordinates:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "coll_coords = ElasticProjectDataCollectionSource(\n", + " proj_key=PROJ_KEY,\n", + " index_key=INDEX_KEY,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are using a small collection, so we can just list its documents to get an idea of its contents (for more details on querying, check the [Data Query Quick Start](https://github.com/DS4SD/deepsearch-examples/tree/main/examples/data_query_quick_start))." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2b38875e-f39c-4dd5-9d42-3ffca5d0bdac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished fetching all data. Total is 10 records.\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FilenameDocHash
0natural-language-processing.pdf000f892ddcc67f165797a96e94f44fb9e0697c7912a383...
1ibm-z.pdf07e56eb5a10f725fccad9386d126b7b05bec1fa71b9b3d...
2ibm.pdf234bc5cf2c860d49574b0ff7191c354b7bbc11472a0997...
3ibm-the-great-mind-challenge.pdf335120a57b418655196e3315b562a2f9e89cedeaef9318...
4turing-award.pdf8a7c91a269abc3063df9f4e19f7961ddb8e2393fa0f272...
5ibm-research.pdfb30bc667a324ae111d025526563b674a8d3fd869bc07c8...
6artificial-intelligence.pdfb60a87c1d62a59d517f2fd6f2d3ea1a96c58b651332a8b...
7machine-learning.pdfe470e7b42a92c8e5f25094362361947b9203e0074c2223...
8deep-blue-chess-computer.pdffa7ce2f66a7a5e061813d36348425f81d9e7ebc23454d8...
9red-hat.pdffb53bb607f2e9642d7fe044585d1dcdb052c57febe1b87...
\n", + "
" + ], + "text/plain": [ + " Filename \\\n", + "0 natural-language-processing.pdf \n", + "1 ibm-z.pdf \n", + "2 ibm.pdf \n", + "3 ibm-the-great-mind-challenge.pdf \n", + "4 turing-award.pdf \n", + "5 ibm-research.pdf \n", + "6 artificial-intelligence.pdf \n", + "7 machine-learning.pdf \n", + "8 deep-blue-chess-computer.pdf \n", + "9 red-hat.pdf \n", + "\n", + " DocHash \n", + "0 000f892ddcc67f165797a96e94f44fb9e0697c7912a383... \n", + "1 07e56eb5a10f725fccad9386d126b7b05bec1fa71b9b3d... \n", + "2 234bc5cf2c860d49574b0ff7191c354b7bbc11472a0997... \n", + "3 335120a57b418655196e3315b562a2f9e89cedeaef9318... \n", + "4 8a7c91a269abc3063df9f4e19f7961ddb8e2393fa0f272... \n", + "5 b30bc667a324ae111d025526563b674a8d3fd869bc07c8... \n", + "6 b60a87c1d62a59d517f2fd6f2d3ea1a96c58b651332a8b... \n", + "7 e470e7b42a92c8e5f25094362361947b9203e0074c2223... \n", + "8 fa7ce2f66a7a5e061813d36348425f81d9e7ebc23454d8... \n", + "9 fb53bb607f2e9642d7fe044585d1dcdb052c57febe1b87... " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Prepare the data query\n", + "query = DataQuery(\n", + " search_query=\"*\", # The search query to be executed\n", + " source=[ # Which fields of documents we want to fetch\n", + " \"file-info.document-hash\",\n", + " \"file-info.filename\",\n", + " # \"description.title\",\n", + " ],\n", + " coordinates=coll_coords, # The data collection to be queries\n", + ")\n", + "\n", + "# Query Deep Search for the documents matching the query\n", + "results = []\n", + "query_results = api.queries.run(query)\n", + "for row in query_results.outputs[\"data_outputs\"]:\n", + " # Add row to results table\n", + " results.append(\n", + " {\n", + " \"Filename\": row[\"_source\"][\"file-info\"][\"filename\"],\n", + " \"DocHash\": row[\"_source\"][\"file-info\"][\"document-hash\"],\n", + " # \"Title\": row[\"_source\"].get(\"description\", {}).get(\"title\"),\n", + " }\n", + " )\n", + "\n", + "print(f\"Finished fetching all data. 
Total is {len(results)} records.\")\n", + "\n", + "# Visualize the table with all results\n", + "df = pd.json_normalize(results)\n", + "display(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare source" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from deepsearch.cps.client.components.documents import (\n", + " PrivateDataCollectionSource,\n", + " PrivateDataDocumentSource,\n", + " PublicDataDocumentSource,\n", + ")\n", + "\n", + "data_source = PrivateDataCollectionSource(\n", + " source=coll_coords,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ingestion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the cell below we show how to semantically index your collection (indexing of already indexed docs is controlled via param `skip_ingested_docs`):" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/pva/work/github.com/DS4SD/deepsearch-examples/.venv/lib/python3.10/site-packages/pydantic/main.py:314: UserWarning: Pydantic serializer warnings:\n", + " Expected `list[str]` but got `_LiteralGenericAlias` - serialized value may not be as expected\n", + " return self.__pydantic_serializer__.to_python(\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ing_out': {}}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# launch the ingestion of the collection for DocumentQA\n", + "task = api.documents.semantic_ingest(\n", + " project=PROJ_KEY,\n", + " data_source=data_source,\n", + " skip_ingested_docs=SKIP_INGESTED_DOCS,\n", + ")\n", + "\n", + "# wait for the ingestion task to finish\n", + "api.tasks.wait_for(PROJ_KEY, task.task_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RAG" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ee573e76-98ea-43ce-a2ba-a81f64b3adf3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
RAGResult(\n",
+       "    answers=[\n",
+       "        RAGAnswerItem(\n",
+       "            answer='The IBM lab in Zurich is located in Rüschlikon, Switzerland.',\n",
+       "            grounding=RAGGroundingInfo(\n",
+       "                retr_items=[\n",
+       "                    SearchResultItem(\n",
+       "                        doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "                        chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, \n",
+       "ZRL) is the European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, \n",
+       "Switzerland.',\n",
+       "                        main_path='main-text.70',\n",
+       "                        path_group=['main-text.69', 'main-text.70'],\n",
+       "                        source_is_text=True\n",
+       "                    )\n",
+       "                ],\n",
+       "                gen_ctx_paths=[\n",
+       "                    'main-text.58',\n",
+       "                    'main-text.59',\n",
+       "                    'main-text.60',\n",
+       "                    'main-text.61',\n",
+       "                    'main-text.62',\n",
+       "                    'main-text.63',\n",
+       "                    'main-text.64',\n",
+       "                    'main-text.65',\n",
+       "                    'main-text.66',\n",
+       "                    'main-text.67',\n",
+       "                    'main-text.68',\n",
+       "                    'main-text.69',\n",
+       "                    'main-text.70',\n",
+       "                    'main-text.71',\n",
+       "                    'main-text.72',\n",
+       "                    'main-text.73',\n",
+       "                    'main-text.74',\n",
+       "                    'main-text.75',\n",
+       "                    'main-text.76'\n",
+       "                ]\n",
+       "            ),\n",
+       "            prompt=None\n",
+       "        )\n",
+       "    ],\n",
+       "    search_result_items=[\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
+       "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
+       "            main_path='main-text.70',\n",
+       "            path_group=['main-text.69', 'main-text.70'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
+       "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
+       "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
+       "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
+       "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
+       "solutions.',\n",
+       "            main_path='main-text.71',\n",
+       "            path_group=['main-text.69', 'main-text.71'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \n",
+       "scientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \n",
+       "Industry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.',\n",
+       "            main_path='main-text.74',\n",
+       "            path_group=['main-text.69', 'main-text.74'],\n",
+       "            source_is_text=True\n",
+       "        )\n",
+       "    ]\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33manswer\u001b[0m=\u001b[32m'The IBM lab in Zurich is located in Rüschlikon, Switzerland.'\u001b[0m,\n", + " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, \u001b[0m\n", + "\u001b[32mZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, \u001b[0m\n", + "\u001b[32mSwitzerland.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[32m'main-text.58'\u001b[0m,\n", + " \u001b[32m'main-text.59'\u001b[0m,\n", + " \u001b[32m'main-text.60'\u001b[0m,\n", + " \u001b[32m'main-text.61'\u001b[0m,\n", + " \u001b[32m'main-text.62'\u001b[0m,\n", + " \u001b[32m'main-text.63'\u001b[0m,\n", + " \u001b[32m'main-text.64'\u001b[0m,\n", + " \u001b[32m'main-text.65'\u001b[0m,\n", + " \u001b[32m'main-text.66'\u001b[0m,\n", + " \u001b[32m'main-text.67'\u001b[0m,\n", + " \u001b[32m'main-text.68'\u001b[0m,\n", + " \u001b[32m'main-text.69'\u001b[0m,\n", + " \u001b[32m'main-text.70'\u001b[0m,\n", + " \u001b[32m'main-text.71'\u001b[0m,\n", + " \u001b[32m'main-text.72'\u001b[0m,\n", + " \u001b[32m'main-text.73'\u001b[0m,\n", + " \u001b[32m'main-text.74'\u001b[0m,\n", + " \u001b[32m'main-text.75'\u001b[0m,\n", + " \u001b[32m'main-text.76'\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", + "\u001b[32mEuropean branch of IBM Research. 
It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", + "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", + "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", + "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", + "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", + "\u001b[32msolutions.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \u001b[0m\n", + "\u001b[32mscientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \u001b[0m\n", + "\u001b[32mIndustry Solutions and Security Research. 
The lab is currently managed by Alessandro Curioni.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.74'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.74'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "question = \"Where is the IBM lab in Zurich?\"\n", + "\n", + "# submit natural-language query on collection\n", + "question_query = RAGQuery(\n", + " question=question,\n", + " project=PROJ_KEY,\n", + " data_source=data_source,\n", + " ## optional retrieval params\n", + " retr_k=RETR_K,\n", + ")\n", + "api_output = api.queries.run(question_query)\n", + "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", + "\n", + "rich.print(rag_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Additionally, we can generate a provenance URL to the document in the Deep Search UI:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyYjMwYmM2NjdhMzI0YWUxMTFkMDI1NTI2NTYzYjY3NGE4ZDNmZDg2OWJjMDdjOGZkMjA0YWE5NWIwNWQ0MWYwYyU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmIzMGJjNjY3YTMyNGFlMTExZDAyNTUyNjU2M2I2NzRhOGQzZmQ4NjliYzA3YzhmZDIwNGFhOTViMDVkNDFmMGMlMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E3MCU3RCU3RA%3D%3D)." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render_provenance_url(\n", + " api=api, coords=coll_coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us try out a different question on our document corpus.\n", + "Here we also include (commented out) various additional parameters the user can optionally set:\n", + "- `retr_k`: number of items to retrieve\n", + "- `text_weight`: weight of lexical search (`0.0`: fully semantic search, `1.0`: fully lexical search, anything in-between: hybrid search)\n", + "- `rerank`: whether to rerank the retrieval results\n", + "- `gen_ctx_extr_method` (Literal[\"window\", \"page\"], optional): method for gen context extraction from document; defaults to \"window\"\n", + "- `gen_ctx_window_size` (int, optional): (relevant only if `gen_ctx_extr_method` is \"window\") max chars to use for extracted gen context (actual extraction quantized on doc item level); defaults to 5000\n", + "- `gen_ctx_window_lead_weight` (float, optional): (relevant only if `gen_ctx_extr_method` is \"window\") weight of leading text for distributing remaining window size after extracting the `main_path`; defaults to 0.5 (centered around `main_path`)\n", + "- `return_prompt` (bool, optional): whether to return the instantiated prompt; defaults to False\n", + "- `gen_timeout` (float, optional): timeout for LLM generation; defaults to None, i.e. determined by system\n", + "\n", + "For more details refer to `deepsearch.cps.queries.RAGQuery`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "89d95a17-1569-4c90-a983-8ca437b7569d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
RAGResult(\n",
+       "    answers=[\n",
+       "        RAGAnswerItem(\n",
+       "            answer='The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in \n",
+       "the field of computer gaming and artificial intelligence.',\n",
+       "            grounding=RAGGroundingInfo(\n",
+       "                retr_items=[\n",
+       "                    SearchResultItem(\n",
+       "                        doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
+       "                        chunk='History and relationships to other fields\\nThe term machine learning was coined in \n",
+       "1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \n",
+       "$^{[10][11]}$ The synonym self-teaching computers was also used in this time period. [12][13]',\n",
+       "                        main_path='main-text.6',\n",
+       "                        path_group=['main-text.5', 'main-text.6'],\n",
+       "                        source_is_text=True\n",
+       "                    )\n",
+       "                ],\n",
+       "                gen_ctx_paths=[\n",
+       "                    'main-text.1',\n",
+       "                    'main-text.2',\n",
+       "                    'main-text.3',\n",
+       "                    'main-text.4',\n",
+       "                    'main-text.5',\n",
+       "                    'main-text.6',\n",
+       "                    'main-text.7',\n",
+       "                    'main-text.8',\n",
+       "                    'main-text.9',\n",
+       "                    'main-text.10'\n",
+       "                ]\n",
+       "            ),\n",
+       "            prompt=None\n",
+       "        )\n",
+       "    ],\n",
+       "    search_result_items=[\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
+       "            chunk='History and relationships to other fields\\nThe term machine learning was coined in 1959 by \n",
+       "Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \n",
+       "$^{[10][11]}$ The synonym self-teaching computers was also used in this time period. [12][13]',\n",
+       "            main_path='main-text.6',\n",
+       "            path_group=['main-text.5', 'main-text.6'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
+       "            chunk=\"Machine learning\\nMachine learning (ML) is an umbrella term for solving problems for which \n",
+       "development of algorithms by human programmers would be cost-prohibitive, and instead the problems are solved by \n",
+       "helping machines 'discover ' their 'own ' algorithms, $^{[1]}$ without needing to be explicitly told what to do by \n",
+       "any human-developed algorithms. $^{[2]}$ Recently, generative artificial neural networks have been able to surpass \n",
+       "results of many previous approaches. $^{[3][4]}$ Machine-learning approaches have been applied to large language \n",
+       "models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to \n",
+       "develop algorithms to perform the needed tasks. [5][6]\",\n",
+       "            main_path='main-text.2',\n",
+       "            path_group=['main-text.1', 'main-text.2'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
+       "            chunk='Artificial intelligence\\nMachine learning (ML), reorganized and recognized as its own field, \n",
+       "started to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling \n",
+       "solvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from \n",
+       "AI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory. [24]',\n",
+       "            main_path='main-text.15',\n",
+       "            path_group=['main-text.10', 'main-text.15'],\n",
+       "            source_is_text=True\n",
+       "        )\n",
+       "    ]\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33manswer\u001b[0m=\u001b[32m'The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in \u001b[0m\n", + "\u001b[32mthe field of computer gaming and artificial intelligence.'\u001b[0m,\n", + " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'History and relationships to other fields\\nThe term machine learning was coined in \u001b[0m\n", + "\u001b[32m1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \u001b[0m\n", + "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ The synonym self-teaching computers was also used in this time period. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m12\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.6'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.6'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[32m'main-text.1'\u001b[0m,\n", + " \u001b[32m'main-text.2'\u001b[0m,\n", + " \u001b[32m'main-text.3'\u001b[0m,\n", + " \u001b[32m'main-text.4'\u001b[0m,\n", + " \u001b[32m'main-text.5'\u001b[0m,\n", + " \u001b[32m'main-text.6'\u001b[0m,\n", + " \u001b[32m'main-text.7'\u001b[0m,\n", + " \u001b[32m'main-text.8'\u001b[0m,\n", + " \u001b[32m'main-text.9'\u001b[0m,\n", + " \u001b[32m'main-text.10'\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'History and relationships to other fields\\nThe term machine learning was coined in 1959 by \u001b[0m\n", + "\u001b[32mArthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \u001b[0m\n", + "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ The synonym self-teaching computers was also used in this time period. 
\u001b[0m\u001b[32m[\u001b[0m\u001b[32m12\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.6'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.6'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m\"Machine\u001b[0m\u001b[32m learning\\nMachine learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mML\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is an umbrella term for solving problems for which \u001b[0m\n", + "\u001b[32mdevelopment of algorithms by human programmers would be cost-prohibitive, and instead the problems are solved by \u001b[0m\n", + "\u001b[32mhelping machines 'discover ' their 'own ' algorithms, $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ without needing to be explicitly told what to do by \u001b[0m\n", + "\u001b[32many human-developed algorithms. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Recently, generative artificial neural networks have been able to surpass \u001b[0m\n", + "\u001b[32mresults of many previous approaches. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m3\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Machine-learning approaches have been applied to large language \u001b[0m\n", + "\u001b[32mmodels, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to \u001b[0m\n", + "\u001b[32mdevelop algorithms to perform the needed tasks. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m5\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m6\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\"\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.2'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.1'\u001b[0m, \u001b[32m'main-text.2'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Artificial intelligence\\nMachine learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mML\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, reorganized and recognized as its own field, \u001b[0m\n", + "\u001b[32mstarted to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling \u001b[0m\n", + "\u001b[32msolvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from \u001b[0m\n", + "\u001b[32mAI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory. 
\u001b[0m\u001b[32m[\u001b[0m\u001b[32m24\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.15'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.10'\u001b[0m, \u001b[32m'main-text.15'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "question = \"Who came up with the term 'machine learning'?\"\n", + "\n", + "# submit natural-language query on collection\n", + "question_query = RAGQuery(\n", + " question=question,\n", + " project=PROJ_KEY,\n", + " data_source=data_source,\n", + " ## optional retrieval params\n", + " retr_k=RETR_K,\n", + " # text_weight=TEXT_WEIGHT,\n", + " # rerank=RERANK,\n", + " ## optional generation params\n", + " # model_id=\"ibm-mistralai/mixtral-8x7b-instruct-v01-q\",\n", + " # gen_params={\"random_seed\": 42, \"max_new_tokens\": 1024},\n", + " # prompt_template=\"Answer the query based on the context.\\n\\nContext: {{ context }}\\n\\nQuery: {{ query }}\",\n", + " # gen_ctx_extr_method=\"window\",\n", + " # gen_ctx_window_size=5000,\n", + " # gen_ctx_window_lead_weight=0.5\n", + " # return_prompt=True,\n", + " # gen_timeout=10.0,\n", + ")\n", + "api_output = api.queries.run(question_query)\n", + "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", + "\n", + "rich.print(rag_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As seen by the returned `doc_hash`, this answer came from a different document than the previous one." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyZTQ3MGU3YjQyYTkyYzhlNWYyNTA5NDM2MjM2MTk0N2I5MjAzZTAwNzRjMjIyMzUwNWI0OTIxOTQwZWMwNzVhMSU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmU0NzBlN2I0MmE5MmM4ZTVmMjUwOTQzNjIzNjE5NDdiOTIwM2UwMDc0YzIyMjM1MDViNDkyMTk0MGVjMDc1YTElMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E2JTdEJTdE)." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render_provenance_url(\n", + " api=api, coords=coll_coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic retrieval\n", + "\n", + "Besides RAG, which includes natural language generation, a user may only be interested in\n", + "the semantic retrieval part.\n", + "\n", + "This can be obtained very similarly to RAG, as shown below:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
SearchResult(\n",
+       "    search_result_items=[\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
+       "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
+       "            main_path='main-text.70',\n",
+       "            path_group=['main-text.69', 'main-text.70'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
+       "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
+       "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
+       "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
+       "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
+       "solutions.',\n",
+       "            main_path='main-text.71',\n",
+       "            path_group=['main-text.69', 'main-text.71'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \n",
+       "scientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \n",
+       "Industry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.',\n",
+       "            main_path='main-text.74',\n",
+       "            path_group=['main-text.69', 'main-text.74'],\n",
+       "            source_is_text=True\n",
+       "        )\n",
+       "    ]\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mSearchResult\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", + "\u001b[32mEuropean branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", + "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", + "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", + "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", + "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", + "\u001b[32msolutions.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \u001b[0m\n", + "\u001b[32mscientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \u001b[0m\n", + "\u001b[32mIndustry Solutions and Security Research. 
The lab is currently managed by Alessandro Curioni.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.74'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.74'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "question = \"Where is the IBM lab in Zurich?\"\n", + "\n", + "# submit natural-language query on collection\n", + "question_query = SemanticQuery(\n", + " question=question,\n", + " project=PROJ_KEY,\n", + " data_source=data_source,\n", + " ## optional params\n", + " retr_k=RETR_K,\n", + " # text_weight=TEXT_WEIGHT,\n", + " # rerank=RERANK,\n", + ")\n", + "api_output = api.queries.run(question_query)\n", + "search_result = SearchResult.from_api_output(api_output, raise_on_error=RAISE)\n", + "\n", + "rich.print(search_result)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/qa_single_doc/single_doc_qa.ipynb b/examples/qa_single_doc/single_doc_qa.ipynb index 36f3683..90ab125 100644 --- a/examples/qa_single_doc/single_doc_qa.ipynb +++ b/examples/qa_single_doc/single_doc_qa.ipynb @@ -48,8 +48,8 @@ "# notebooks settings auto-loaded from .env / env vars\n", "notebook_settings = DocQANotebookSettings()\n", "\n", - "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROFILE_NAME = notebook_settings.profile # the profile to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "# index and doc for doc QA from semantically indexed collection\n", "SEM_ON_IDX_KEY = notebook_settings.sem_on_idx_key\n", @@ -59,12 +59,18 @@ "SEM_OFF_IDX_KEY = notebook_settings.sem_off_idx_key\n", "SEM_OFF_IDX_DOC_HASH = notebook_settings.sem_off_idx_doc_hash\n", "\n", - "SKIP_INGESTED_DOCS = notebook_settings.skip_ingested_docs # whether to skip any already semantically ingested docs\n", + "SKIP_INGESTED_DOCS = (\n", + " notebook_settings.skip_ingested_docs\n", + ") # whether to skip any already semantically ingested docs\n", "\n", - "RETR_K = notebook_settings.retr_k # the number of search results to retrieve\n", - "TEXT_WEIGHT = notebook_settings.text_weight # the weight of lexical search (0.0: semantic-only, 1.0: lexical-only, anything in between: hybrid search)\n", - "RERANK = notebook_settings.rerank # whether to rerank the search results\n", - "RAISE = notebook_settings.raise_on_sem_err # whether semantic operation errors should raise an exception or be reflected in response fields" + "RETR_K = notebook_settings.retr_k # the number of search results to retrieve\n", + "TEXT_WEIGHT = (\n", + " notebook_settings.text_weight\n", + ") # the weight of lexical search (0.0: semantic-only, 1.0: lexical-only, anything in between: hybrid search)\n", + "RERANK = notebook_settings.rerank # whether to rerank the search results\n", + "RAISE = (\n", + " notebook_settings.raise_on_sem_err\n", + ") # whether 
semantic operation errors should raise an exception or be reflected in response fields" ] }, { @@ -92,7 +98,7 @@ "from deepsearch.cps.client.api import CpsApi\n", "from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource\n", "from deepsearch.cps.queries import RAGQuery, SemanticQuery\n", - "from deepsearch.cps.queries.results import RAGResult, SearchResult, SearchResultItem\n" + "from deepsearch.cps.queries.results import RAGResult, SearchResult, SearchResultItem" ] }, { @@ -110,7 +116,7 @@ "metadata": {}, "outputs": [], "source": [ - "api = CpsApi.from_env(profile_name=PROFILE_NAME)\n" + "api = CpsApi.from_env(profile_name=PROFILE_NAME)" ] }, { @@ -132,13 +138,17 @@ " retr_item: SearchResultItem,\n", "):\n", " ## compute URL to the document in the Deep Search UI\n", - " item_index = int(retr_item.main_path[retr_item.main_path.rfind(\".\")+1:])\n", + " item_index = int(retr_item.main_path[retr_item.main_path.rfind(\".\") + 1 :])\n", " doc_url = api.documents.generate_url(\n", " document_hash=retr_item.doc_hash,\n", " data_source=coords,\n", " item_index=item_index,\n", " )\n", - " display(Markdown(f\"The provenance of the answer can be inspected on the [source document]({doc_url}).\"))" + " display(\n", + " Markdown(\n", + " f\"The provenance of the answer can be inspected on the [source document]({doc_url}).\"\n", + " )\n", + " )" ] }, { @@ -377,7 +387,6 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional retrieval params\n", " retr_k=RETR_K,\n", ")\n", @@ -413,7 +422,9 @@ } ], "source": [ - "render_provenance_url(api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0])" + "render_provenance_url(\n", + " api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", + ")" ] }, { @@ -693,23 +704,19 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional retrieval params\n", " retr_k=RETR_K,\n", " # text_weight=TEXT_WEIGHT,\n", " rerank=RERANK,\n", - "\n", " ## optional generation params\n", " # model_id=\"ibm-mistralai/mixtral-8x7b-instruct-v01-q\",\n", " # gen_params={\"random_seed\": 42, \"max_new_tokens\": 1024},\n", " # prompt_template=\"Answer the query based on the context.\\n\\nContext: {{ context }}\\n\\nQuery: {{ query }}\",\n", - "\n", " # gen_ctx_extr_method=\"window\",\n", " # gen_ctx_window_size=5000,\n", " # gen_ctx_window_lead_weight=0.5\n", " # return_prompt=True,\n", " # gen_timeout=10.0,\n", - "\n", ")\n", "api_output = api.queries.run(question_query)\n", "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", @@ -736,7 +743,9 @@ } ], "source": [ - "render_provenance_url(api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0])" + "render_provenance_url(\n", + " api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", + ")" ] }, { @@ -844,7 +853,6 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional params\n", " retr_k=RETR_K,\n", " # text_weight=TEXT_WEIGHT,\n", @@ -1150,12 +1158,10 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional retrieval params\n", " retr_k=4,\n", " # text_weight=TEXT_WEIGHT,\n", " rerank=RERANK,\n", - "\n", " ## optional generation params\n", " # model_id=\"ibm-mistralai/mixtral-8x7b-instruct-v01-q\",\n", " # gen_params={\"random_seed\": 42, \"max_new_tokens\": 1024},\n", @@ -1164,7 +1170,7 @@ 
"api_output = api.queries.run(question_query)\n", "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", "\n", - "rich.print(rag_result)\n" + "rich.print(rag_result)" ] }, { @@ -1186,7 +1192,9 @@ } ], "source": [ - "render_provenance_url(api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0])" + "render_provenance_url(\n", + " api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", + ")" ] }, { @@ -1319,7 +1327,6 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional params\n", " retr_k=4,\n", " # text_weight=TEXT_WEIGHT,\n", @@ -1627,7 +1634,6 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional retrieval params\n", " retr_k=RETR_K,\n", ")\n",