From ce9a2fa5279ff99441c2c2332078b2ba7b6b0e01 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Mon, 27 May 2024 16:54:18 +0200
Subject: [PATCH] chore: apply black formatting and enable CI on pre-commit hooks (#66)

---
 .github/actions/setup-poetry/action.yml       |   19 +
 .github/workflows/checks.yml                  |   18 +
 .pre-commit-config.yaml                       |    2 +-
 .../manage_attachments.ipynb                  |   20 +-
 .../upload_converted_documents.ipynb          |   41 +-
 examples/data_query_chemistry/chemistry.ipynb |   83 +-
 .../chemistry_patcid.ipynb                    |   55 +-
 .../data_query_quick_start/quick_start.ipynb  |   75 +-
 examples/data_query_snippets/snippets.ipynb   |  160 +-
 .../document_bulk_upload/run_batch_upload.py  |   33 +-
 .../convert_documents_custom.ipynb            |   18 +-
 .../extract_tables.ipynb                      |   65 +-
 .../convert_documents.ipynb                   |   32 +-
 .../visualize_bbox.ipynb                      |  117 +-
 .../integration_argilla/argilla_upload.ipynb  |    4 +-
 .../kg_download_quick_start.ipynb             |  120 +-
 .../nlp_for_materials/nlp_for_materials.ipynb |   68 +-
 .../nlp_for_references.ipynb                  |   52 +-
 .../nlp_on_documents/nlp_on_documents.ipynb   |   91 +-
 .../qa_doc_collection/doc_collection_qa.ipynb | 1975 +++++++++--------
 examples/qa_single_doc/single_doc_qa.ipynb    |   56 +-
 21 files changed, 1678 insertions(+), 1426 deletions(-)
 create mode 100644 .github/actions/setup-poetry/action.yml
 create mode 100644 .github/workflows/checks.yml

diff --git a/.github/actions/setup-poetry/action.yml b/.github/actions/setup-poetry/action.yml
new file mode 100644
index 0000000..8fe1b14
--- /dev/null
+++ b/.github/actions/setup-poetry/action.yml
@@ -0,0 +1,19 @@
+name: 'Set up Poetry and install'
+description: 'Set up a specific version of Poetry and install dependencies using caching.'
+inputs:
+  python-version:
+    description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax."
+ default: '3.10' +runs: + using: 'composite' + steps: + - name: Install poetry + run: pipx install poetry==1.8.3 + shell: bash + - uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + cache: 'poetry' + - name: Install dependencies + run: poetry install --all-extras + shell: bash \ No newline at end of file diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml new file mode 100644 index 0000000..971e1e2 --- /dev/null +++ b/.github/workflows/checks.yml @@ -0,0 +1,18 @@ +on: + push: + branches: + - "**" + +jobs: + run-checks: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10'] + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-poetry + with: + python-version: ${{ matrix.python-version }} + - name: Run styling check + run: poetry run pre-commit run --all-files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22b45ba..47a8da8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: system name: Black - entry: poetry run black nbrunner dsnotebooks examples + entry: poetry run black --include '(\.py|\.ipynb)$' nbrunner dsnotebooks examples pass_filenames: false language: system files: '(\.py|\.ipynb)$' diff --git a/examples/attachment_management/manage_attachments.ipynb b/examples/attachment_management/manage_attachments.ipynb index ea311ef..5a25a2f 100644 --- a/examples/attachment_management/manage_attachments.ipynb +++ b/examples/attachment_management/manage_attachments.ipynb @@ -42,10 +42,10 @@ "# notebook settings auto-loaded from .env / env vars\n", "notebook_settings = ProjectNotebookSettings()\n", "\n", - "PROFILE_NAME = notebook_settings.profile # profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # project to use\n", + "PROFILE_NAME = notebook_settings.profile # profile to use\n", + "PROJ_KEY = notebook_settings.proj_key # project to use\n", "INDEX_NAME = notebook_settings.new_idx_name # index to create\n", - "CLEANUP = notebook_settings.cleanup # whether to clean up\n", + "CLEANUP = notebook_settings.cleanup # whether to clean up\n", "ATTACHMENT_KEY = \"usr_attachments\" # format must be: \"usr_\"\n", "FILES_TO_ATTACH = [\n", " \"../../data/samples/2206.00785.pdf\",\n", @@ -100,8 +100,10 @@ "metadata": {}, "outputs": [], "source": [ - "def find_index_item(api, coordinates, search_query=\"*\", source=None, page_size=50, pred=None):\n", - " \"\"\" Find first index item that satisfies the criteria \"\"\"\n", + "def find_index_item(\n", + " api, coordinates, search_query=\"*\", source=None, page_size=50, pred=None\n", + "):\n", + " \"\"\"Find first index item that satisfies the criteria\"\"\"\n", " source_to_use = [\"_id\", \"_name\", \"_s3_data\"] if source is None else source\n", " query = DataQuery(\n", " search_query=search_query,\n", @@ -116,6 +118,7 @@ " return item\n", " return None\n", "\n", + "\n", "def list_item_attachments(api, coordinates, index_item_id, attch_key):\n", " pred = lambda x: x[\"_id\"] == index_item_id\n", " item = find_index_item(api, coordinates, pred=pred)\n", @@ -420,9 +423,7 @@ " m.update(json.dumps(row, sort_keys=True).encode())\n", " h = m.hexdigest()\n", " row[\"_name\"] = f\"row-{i:06d}-{h[:5]}\"\n", - " row[\"file-info\"] = {\n", - " \"document-hash\": h\n", - " }" + " row[\"file-info\"] = {\"document-hash\": h}" ] }, { @@ -609,8 +610,7 @@ " filename = Path(attachment[\"path\"]).name\n", " download_url = attachment[\"url\"]\n", " display(HTML(f'👉 Download 
{filename}'))\n", - " print()\n", - " " + " print()" ] }, { diff --git a/examples/bring_your_own_converted_documents/upload_converted_documents.ipynb b/examples/bring_your_own_converted_documents/upload_converted_documents.ipynb index ee2d339..8460404 100644 --- a/examples/bring_your_own_converted_documents/upload_converted_documents.ipynb +++ b/examples/bring_your_own_converted_documents/upload_converted_documents.ipynb @@ -46,10 +46,10 @@ "# notebook settings auto-loaded from .env / env vars\n", "notebook_settings = ProjectNotebookSettings()\n", "\n", - "PROFILE_NAME = notebook_settings.profile # profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # project to use\n", + "PROFILE_NAME = notebook_settings.profile # profile to use\n", + "PROJ_KEY = notebook_settings.proj_key # project to use\n", "INDEX_NAME = notebook_settings.new_idx_name # index to create\n", - "CLEANUP = notebook_settings.cleanup # whether to clean up\n", + "CLEANUP = notebook_settings.cleanup # whether to clean up\n", "INPUT_FILES_FOLDER = Path(\"../../data/converted/\")\n", "TMP_DIR = tempfile.TemporaryDirectory()" ] @@ -212,7 +212,11 @@ } ], "source": [ - "display(Markdown(f\"The data is now available. You can query it programmatically (see next section) or access it via the Deep Search UI at
{api.client.config.host}/projects/{PROJ_KEY}/library/private/{data_index.source.index_key}\"))"
+    "display(\n",
+    "    Markdown(\n",
+    "        f\"The data is now available. You can query it programmatically (see next section) or access it via the Deep Search UI at
{api.client.config.host}/projects/{PROJ_KEY}/library/private/{data_index.source.index_key}\"\n", + " )\n", + ")" ] }, { @@ -249,7 +253,7 @@ "# Count the documents in the data index\n", "query = DataQuery(\"*\", source=[\"\"], limit=0, coordinates=data_index.source)\n", "query_results = api.queries.run(query)\n", - "num_results = query_results.outputs['data_count']\n", + "num_results = query_results.outputs[\"data_count\"]\n", "print(f\"The data index contains {num_results} entries.\")" ] }, @@ -286,23 +290,34 @@ "source": [ "# Find documents matching query\n", "search_query = \"speedup\"\n", - "query = DataQuery(search_query, source=[\"description.title\", \"description.authors\"], coordinates=data_index.source)\n", + "query = DataQuery(\n", + " search_query,\n", + " source=[\"description.title\", \"description.authors\"],\n", + " coordinates=data_index.source,\n", + ")\n", "query_results = api.queries.run(query)\n", "\n", "all_results = []\n", "cursor = api.queries.run_paginated_query(query)\n", "for result_page in tqdm(cursor):\n", " # Iterate through the results of a single page, and add to the total list\n", - " for row in result_page.outputs[\"data_outputs\"]: \n", + " for row in result_page.outputs[\"data_outputs\"]:\n", " print()\n", " # Add row to results table\n", - " all_results.append({\n", - " \"Title\": row[\"_source\"][\"description\"][\"title\"],\n", - " \"Authors\": \", \".join([author[\"name\"] for author in row[\"_source\"][\"description\"].get(\"authors\", [])]),\n", - " }) \n", + " all_results.append(\n", + " {\n", + " \"Title\": row[\"_source\"][\"description\"][\"title\"],\n", + " \"Authors\": \", \".join(\n", + " [\n", + " author[\"name\"]\n", + " for author in row[\"_source\"][\"description\"].get(\"authors\", [])\n", + " ]\n", + " ),\n", + " }\n", + " )\n", "\n", "num_results = len(all_results)\n", - "print(f'Finished fetching all data. Total is {num_results} records.')" + "print(f\"Finished fetching all data. 
Total is {num_results} records.\")" ] }, { @@ -388,7 +403,7 @@ " api.data_indices.delete(data_index.source)\n", " print(\"Data index deleted\")\n", " TMP_DIR.cleanup()\n", - " print(\"Temporary directory deleted\")\n" + " print(\"Temporary directory deleted\")" ] } ], diff --git a/examples/data_query_chemistry/chemistry.ipynb b/examples/data_query_chemistry/chemistry.ipynb index 50464a8..395fc18 100644 --- a/examples/data_query_chemistry/chemistry.ipynb +++ b/examples/data_query_chemistry/chemistry.ipynb @@ -64,6 +64,7 @@ "from numerize.numerize import numerize\n", "import mols2grid\n", "from tqdm.notebook import tqdm\n", + "\n", "%matplotlib inline\n", "\n", "# IPython utilities\n", @@ -259,10 +260,14 @@ "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " source=[\"subject\", \"attributes\", \"identifiers\"], # Which fields of documents we want to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queries\n", + " search_query, # The search query to be executed\n", + " source=[\n", + " \"subject\",\n", + " \"attributes\",\n", + " \"identifiers\",\n", + " ], # Which fields of documents we want to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queries\n", ")\n", "\n", "\n", @@ -271,7 +276,9 @@ "count_query.paginated_task.parameters[\"limit\"] = 0\n", "count_results = api.queries.run(count_query)\n", "expected_total = count_results.outputs[\"data_count\"]\n", - "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n", + "expected_pages = (\n", + " expected_total + page_size - 1\n", + ") // page_size # this is simply a ceiling formula\n", "\n", "\n", "# Iterate through all results by fetching `page_size` results at the same time\n", @@ -280,7 +287,7 @@ "for result_page in tqdm(cursor, total=expected_pages):\n", " all_results.extend(result_page.outputs[\"data_outputs\"])\n", "\n", - "print(f'Finished fetching all data. Total is {len(all_results)} records.')" + "print(f\"Finished fetching all data. 
Total is {len(all_results)} records.\")" ] }, { @@ -702,7 +709,7 @@ " for ref in row[\"_source\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"cid\":\n", " result[\"cid\"] = ref[\"value\"]\n", - " \n", + "\n", " for ref in row[\"_source\"][\"subject\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"smiles\":\n", " result[\"SMILES\"] = ref[\"value\"]\n", @@ -714,7 +721,7 @@ " for ref in row[\"_source\"][\"subject\"][\"names\"]:\n", " if ref[\"type\"] == \"chemical_name\":\n", " result[\"chemical_name\"] = ref[\"value\"]\n", - " \n", + "\n", " for attribute in row[\"_source\"][\"attributes\"]:\n", " for predicate in attribute[\"predicates\"]:\n", " value = predicate[\"value\"][\"name\"]\n", @@ -723,9 +730,7 @@ " elif \"numerical_value\" in predicate:\n", " value = predicate[\"numerical_value\"][\"val\"]\n", " result[predicate[\"key\"][\"name\"]] = value\n", - " \n", - " \n", - " \n", + "\n", " results_table.append(result)\n", "\n", "df = pd.DataFrame(results_table)\n", @@ -1480,10 +1485,14 @@ "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " source=[\"subject\", \"attributes\", \"identifiers\"], # Which fields of documents we want to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queries\n", + " search_query, # The search query to be executed\n", + " source=[\n", + " \"subject\",\n", + " \"attributes\",\n", + " \"identifiers\",\n", + " ], # Which fields of documents we want to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queries\n", ")\n", "\n", "\n", @@ -1492,7 +1501,9 @@ "count_query.paginated_task.parameters[\"limit\"] = 0\n", "count_results = api.queries.run(count_query)\n", "expected_total = count_results.outputs[\"data_count\"]\n", - "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n", + "expected_pages = (\n", + " expected_total + page_size - 1\n", + ") // page_size # this is simply a ceiling formula\n", "\n", "\n", "# Iterate through all results by fetching `page_size` results at the same time\n", @@ -1501,7 +1512,7 @@ "for result_page in tqdm(cursor, total=expected_pages):\n", " all_results.extend(result_page.outputs[\"data_outputs\"])\n", "\n", - "print(f'Finished fetching all data. Total is {len(all_results)} records.')" + "print(f\"Finished fetching all data. 
Total is {len(all_results)} records.\")" ] }, { @@ -2286,7 +2297,7 @@ " for ref in row[\"_source\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"cid\":\n", " result[\"cid\"] = ref[\"value\"]\n", - " \n", + "\n", " for ref in row[\"_source\"][\"subject\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"smiles\":\n", " result[\"SMILES\"] = ref[\"value\"]\n", @@ -2298,7 +2309,7 @@ " for ref in row[\"_source\"][\"subject\"][\"names\"]:\n", " if ref[\"type\"] == \"chemical_name\":\n", " result[\"chemical_name\"] = ref[\"value\"]\n", - " \n", + "\n", " for attribute in row[\"_source\"][\"attributes\"]:\n", " for predicate in attribute[\"predicates\"]:\n", " value = predicate[\"value\"][\"name\"]\n", @@ -2307,12 +2318,10 @@ " elif \"numerical_value\" in predicate:\n", " value = predicate[\"numerical_value\"][\"val\"]\n", " result[predicate[\"key\"][\"name\"]] = value\n", - " \n", - " \n", - " \n", + "\n", " results_table.append(result)\n", "\n", - " \n", + "\n", "# Display the results table\n", "df = pd.DataFrame(results_table)\n", "display(df)\n", @@ -2361,17 +2370,21 @@ "source": [ "# Search by name\n", "search_smiles = \"C1=CC=C2C(=C1)C(=CN2)CCO\"\n", - "search_query = f\"subject.identifiers._name:\\\"smiles#{search_smiles.lower()}\\\"\"\n", + "search_query = f'subject.identifiers._name:\"smiles#{search_smiles.lower()}\"'\n", "\n", "data_collection = ElasticDataCollectionSource(elastic_id=\"default\", index_key=\"pubchem\")\n", "page_size = 50\n", "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " source=[\"subject\", \"attributes\", \"identifiers\"], # Which fields of documents we want to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queries\n", + " search_query, # The search query to be executed\n", + " source=[\n", + " \"subject\",\n", + " \"attributes\",\n", + " \"identifiers\",\n", + " ], # Which fields of documents we want to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queries\n", ")\n", "\n", "\n", @@ -2380,7 +2393,9 @@ "count_query.paginated_task.parameters[\"limit\"] = 0\n", "count_results = api.queries.run(count_query)\n", "expected_total = count_results.outputs[\"data_count\"]\n", - "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n", + "expected_pages = (\n", + " expected_total + page_size - 1\n", + ") // page_size # this is simply a ceiling formula\n", "\n", "\n", "# Iterate through all results by fetching `page_size` results at the same time\n", @@ -2389,7 +2404,7 @@ "for result_page in tqdm(cursor, total=expected_pages):\n", " all_results.extend(result_page.outputs[\"data_outputs\"])\n", "\n", - "print(f'Finished fetching all data. Total is {len(all_results)} records.')" + "print(f\"Finished fetching all data. 
Total is {len(all_results)} records.\")" ] }, { @@ -3151,7 +3166,7 @@ " for ref in row[\"_source\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"cid\":\n", " result[\"cid\"] = ref[\"value\"]\n", - " \n", + "\n", " for ref in row[\"_source\"][\"subject\"][\"identifiers\"]:\n", " if ref[\"type\"] == \"smiles\":\n", " result[\"SMILES\"] = ref[\"value\"]\n", @@ -3163,7 +3178,7 @@ " for ref in row[\"_source\"][\"subject\"][\"names\"]:\n", " if ref[\"type\"] == \"chemical_name\":\n", " result[\"chemical_name\"] = ref[\"value\"]\n", - " \n", + "\n", " for attribute in row[\"_source\"][\"attributes\"]:\n", " for predicate in attribute[\"predicates\"]:\n", " value = predicate[\"value\"][\"name\"]\n", @@ -3172,9 +3187,7 @@ " elif \"numerical_value\" in predicate:\n", " value = predicate[\"numerical_value\"][\"val\"]\n", " result[predicate[\"key\"][\"name\"]] = value\n", - " \n", - " \n", - " \n", + "\n", " results_table.append(result)\n", "\n", "# Display the results table\n", diff --git a/examples/data_query_chemistry_patcid/chemistry_patcid.ipynb b/examples/data_query_chemistry_patcid/chemistry_patcid.ipynb index c635c0f..9fe09ea 100644 --- a/examples/data_query_chemistry_patcid/chemistry_patcid.ipynb +++ b/examples/data_query_chemistry_patcid/chemistry_patcid.ipynb @@ -86,14 +86,18 @@ "import deepsearch as ds\n", "from deepsearch.cps.client.api import CpsApi\n", "from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource\n", - "from deepsearch.chemistry.queries.molecules import MoleculeQuery, MoleculesInPatentsQuery, PatentsWithMoleculesQuery\n", + "from deepsearch.chemistry.queries.molecules import (\n", + " MoleculeQuery,\n", + " MoleculesInPatentsQuery,\n", + " PatentsWithMoleculesQuery,\n", + ")\n", "from deepsearch.chemistry.queries.molecules import MolId, MolIdType, MolQueryType\n", "from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource\n", "from deepsearch.documents.core.lookup import EntitiesLookup\n", "from deepsearch.documents.core.render import get_page_svg_with_item\n", "from deepsearch.cps.queries import DataQuery\n", "\n", - "from deepsearch.cps.client.components.queries import RunQueryError\n" + "from deepsearch.cps.client.components.queries import RunQueryError" ] }, { @@ -111,7 +115,7 @@ "metadata": {}, "outputs": [], "source": [ - "api = CpsApi.from_env(profile_name=PROFILE_NAME)\n" + "api = CpsApi.from_env(profile_name=PROFILE_NAME)" ] }, { @@ -180,7 +184,11 @@ "input_smiles = \"C1(C(=C)C([O-])=C1C)=O\"\n", "\n", "display(Markdown(\"### Substructure\"))\n", - "display(Markdown(f\"We will list molecules containing the Squarilium (`{input_smiles}`) substructure\"))\n", + "display(\n", + " Markdown(\n", + " f\"We will list molecules containing the Squarilium (`{input_smiles}`) substructure\"\n", + " )\n", + ")\n", "\n", "smiles_mol = Chem.MolFromSmiles(input_smiles)\n", "display(smiles_mol)" @@ -2277,7 +2285,7 @@ "\n", "df = pd.DataFrame(results_table)\n", "display(df)\n", - "mols2grid.display(df, smiles_col=\"SMILES\")\n" + "mols2grid.display(df, smiles_col=\"SMILES\")" ] }, { @@ -6893,9 +6901,11 @@ ], "source": [ "# Load the full document\n", - "patent_smiles_coords = ElasticDataCollectionSource(elastic_id=\"default\", index_key=\"patent-smiles\")\n", + "patent_smiles_coords = ElasticDataCollectionSource(\n", + " elastic_id=\"default\", index_key=\"patent-smiles\"\n", + ")\n", "query = DataQuery(\n", - " f\"identifiers.value:\\\"{patent_id}\\\"\",\n", + " f'identifiers.value:\"{patent_id}\"',\n", " 
coordinates=patent_smiles_coords,\n", ")\n", "\n", @@ -6926,8 +6936,10 @@ " }\n", " )\n", "molecules_location_table.sort(key=lambda row: (row[\"Patent\"], row[\"Page\"]))\n", - "df_molecules_location = pd.DataFrame(molecules_location_table, columns=[\"SMILES\", \"Patent\", \"Page\", \"Url\"])\n", - "display(HTML(df_molecules_location.to_html(escape=False)))\n" + "df_molecules_location = pd.DataFrame(\n", + " molecules_location_table, columns=[\"SMILES\", \"Patent\", \"Page\", \"Url\"]\n", + ")\n", + "display(HTML(df_molecules_location.to_html(escape=False)))" ] }, { @@ -7481,8 +7493,9 @@ "source": [ "input_smiles = \"CN1C(=CC2=C([O-])C(=Cc3[se]c4ccccc4[n+]3C)C(=O)C2=O)[Se]c2ccccc21\"\n", "\n", - "display(Markdown(\n", - " f\"\"\"\n", + "display(\n", + " Markdown(\n", + " f\"\"\"\n", "For example, looking for a SMILES is done with\n", "```python\n", "query = PatentsWithMoleculesQuery(\n", @@ -7492,12 +7505,17 @@ " num_items=20,\n", ")\n", "```\n", - "\"\"\"))\n", - "\n", - "display(Markdown(\n", "\"\"\"\n", + " )\n", + ")\n", + "\n", + "display(\n", + " Markdown(\n", + " \"\"\"\n", "This will look for patents containing\n", - "\"\"\"))\n", + "\"\"\"\n", + " )\n", + ")\n", "\n", "smiles_mol = Chem.MolFromSmiles(input_smiles)\n", "display(smiles_mol)" @@ -7521,7 +7539,12 @@ "source": [ "# Search by SMILES\n", "query = PatentsWithMoleculesQuery(\n", - " molecules=[MolId(type=MolIdType.SMILES, value=\"CN1C(=CC2=C([O-])C(=Cc3[se]c4ccccc4[n+]3C)C(=O)C2=O)[Se]c2ccccc21\")],\n", + " molecules=[\n", + " MolId(\n", + " type=MolIdType.SMILES,\n", + " value=\"CN1C(=CC2=C([O-])C(=Cc3[se]c4ccccc4[n+]3C)C(=O)C2=O)[Se]c2ccccc21\",\n", + " )\n", + " ],\n", " num_items=20,\n", ")\n", "\n", diff --git a/examples/data_query_quick_start/quick_start.ipynb b/examples/data_query_quick_start/quick_start.ipynb index 6f3e334..ffa1c71 100644 --- a/examples/data_query_quick_start/quick_start.ipynb +++ b/examples/data_query_quick_start/quick_start.ipynb @@ -39,7 +39,7 @@ "# notebook settings auto-loaded from .env / env vars\n", "notebook_settings = NotebookSettings()\n", "\n", - "PROFILE_NAME = notebook_settings.profile # the profile to use\n" + "PROFILE_NAME = notebook_settings.profile # the profile to use" ] }, { @@ -63,6 +63,7 @@ "from numerize.numerize import numerize\n", "from tqdm.notebook import tqdm\n", "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "\n", "# IPython utilities\n", @@ -71,7 +72,7 @@ "# Import the deepsearch-toolkit\n", "import deepsearch as ds\n", "from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource\n", - "from deepsearch.cps.queries import DataQuery\n" + "from deepsearch.cps.queries import DataQuery" ] }, { @@ -89,7 +90,7 @@ "metadata": {}, "outputs": [], "source": [ - "api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)\n" + "api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)" ] }, { @@ -125,7 +126,7 @@ "source": [ "# Fetch list of all data collections\n", "collections = api.elastic.list()\n", - "collections.sort(key=lambda c: c.name.lower())\n" + "collections.sort(key=lambda c: c.name.lower())" ] }, { @@ -351,7 +352,7 @@ " }\n", " for c in collections\n", "]\n", - "display(pd.DataFrame(results))\n" + "display(pd.DataFrame(results))" ] }, { @@ -512,7 +513,7 @@ ], "source": [ "# Input query\n", - "search_query = \"main-text.text:((\\\"power conversion efficiency\\\" OR PCE) AND organ*)\"\n", + "search_query = 'main-text.text:((\"power conversion efficiency\" OR PCE) AND organ*)'\n", "\n", "# Iterate through the data collections\n", "results = 
[]\n", @@ -526,14 +527,11 @@ " # Execute the query\n", " query = DataQuery(search_query, source=[], limit=0, coordinates=c.source)\n", " query_results = api.queries.run(query)\n", - " results.append({\n", - " \"name\": c.name,\n", - " \"matches\": query_results.outputs[\"data_count\"]\n", - " })\n", + " results.append({\"name\": c.name, \"matches\": query_results.outputs[\"data_count\"]})\n", "\n", "# Sort and display results\n", "results.sort(reverse=True, key=lambda r: r[\"matches\"])\n", - "display(pd.DataFrame(results))\n" + "display(pd.DataFrame(results))" ] }, { @@ -569,7 +567,7 @@ "x = [r[\"name\"] for r in results]\n", "y = [r[\"matches\"] for r in results]\n", "plt.pie(y, labels=x, labeldistance=None)\n", - "plt.legend(loc=\"upper center\", ncols=3, bbox_to_anchor=(0.5, 0))\n" + "plt.legend(loc=\"upper center\", ncols=3, bbox_to_anchor=(0.5, 0))" ] }, { @@ -621,16 +619,22 @@ ], "source": [ "# Input query\n", - "search_query = \"main-text.text:((\\\"power conversion efficiency\\\" OR PCE) AND organ*)\"\n", - "data_collection = ElasticDataCollectionSource(elastic_id=\"default\", index_key=\"arxiv-abstract\")\n", + "search_query = 'main-text.text:((\"power conversion efficiency\" OR PCE) AND organ*)'\n", + "data_collection = ElasticDataCollectionSource(\n", + " elastic_id=\"default\", index_key=\"arxiv-abstract\"\n", + ")\n", "page_size = 50\n", "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " source=[\"description.title\", \"description.authors\", \"identifiers\"], # Which fields of documents we want to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queries\n", + " search_query, # The search query to be executed\n", + " source=[\n", + " \"description.title\",\n", + " \"description.authors\",\n", + " \"identifiers\",\n", + " ], # Which fields of documents we want to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queries\n", ")\n", "\n", "\n", @@ -639,7 +643,9 @@ "count_query.paginated_task.parameters[\"limit\"] = 0\n", "count_results = api.queries.run(count_query)\n", "expected_total = count_results.outputs[\"data_count\"]\n", - "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n", + "expected_pages = (\n", + " expected_total + page_size - 1\n", + ") // page_size # this is simply a ceiling formula\n", "\n", "\n", "# Iterate through all results by fetching `page_size` results at the same time\n", @@ -660,16 +666,23 @@ " links[\"doi\"] = f'https://doi.org/{ids[\"value\"]}'\n", "\n", " # Add row to results table\n", - " all_results.append({\n", - " \"Title\": row[\"_source\"][\"description\"][\"title\"],\n", - " \"Authors\": \", \".join([author[\"name\"] for author in row[\"_source\"][\"description\"][\"authors\"]]),\n", - " \"arXiv\": identifiers[\"arxiv\"],\n", - " \"arXiv URL\": links[\"arxiv\"],\n", - " \"DOI\": identifiers[\"doi\"],\n", - " \"DOI URL\": links[\"doi\"],\n", - " })\n", - "\n", - "print(f'Finished fetching all data. 
Total is {len(all_results)} records.')\n" + " all_results.append(\n", + " {\n", + " \"Title\": row[\"_source\"][\"description\"][\"title\"],\n", + " \"Authors\": \", \".join(\n", + " [\n", + " author[\"name\"]\n", + " for author in row[\"_source\"][\"description\"][\"authors\"]\n", + " ]\n", + " ),\n", + " \"arXiv\": identifiers[\"arxiv\"],\n", + " \"arXiv URL\": links[\"arxiv\"],\n", + " \"DOI\": identifiers[\"doi\"],\n", + " \"DOI URL\": links[\"doi\"],\n", + " }\n", + " )\n", + "\n", + "print(f\"Finished fetching all data. Total is {len(all_results)} records.\")" ] }, { @@ -755,7 +768,7 @@ "source": [ "# Visualize the table with all results\n", "df = pd.json_normalize(all_results)\n", - "display(HTML(df.head().to_html(render_links=True)))\n" + "display(HTML(df.head().to_html(render_links=True)))" ] }, { @@ -766,7 +779,7 @@ "outputs": [], "source": [ "# Save the results to an Excel table\n", - "df.to_excel(\"quick_start_results.xlsx\")\n" + "df.to_excel(\"quick_start_results.xlsx\")" ] } ], diff --git a/examples/data_query_snippets/snippets.ipynb b/examples/data_query_snippets/snippets.ipynb index d4cb8a0..a1c9710 100644 --- a/examples/data_query_snippets/snippets.ipynb +++ b/examples/data_query_snippets/snippets.ipynb @@ -228,17 +228,21 @@ ], "source": [ "# Run a proximity query using some keywords with maximum edit distance 5\n", - "search_query = \"\\\"climate change mitigation\\\"~5\"\n", + "search_query = '\"climate change mitigation\"~5'\n", "\n", "data_collection = ElasticDataCollectionSource(elastic_id=\"default\", index_key=\"ipcc\")\n", "page_size = 50\n", "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " source=[\"description.title\", \"description.publication_date\", \"file-info.#-pages\"], # Fields to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queried\n", + " search_query, # The search query to be executed\n", + " source=[\n", + " \"description.title\",\n", + " \"description.publication_date\",\n", + " \"file-info.#-pages\",\n", + " ], # Fields to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queried\n", ")\n", "\n", "# Iterate through all results by fetching `page_size` results at the same time\n", @@ -351,12 +355,21 @@ "source": [ "# Visualize summary table\n", "df = pd.json_normalize(all_results).loc[\n", - " :,[\"_source.description.title\", \"_source.description.publication_date\", \"_source.file-info.#-pages\"]]\n", + " :,\n", + " [\n", + " \"_source.description.title\",\n", + " \"_source.description.publication_date\",\n", + " \"_source.file-info.#-pages\",\n", + " ],\n", + "]\n", "df.columns = [\"Title\", \"Publication Date\", \"Number of Pages\"]\n", "df[\"Publication Year\"] = df[\"Publication Date\"].str[:4]\n", "\n", - "df.loc[:,[\"Title\", \"Publication Year\", \"Number of Pages\"]].head(10).style.set_properties(\n", - " subset=[\"Title\"], **{\"text-align\": \"left\"}).set_table_styles([dict(selector=\"th\", props=[(\"text-align\", \"left\")])])" + "df.loc[:, [\"Title\", \"Publication Year\", \"Number of Pages\"]].head(\n", + " 10\n", + ").style.set_properties(subset=[\"Title\"], **{\"text-align\": \"left\"}).set_table_styles(\n", + " [dict(selector=\"th\", props=[(\"text-align\", \"left\")])]\n", + ")" ] }, { @@ -384,16 +397,16 @@ ], "source": [ "# Prepare the data query\n", - "search_query = \"\\\"climate change carbon 
sequestration\\\"~10\"\n", + "search_query = '\"climate change carbon sequestration\"~10'\n", "\n", "highlight = {\"fields\": {\"*\": {}}}\n", "\n", "query = DataQuery(\n", " search_query,\n", - " source=[\"file-info.filename\"], # Fetch only the report file name\n", - " limit=page_size, # Fetch maximum `page_size` reports\n", + " source=[\"file-info.filename\"], # Fetch only the report file name\n", + " limit=page_size, # Fetch maximum `page_size` reports\n", " highlight=highlight,\n", - " coordinates=data_collection\n", + " coordinates=data_collection,\n", ")\n", "\n", "all_results = []\n", @@ -505,21 +518,24 @@ "# Format and visualize the first 10 snippets\n", "def format_highlight_results(ds_results):\n", " results_table = []\n", - " \n", + "\n", " for row in ds_results:\n", " for field in row.get(\"highlight\", {}).keys():\n", " for snippet in row[\"highlight\"][field]:\n", " result = {\n", " \"Report\": row[\"_source\"][\"file-info\"][\"filename\"],\n", " \"Field\": field,\n", - " \"Snippet\": snippet\n", + " \"Snippet\": snippet,\n", " }\n", " results_table.append(result)\n", - " \n", + "\n", " return pd.DataFrame(results_table)\n", "\n", + "\n", "df = format_highlight_results(all_results)\n", - "df_style = df.head(10).style.set_table_styles([dict(selector=\"th\", props=[(\"text-align\", \"left\")])])\n", + "df_style = df.head(10).style.set_table_styles(\n", + " [dict(selector=\"th\", props=[(\"text-align\", \"left\")])]\n", + ")\n", "df_style.set_properties(**{\"text-align\": \"left\"})" ] }, @@ -642,7 +658,12 @@ "source": [ "highlight[\"fields\"] = {\"main-text.text\": {}}\n", "query = DataQuery(\n", - " search_query, source=[\"file-info.filename\"], limit=page_size, highlight=highlight, coordinates=data_collection)\n", + " search_query,\n", + " source=[\"file-info.filename\"],\n", + " limit=page_size,\n", + " highlight=highlight,\n", + " coordinates=data_collection,\n", + ")\n", "\n", "all_results = api.queries.run(query).outputs[\"data_outputs\"]\n", "format_highlight_results(all_results).head(10).style.use(df_style.export())" @@ -759,7 +780,12 @@ "highlight[\"fragment_size\"] = 0\n", "\n", "query = DataQuery(\n", - " search_query, source=[\"file-info.filename\"], limit=page_size, highlight=highlight, coordinates=data_collection)\n", + " search_query,\n", + " source=[\"file-info.filename\"],\n", + " limit=page_size,\n", + " highlight=highlight,\n", + " coordinates=data_collection,\n", + ")\n", "\n", "all_results = api.queries.run(query).outputs[\"data_outputs\"]\n", "format_highlight_results(all_results).head(10).style.use(df_style.export())" @@ -877,7 +903,12 @@ "highlight[\"post_tags\"] = [\"\"]\n", "\n", "query = DataQuery(\n", - " search_query, source=[\"file-info.filename\"], limit=page_size, highlight=highlight, coordinates=data_collection)\n", + " search_query,\n", + " source=[\"file-info.filename\"],\n", + " limit=page_size,\n", + " highlight=highlight,\n", + " coordinates=data_collection,\n", + ")\n", "\n", "all_results = api.queries.run(query).outputs[\"data_outputs\"]\n", "format_highlight_results(all_results).head(10).style.use(df_style.export())" @@ -995,22 +1026,25 @@ } ], "source": [ - "search_query = \"\\\"climate change\\\" AND mitigation AND (city cities urban)\"\n", + "search_query = '\"climate change\" AND mitigation AND (city cities urban)'\n", "\n", "highlight = {\n", " \"order\": \"score\",\n", " \"fragment_size\": 150,\n", " \"fields\": {\n", " \"description.title\": {\"number_of_fragments\": 0},\n", - " \"main-text.text\": {\"number_of_fragments\": 
7}\n", - " }\n", + " \"main-text.text\": {\"number_of_fragments\": 7},\n", + " },\n", "}\n", "\n", "query = DataQuery(\n", " search_query,\n", " source=[\"file-info.filename\"],\n", " sort=[{\"_score\": \"desc\", \"file-info.document-hash\": \"asc\"}],\n", - " limit=page_size, highlight=highlight, coordinates=data_collection)\n", + " limit=page_size,\n", + " highlight=highlight,\n", + " coordinates=data_collection,\n", + ")\n", "\n", "all_results = api.queries.run(query).outputs[\"data_outputs\"]\n", "format_highlight_results(all_results).head(10).style.use(df_style.export())" @@ -1062,20 +1096,14 @@ } ], "source": [ - "aggs = {\n", - " \"language_count\": {\n", - " \"cardinality\": {\n", - " \"field\": \"description.languages\"\n", - " }\n", - " }\n", - "}\n", + "aggs = {\"language_count\": {\"cardinality\": {\"field\": \"description.languages\"}}}\n", "\n", "query = DataQuery(\n", - " search_query = \"*:*\", # Match-all query\n", + " search_query=\"*:*\", # Match-all query\n", " source=[], # No document data will be returned\n", " limit=0,\n", " aggregations=aggs,\n", - " coordinates=data_collection\n", + " coordinates=data_collection,\n", ")\n", "\n", "summary = api.queries.run(query).outputs[\"data_aggs\"]\n", @@ -1126,16 +1154,31 @@ " \"field\": \"description.languages\",\n", " \"order\": {\"_key\": \"asc\"},\n", " \"size\": 50,\n", - " \"exclude\": \"en\"\n", + " \"exclude\": \"en\",\n", " }\n", " }\n", "}\n", "\n", - "query = DataQuery(search_query = \"*:*\", source=[], limit=0, aggregations=aggs, coordinates=data_collection)\n", + "query = DataQuery(\n", + " search_query=\"*:*\",\n", + " source=[],\n", + " limit=0,\n", + " aggregations=aggs,\n", + " coordinates=data_collection,\n", + ")\n", "summary = api.queries.run(query).outputs[\"data_aggs\"]\n", "\n", "df = pd.json_normalize(summary[\"languages\"][\"buckets\"])\n", - "df.plot.bar(y=\"doc_count\", x=\"key\", figsize=(15, 5), xlabel=\"Language\", ylabel=\"Number of reports\", rot=0, legend=False, title=\"Number of non-English reports by language\")" + "df.plot.bar(\n", + " y=\"doc_count\",\n", + " x=\"key\",\n", + " figsize=(15, 5),\n", + " xlabel=\"Language\",\n", + " ylabel=\"Number of reports\",\n", + " rot=0,\n", + " legend=False,\n", + " title=\"Number of non-English reports by language\",\n", + ")" ] }, { @@ -1178,18 +1221,32 @@ " \"by_year\": {\n", " \"date_histogram\": {\n", " \"field\": \"description.publication_date\",\n", - " \"calendar_interval\": \"year\",\n", - " \"format\": \"yyyy\",\n", - " \"min_doc_count\": 0\n", + " \"calendar_interval\": \"year\",\n", + " \"format\": \"yyyy\",\n", + " \"min_doc_count\": 0,\n", " }\n", " }\n", "}\n", "\n", - "query = DataQuery(search_query = search_query, source=[], limit=0, aggregations=aggs, coordinates=data_collection)\n", + "query = DataQuery(\n", + " search_query=search_query,\n", + " source=[],\n", + " limit=0,\n", + " aggregations=aggs,\n", + " coordinates=data_collection,\n", + ")\n", "summary = api.queries.run(query).outputs[\"data_aggs\"]\n", "\n", "df = pd.json_normalize(summary[\"by_year\"][\"buckets\"])\n", - "df.plot.bar(y=\"doc_count\", x=\"key_as_string\", figsize=(15, 5), xlabel=\"Publication year\", ylabel=\"Number of reports\", legend=False, title=\"Number of IPCC reports by year\")\n" + "df.plot.bar(\n", + " y=\"doc_count\",\n", + " x=\"key_as_string\",\n", + " figsize=(15, 5),\n", + " xlabel=\"Publication year\",\n", + " ylabel=\"Number of reports\",\n", + " legend=False,\n", + " title=\"Number of IPCC reports by year\",\n", + ")" ] }, { @@ -1216,23 
+1273,26 @@ } ], "source": [ - "search_query = \"tables.data.text:\\\"net-zero emissions\\\"\"\n", + "search_query = 'tables.data.text:\"net-zero emissions\"'\n", "\n", "aggs = {\n", " \"num_tables\": {\n", - " \"sum\": {\n", - " \"script\": {\n", - " \"source\": \"doc['tables.#-cols'].length\",\n", - " \"lang\": \"painless\"\n", - " }\n", - " }\n", - " } \n", + " \"sum\": {\"script\": {\"source\": \"doc['tables.#-cols'].length\", \"lang\": \"painless\"}}\n", + " }\n", "}\n", "\n", - "query = DataQuery(search_query = search_query, source=[], limit=0, aggregations=aggs, coordinates=data_collection)\n", + "query = DataQuery(\n", + " search_query=search_query,\n", + " source=[],\n", + " limit=0,\n", + " aggregations=aggs,\n", + " coordinates=data_collection,\n", + ")\n", "summary = api.queries.run(query).outputs[\"data_aggs\"]\n", "\n", - "print(f\"We found {int(summary['num_tables']['value'])} tables containing 'net-zero emissions'.\")" + "print(\n", + " f\"We found {int(summary['num_tables']['value'])} tables containing 'net-zero emissions'.\"\n", + ")" ] } ], diff --git a/examples/document_bulk_upload/run_batch_upload.py b/examples/document_bulk_upload/run_batch_upload.py index 046d3e5..b1e4868 100644 --- a/examples/document_bulk_upload/run_batch_upload.py +++ b/examples/document_bulk_upload/run_batch_upload.py @@ -30,10 +30,10 @@ import argparse import asyncio -import sys import logging import os.path import signal +import sys import uuid from copy import deepcopy from enum import Enum @@ -70,7 +70,12 @@ def __str__(self): async def upload_for_key_prefix( - api, coords, s3_credentials, key_prefix, raw_pages: bool, semaphore: asyncio.Semaphore + api, + coords, + s3_credentials, + key_prefix, + raw_pages: bool, + semaphore: asyncio.Semaphore, ): async with semaphore: # This will limit the number of concurrent uploads task_id = None @@ -80,7 +85,7 @@ async def upload_for_key_prefix( payload = { "s3_source": {"coordinates": cos_coordinates_sub.dict()}, - "target_settings": {"add_raw_pages": raw_pages} + "target_settings": {"add_raw_pages": raw_pages}, } task_id = api.data_indices.upload_file( coords=coords, @@ -93,7 +98,9 @@ async def upload_for_key_prefix( request_status = await wait_for_task(api, coords, task_id) - logging.info(f"Report for {key_prefix} with task_id {task_id}: {request_status}") + logging.info( + f"Report for {key_prefix} with task_id {task_id}: {request_status}" + ) return [key_prefix], request_status except Exception as e: logging.error( @@ -102,13 +109,15 @@ async def upload_for_key_prefix( return [key_prefix], None -async def upload_for_urls(api, coords, url_batch, raw_pages: bool, semaphore: asyncio.Semaphore): +async def upload_for_urls( + api, coords, url_batch, raw_pages: bool, semaphore: asyncio.Semaphore +): async with semaphore: # This will limit the number of concurrent uploads task_id = None try: payload = { "file_url": url_batch, - "target_settings": {"add_raw_pages": raw_pages} + "target_settings": {"add_raw_pages": raw_pages}, } task_id = api.data_indices.upload_file(coords=coords, body=payload) @@ -193,7 +202,7 @@ async def main(): "-w", action=argparse.BooleanOptionalAction, default=False, - required=False + required=False, ) # Parse the command-line arguments @@ -219,7 +228,7 @@ async def main(): raise argparse.ArgumentTypeError( "you must provide s3-credentials with input-type S3." 
) - + save_file = args.resume_point if args.resume_point else args.input_file with open(save_file) as f: logging.info(f"Reading elements from {save_file}") @@ -233,7 +242,9 @@ async def main(): pending_items = elements save_elements(RESUME_FILENAME, pending_items) - logging.info(f"To resume this job later, provide --resume-point {RESUME_FILENAME} to the command line.") + logging.info( + f"To resume this job later, provide --resume-point {RESUME_FILENAME} to the command line." + ) semaphore = asyncio.Semaphore(args.concurrency) signal.signal(signal.SIGTERM, handle_exit_signal) @@ -249,7 +260,9 @@ async def main(): if args.input_type == InputSource.S3: tasks = [ loop.create_task( - upload_for_key_prefix(api, coords, s3_cred, prefix, args.raw_pages, semaphore) + upload_for_key_prefix( + api, coords, s3_cred, prefix, args.raw_pages, semaphore + ) ) for prefix in pending_items ] diff --git a/examples/document_conversion_custom_settings/convert_documents_custom.ipynb b/examples/document_conversion_custom_settings/convert_documents_custom.ipynb index 0d6aeda..387b556 100644 --- a/examples/document_conversion_custom_settings/convert_documents_custom.ipynb +++ b/examples/document_conversion_custom_settings/convert_documents_custom.ipynb @@ -41,7 +41,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use" + "PROJ_KEY = notebook_settings.proj_key # the project to use" ] }, { @@ -65,8 +65,14 @@ "outputs": [], "source": [ "import deepsearch as ds\n", - "from deepsearch.documents.core.models import ConversionSettings, DefaultConversionModel, ProjectConversionModel, \\\n", - " OCRSettings, AlpineOcrEngine, AlpineOcrLanguage" + "from deepsearch.documents.core.models import (\n", + " ConversionSettings,\n", + " DefaultConversionModel,\n", + " ProjectConversionModel,\n", + " OCRSettings,\n", + " AlpineOcrEngine,\n", + " AlpineOcrLanguage,\n", + ")" ] }, { @@ -142,11 +148,11 @@ " proj_key=PROJ_KEY,\n", " source_path=\"../../data/samples/2206.01062.pdf\",\n", " conversion_settings=cs,\n", - " progress_bar=True\n", - ") \n", + " progress_bar=True,\n", + ")\n", "documents.download_all(result_dir=\"./converted_docs\")\n", "info = documents.generate_report(result_dir=\"./converted_docs\")\n", - "print(info) " + "print(info)" ] } ], diff --git a/examples/document_conversion_extract_tables/extract_tables.ipynb b/examples/document_conversion_extract_tables/extract_tables.ipynb index 2be4304..08bd481 100644 --- a/examples/document_conversion_extract_tables/extract_tables.ipynb +++ b/examples/document_conversion_extract_tables/extract_tables.ipynb @@ -35,7 +35,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "INPUT_FILE = Path(\"../../data/samples/2206.00785.pdf\")" ] }, @@ -76,17 +76,16 @@ "outputs": [], "source": [ "def get_tablecell_span(cell, ix):\n", - " span = set([s[ix] for s in cell['spans']])\n", + " span = set([s[ix] for s in cell[\"spans\"]])\n", " if len(span) == 0:\n", " return 1, None, None\n", " return len(span), min(span), max(span)\n", "\n", "\n", - "\n", "def write_table(item):\n", " \"\"\"\n", " Convert the JSON table representation to HTML, including column and row spans.\n", - " \n", + "\n", " Parameters\n", " ----------\n", " item :\n", @@ -96,44 +95,45 
@@ " ncols : int, Default=3\n", " Number of columns in the display table.\n", " \"\"\"\n", - " \n", + "\n", " table = item\n", " body = \"\"\n", "\n", - " nrows = table['#-rows']\n", - " ncols = table['#-cols']\n", + " nrows = table[\"#-rows\"]\n", + " ncols = table[\"#-cols\"]\n", "\n", " body += \"\\n\"\n", " for i in range(nrows):\n", " body += \" \\n\"\n", " for j in range(ncols):\n", - " cell = table['data'][i][j]\n", + " cell = table[\"data\"][i][j]\n", "\n", - " rowspan,rowstart,rowend = get_tablecell_span(cell, 0)\n", - " colspan,colstart,colend = get_tablecell_span(cell, 1)\n", + " rowspan, rowstart, rowend = get_tablecell_span(cell, 0)\n", + " colspan, colstart, colend = get_tablecell_span(cell, 1)\n", "\n", - " if rowstart is not None and rowstart != i: continue\n", - " if colstart is not None and colstart != j: continue\n", + " if rowstart is not None and rowstart != i:\n", + " continue\n", + " if colstart is not None and colstart != j:\n", + " continue\n", "\n", " if rowstart is None:\n", " rowstart = i\n", " if colstart is None:\n", " colstart = j\n", "\n", - " content = cell['text']\n", - " if content == '':\n", - " content = ' '\n", + " content = cell[\"text\"]\n", + " if content == \"\":\n", + " content = \" \"\n", "\n", - " label = cell['type']\n", - " label_class = 'body'\n", - " if label in ['row_header', 'row_multi_header', 'row_title']:\n", - " label_class = 'header'\n", - " elif label in ['col_header', 'col_multi_header']:\n", - " label_class = 'header'\n", - " \n", - " \n", - " celltag = 'th' if label_class == 'header' else 'td'\n", - " style = 'style=\"text-align: center;\"' if label_class == 'header' else ''\n", + " label = cell[\"type\"]\n", + " label_class = \"body\"\n", + " if label in [\"row_header\", \"row_multi_header\", \"row_title\"]:\n", + " label_class = \"header\"\n", + " elif label in [\"col_header\", \"col_multi_header\"]:\n", + " label_class = \"header\"\n", + "\n", + " celltag = \"th\" if label_class == \"header\" else \"td\"\n", + " style = 'style=\"text-align: center;\"' if label_class == \"header\" else \"\"\n", "\n", " body += f' <{celltag} rowstart=\"{rowstart}\" colstart=\"{colstart}\" rowspan=\"{rowspan}\" colspan=\"{colspan}\" {style}>{content}\\n'\n", "\n", @@ -156,14 +156,13 @@ "def visualize_document_tables(doc_jsondata):\n", " \"\"\"\n", " Visualize the tables idenfitied in the converted document.\n", - " \n", + "\n", " Parameters\n", " ----------\n", " doc_jsondata :\n", " Converted document\n", " \"\"\"\n", "\n", - " \n", " page_counters = {}\n", " # Iterate through all the tables identified in the converted document\n", " for table in doc_jsondata.get(\"tables\", []):\n", @@ -171,10 +170,10 @@ " page = prov[\"page\"]\n", " page_counters.setdefault(page, 0)\n", " page_counters[page] += 1\n", - " \n", + "\n", " output_html = write_table(table)\n", " display(Markdown(f\"## Table {page_counters[page]} on page {page}\"))\n", - " display(HTML(output_html)) \n" + " display(HTML(output_html))" ] }, { @@ -326,7 +325,9 @@ } ], "source": [ - "output_dir = tempfile.mkdtemp() # TODO: switch to tempfile.TemporaryDirectory() and use `with`\n", + "output_dir = (\n", + " tempfile.mkdtemp()\n", + ") # TODO: switch to tempfile.TemporaryDirectory() and use `with`\n", "\n", "documents.download_all(result_dir=output_dir, progress_bar=True)\n", "\n", @@ -336,8 +337,8 @@ " for name in all_files:\n", " if not name.endswith(\".json\"):\n", " continue\n", - " \n", - " basename = name.rstrip('.json')\n", + "\n", + " basename = name.rstrip(\".json\")\n", " 
doc_jsondata = json.loads(archive.read(f\"{basename}.json\"))\n", "\n", " visualize_document_tables(doc_jsondata)" diff --git a/examples/document_conversion_quick_start/convert_documents.ipynb b/examples/document_conversion_quick_start/convert_documents.ipynb index ae4447a..79810c7 100644 --- a/examples/document_conversion_quick_start/convert_documents.ipynb +++ b/examples/document_conversion_quick_start/convert_documents.ipynb @@ -41,7 +41,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456" ] @@ -112,7 +112,7 @@ " api=api,\n", " proj_key=PROJ_KEY,\n", " source_path=\"../../data/samples/2206.01062.pdf\",\n", - " progress_bar=True\n", + " progress_bar=True,\n", ")\n", "documents.download_all(result_dir=output_dir)\n", "info = documents.generate_report(result_dir=output_dir)\n", @@ -134,7 +134,7 @@ " if not name.endswith(\".json\"):\n", " continue\n", "\n", - " basename = name.rstrip('.json')\n", + " basename = name.rstrip(\".json\")\n", " doc_json = json.loads(archive.read(f\"{basename}.json\"))\n", " doc_md = export_to_markdown(doc_json)\n", "\n", @@ -217,10 +217,12 @@ }, "outputs": [], "source": [ - "documents = ds.convert_documents(api=api,\n", - " proj_key=PROJ_KEY,\n", - " urls=\"https://arxiv.org/pdf/2206.00785.pdf\",\n", - " progress_bar=True)" + "documents = ds.convert_documents(\n", + " api=api,\n", + " proj_key=PROJ_KEY,\n", + " urls=\"https://arxiv.org/pdf/2206.00785.pdf\",\n", + " progress_bar=True,\n", + ")" ] }, { @@ -237,7 +239,7 @@ "source": [ "# let's check what happened.\n", "# we generate a csv report about the conversion task and store it locally\n", - "result_dir = './converted_docs/'\n", + "result_dir = \"./converted_docs/\"\n", "info = documents.generate_report(result_dir=result_dir)\n", "print(info)" ] @@ -320,10 +322,7 @@ "source": [ "# Process multiple urls\n", "documents = ds.convert_documents(\n", - " api=api,\n", - " proj_key=PROJ_KEY,\n", - " urls= urls,\n", - " progress_bar=True\n", + " api=api, proj_key=PROJ_KEY, urls=urls, progress_bar=True\n", ")" ] }, @@ -368,7 +367,7 @@ " api=api,\n", " proj_key=PROJ_KEY,\n", " source_path=\"../../data/samples/2206.01062.pdf\",\n", - " progress_bar=True\n", + " progress_bar=True,\n", ")" ] }, @@ -393,10 +392,7 @@ "outputs": [], "source": [ "documents = ds.convert_documents(\n", - " api=api,\n", - " proj_key=PROJ_KEY,\n", - " source_path=\"../../data/samples\",\n", - " progress_bar=True\n", + " api=api, proj_key=PROJ_KEY, source_path=\"../../data/samples\", progress_bar=True\n", ")" ] }, @@ -429,7 +425,7 @@ "outputs": [], "source": [ "# let's download all the converted documents:\n", - "documents.download_all(result_dir=result_dir,progress_bar=True)" + "documents.download_all(result_dir=result_dir, progress_bar=True)" ] }, { diff --git a/examples/document_conversion_visualize_bbox/visualize_bbox.ipynb b/examples/document_conversion_visualize_bbox/visualize_bbox.ipynb index 7d68d3c..08d6b2e 100644 --- a/examples/document_conversion_visualize_bbox/visualize_bbox.ipynb +++ b/examples/document_conversion_visualize_bbox/visualize_bbox.ipynb @@ -58,7 +58,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + 
"PROJ_KEY = notebook_settings.proj_key # the project to use\n", "# INPUT_FILE = Path(\"../../data/samples/2206.01062.pdf\")\n", "INPUT_FILE = Path(\"../../data/samples/2206.00785.pdf\")\n", "\n", @@ -122,7 +122,7 @@ " if isinstance(item, typing.Mapping) and not k in item:\n", " print(f\"k={k} not found\")\n", " return {}\n", - " \n", + "\n", " if isinstance(item, typing.List):\n", " try:\n", " k = int(k)\n", @@ -147,7 +147,7 @@ "\n", " Parameters\n", " ----------\n", - " doc_jsondata : \n", + " doc_jsondata :\n", " Converted document\n", "\n", " Returns\n", @@ -166,11 +166,13 @@ " continue\n", " page = item[\"prov\"][0][\"page\"]\n", " item_type = item[\"type\"]\n", - " clusters.setdefault(page, []).append({\n", - " \"page\": page,\n", - " \"type\": item_type,\n", - " \"bbox\": item[\"prov\"][0][\"bbox\"],\n", - " })\n", + " clusters.setdefault(page, []).append(\n", + " {\n", + " \"page\": page,\n", + " \"type\": item_type,\n", + " \"bbox\": item[\"prov\"][0][\"bbox\"],\n", + " }\n", + " )\n", " return clusters" ] }, @@ -188,7 +190,7 @@ "\n", " Parameters\n", " ----------\n", - " doc_cellsdata : \n", + " doc_cellsdata :\n", " Cells document provided by the Deep Search conversion\n", "\n", " Returns\n", @@ -201,13 +203,15 @@ "\n", " cells = {}\n", " for item in doc_cellsdata[\"cells\"][\"data\"]:\n", - " page = item[0]+1\n", + " page = item[0] + 1\n", " item_type = item[5]\n", - " cells.setdefault(page, []).append({\n", - " \"page\": page,\n", - " \"type\": item_type,\n", - " \"bbox\": item[1:5],\n", - " })\n", + " cells.setdefault(page, []).append(\n", + " {\n", + " \"page\": page,\n", + " \"type\": item_type,\n", + " \"bbox\": item[1:5],\n", + " }\n", + " )\n", "\n", " return cells" ] @@ -219,31 +223,32 @@ "metadata": {}, "outputs": [], "source": [ - "# The \n", + "# The\n", "labels_colors_clusters = {\n", - " \"table\": ((240, 128, 128, 100), (255,0,0)),\n", - " \"caption\": ((243, 156, 18, 100), (255,0,0)),\n", - " \"citation\": ((14, 210, 234, 100), (255,0,0)),\n", - " \"picture\": ((255, 236, 204, 100), (255,0,0)),\n", - " \"formula\": ((128, 139, 150, 100), (255,0,0)),\n", - " \"subtitle-level-1\": ((204, 51, 102, 100), (255,0,0)),\n", - " \"paragraph\": ((234, 234, 43, 100), (255,0,0)),\n", + " \"table\": ((240, 128, 128, 100), (255, 0, 0)),\n", + " \"caption\": ((243, 156, 18, 100), (255, 0, 0)),\n", + " \"citation\": ((14, 210, 234, 100), (255, 0, 0)),\n", + " \"picture\": ((255, 236, 204, 100), (255, 0, 0)),\n", + " \"formula\": ((128, 139, 150, 100), (255, 0, 0)),\n", + " \"subtitle-level-1\": ((204, 51, 102, 100), (255, 0, 0)),\n", + " \"paragraph\": ((234, 234, 43, 100), (255, 0, 0)),\n", "}\n", "\n", "labels_colors_cells = {\n", - " \"table\": ((240, 128, 128, 100), (0,0,0,0)),\n", - " \"caption\": ((243, 156, 18, 100), (0,0,0,0)),\n", - " \"citation\": ((14, 210, 234, 100), (0,0,0,0)),\n", - " \"picture\": ((255, 236, 204, 100), (0,0,0,0)),\n", - " \"formula\": ((128, 139, 150, 100), (0,0,0,0)),\n", - " \"subtitle-level-1\": ((204, 51, 102, 100), (0,0,0,0)),\n", - " \"paragraph\": ((234, 234, 43, 100), (0,0,0,0)),\n", + " \"table\": ((240, 128, 128, 100), (0, 0, 0, 0)),\n", + " \"caption\": ((243, 156, 18, 100), (0, 0, 0, 0)),\n", + " \"citation\": ((14, 210, 234, 100), (0, 0, 0, 0)),\n", + " \"picture\": ((255, 236, 204, 100), (0, 0, 0, 0)),\n", + " \"formula\": ((128, 139, 150, 100), (0, 0, 0, 0)),\n", + " \"subtitle-level-1\": ((204, 51, 102, 100), (0, 0, 0, 0)),\n", + " \"paragraph\": ((234, 234, 43, 100), (0, 0, 0, 0)),\n", "}\n", "\n", + "\n", "def draw_boxes(img, 
dims, boxes, colors_map={}):\n", " \"\"\"\n", " Draw bounding boxes on the input PIL Image `img`\n", - " \n", + "\n", " Parameters\n", " ----------\n", " img : Image\n", @@ -264,12 +269,14 @@ " bbox = cluster[\"bbox\"]\n", " rect = [\n", " round(bbox[0]),\n", - " round(dims[1]-bbox[3]),\n", + " round(dims[1] - bbox[3]),\n", " round(bbox[2]),\n", - " round(dims[1]-bbox[1])\n", + " round(dims[1] - bbox[1]),\n", " ]\n", - " \n", - " c_fill, c_outline = colors_map.get(cluster[\"type\"].lower(), ((128,128,128,100), (0,0,0,0)))\n", + "\n", + " c_fill, c_outline = colors_map.get(\n", + " cluster[\"type\"].lower(), ((128, 128, 128, 100), (0, 0, 0, 0))\n", + " )\n", " drw.rectangle(rect, outline=c_outline, fill=c_fill)" ] }, @@ -283,7 +290,7 @@ "def pdf_to_page_image(pdf_filename, page, resolution=72):\n", " \"\"\"\n", " Convert the page number `page` of the PDF document to an image\n", - " \n", + "\n", " Parameters\n", " ----------\n", " pdf_filename : Path\n", @@ -305,9 +312,13 @@ " \"pdftoppm\",\n", " \"-png\",\n", " \"-singlefile\",\n", - " \"-f\", str(page), \"-l\", str(page),\n", + " \"-f\",\n", + " str(page),\n", + " \"-l\",\n", + " str(page),\n", " \"-cropbox\",\n", - " \"-r\", str(resolution),\n", + " \"-r\",\n", + " str(resolution),\n", " pdf_filename,\n", " output_filename,\n", " ]\n", @@ -319,7 +330,7 @@ " ) from cpe\n", " png_file = output_filename + \".png\"\n", " img = Image.open(png_file)\n", - " return img\n" + " return img" ] }, { @@ -331,9 +342,9 @@ "source": [ "def visualize_document_bboxes(doc_jsondata, doc_cellsdata, ncols=3):\n", " \"\"\"\n", - " Visualize the document pages overlaying the PDF image with the \n", + " Visualize the document pages overlaying the PDF image with the\n", " bounding boxes of the text cells and the segmentation clusters.\n", - " \n", + "\n", " Parameters\n", " ----------\n", " doc_jsondata :\n", @@ -343,11 +354,11 @@ " ncols : int, Default=3\n", " Number of columns in the display table.\n", " \"\"\"\n", - " \n", + "\n", " clusters = page_elements_from_json_document(doc_jsondata)\n", " cells = page_elements_from_text_cells(doc_cellsdata)\n", " pages_to_dims = {dims[\"page\"]: dims for dims in doc_jsondata[\"page-dimensions\"]}\n", - " \n", + "\n", " output_html = \"
\"\n", " for i, page in enumerate(sorted(clusters.keys())):\n", " dims = pages_to_dims[page][\"width\"], pages_to_dims[page][\"height\"]\n", @@ -358,12 +369,14 @@ " img = pdf_to_page_image(INPUT_FILE, page=page)\n", " img = img.resize((math.ceil(dims[0]), math.ceil(dims[1])))\n", " else:\n", - " img = Image.new(\"RGB\", (math.ceil(dims[0]), math.ceil(dims[1])), (255, 255, 255))\n", + " img = Image.new(\n", + " \"RGB\", (math.ceil(dims[0]), math.ceil(dims[1])), (255, 255, 255)\n", + " )\n", " img = img.resize((math.ceil(dims[0]), math.ceil(dims[1])))\n", "\n", " # Draw page rectangle\n", " drw = ImageDraw.Draw(img)\n", - " drw.rectangle([0,0,dims[0]-1, dims[1]-1], outline=(0,0,0))\n", + " drw.rectangle([0, 0, dims[0] - 1, dims[1] - 1], outline=(0, 0, 0))\n", "\n", " # Draw bounding boxes\n", " if SHOW_TEXT_CELLS_BOXES:\n", @@ -376,7 +389,6 @@ " elif i % ncols == 0:\n", " output_html += \"\"\n", "\n", - "\n", " buffer = io.BytesIO()\n", " img.save(buffer, format=\"PNG\")\n", " img_str = base64.b64encode(buffer.getvalue()).decode(\"utf8\")\n", @@ -386,7 +398,6 @@ " output_html += f\"\"\n", " output_html += \"\"\n", "\n", - "\n", " output_html += \"
\"\n", " display(HTML(output_html))" ] @@ -430,7 +441,7 @@ "# Launch the document conversion\n", "documents = ds.convert_documents(\n", " api=api, proj_key=PROJ_KEY, source_path=INPUT_FILE, progress_bar=True\n", - ")\n" + ")" ] }, { @@ -449,7 +460,9 @@ ], "source": [ "# Download results\n", - "output_dir = tempfile.mkdtemp() # TODO: switch to tempfile.TemporaryDirectory() and use `with`\n", + "output_dir = (\n", + " tempfile.mkdtemp()\n", + ") # TODO: switch to tempfile.TemporaryDirectory() and use `with`\n", "documents.download_all(result_dir=output_dir, progress_bar=True)" ] }, @@ -480,10 +493,10 @@ " for name in all_files:\n", " if not name.endswith(\".json\"):\n", " continue\n", - " \n", - " basename = name.rstrip('.json')\n", + "\n", + " basename = name.rstrip(\".json\")\n", " doc_jsondata = json.loads(archive.read(f\"{basename}.json\"))\n", - " doc_cellsdata = json.loads(archive.read(f\"{basename}.cells\")) \n", + " doc_cellsdata = json.loads(archive.read(f\"{basename}.cells\"))\n", "\n", " visualize_document_bboxes(doc_jsondata, doc_cellsdata)" ] diff --git a/examples/integration_argilla/argilla_upload.ipynb b/examples/integration_argilla/argilla_upload.ipynb index 8998b8b..8497f08 100644 --- a/examples/integration_argilla/argilla_upload.ipynb +++ b/examples/integration_argilla/argilla_upload.ipynb @@ -56,7 +56,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "INPUT_FILE = Path(\"../../data/samples/2206.00785.pdf\")\n", "\n", @@ -65,7 +65,7 @@ "ARGILLA_API_KEY = os.environ[\"ARGILLA_API_KEY\"] # required env var\n", "ARGILLA_DATASET = \"deepsearch-documents\"\n", "# Tokenization\n", - "SPACY_MODEL = \"en_core_web_sm\"\n" + "SPACY_MODEL = \"en_core_web_sm\"" ] }, { diff --git a/examples/kg_download_quick_start/kg_download_quick_start.ipynb b/examples/kg_download_quick_start/kg_download_quick_start.ipynb index 6c69173..bcdc525 100644 --- a/examples/kg_download_quick_start/kg_download_quick_start.ipynb +++ b/examples/kg_download_quick_start/kg_download_quick_start.ipynb @@ -47,7 +47,7 @@ "PROFILE_NAME = notebook_settings.profile # the profile to use\n", "PROJECT_KEY = notebook_settings.proj_key\n", "KG_KEY = notebook_settings.kg_key\n", - "BASE_DIR = './KG-data'" + "BASE_DIR = \"./KG-data\"" ] }, { @@ -98,10 +98,12 @@ "if not os.path.exists(BASE_DIR):\n", " os.mkdir(BASE_DIR)\n", "\n", - " \n", + "\n", "# Raise an error if the base directory is not empty\n", "if len(os.listdir(BASE_DIR)) > 0:\n", - " raise ValueError(f'BASE_DIR must be empty but found the following contents: {os.listdir(BASE_DIR)}')" + " raise ValueError(\n", + " f\"BASE_DIR must be empty but found the following contents: {os.listdir(BASE_DIR)}\"\n", + " )" ] }, { @@ -180,13 +182,15 @@ ], "source": [ "# Download the knowledge graph using urlopen\n", - "zipped_file_path = os.path.join(BASE_DIR, 'kg_data.tar.gz')\n", + "zipped_file_path = os.path.join(BASE_DIR, \"kg_data.tar.gz\")\n", "context = ssl.create_default_context()\n", "context.check_hostname = False\n", - "context.verify_mode=ssl.CERT_NONE\n", + "context.verify_mode = ssl.CERT_NONE\n", "\n", - "with open(zipped_file_path, 'wb+') as download_file, urlopen(download_url, context=context) as response:\n", - " content_length = int(response.getheader('content-length'))\n", + "with open(zipped_file_path, \"wb+\") as download_file, urlopen(\n", + " 
download_url, context=context\n", + ") as response:\n", + " content_length = int(response.getheader(\"content-length\"))\n", " with tqdm(total=100, position=0) as pbar:\n", " for line in response:\n", " download_file.write(line)\n", @@ -210,9 +214,9 @@ "outputs": [], "source": [ "# Save the unzipped KG\n", - "unzipped_dir = os.path.join(BASE_DIR, 'unzipped_data')\n", + "unzipped_dir = os.path.join(BASE_DIR, \"unzipped_data\")\n", "os.mkdir(unzipped_dir)\n", - "with tarfile.open(zipped_file_path, 'r') as f:\n", + "with tarfile.open(zipped_file_path, \"r\") as f:\n", " f.extractall(path=unzipped_dir)" ] }, @@ -273,7 +277,7 @@ ], "source": [ "# Get a list of all the files in the unzipped data\n", - "files = list(os.walk(os.path.join(BASE_DIR, 'unzipped_data')))[0][2]\n", + "files = list(os.walk(os.path.join(BASE_DIR, \"unzipped_data\")))[0][2]\n", "display(sorted(files))" ] }, @@ -299,7 +303,7 @@ " :param filepath: Path to the jsonl file\n", " :return dataframe: A pandas DataFrame corresponding to the data stored in the file\n", " \"\"\"\n", - " with open(filepath, 'r') as f:\n", + " with open(filepath, \"r\") as f:\n", " data = pd.DataFrame([json.loads(line) for line in f])\n", " return data" ] @@ -336,7 +340,7 @@ ], "source": [ "# Show the first record in the materials file\n", - "materials = jsonl2df(os.path.join(BASE_DIR, 'unzipped_data', 'material.jsonl'))\n", + "materials = jsonl2df(os.path.join(BASE_DIR, \"unzipped_data\", \"material.jsonl\"))\n", "display(materials.iloc[0])" ] }, @@ -454,8 +458,16 @@ ], "source": [ "# Show the first few edges\n", - "edges = jsonl2df(os.path.join(BASE_DIR, 'unzipped_data', '_edges.jsonl'))\n", - "edges = edges[['source_collection', 'target_collection', 'source_hash', 'target_hash', 'symmetric']]\n", + "edges = jsonl2df(os.path.join(BASE_DIR, \"unzipped_data\", \"_edges.jsonl\"))\n", + "edges = edges[\n", + " [\n", + " \"source_collection\",\n", + " \"target_collection\",\n", + " \"source_hash\",\n", + " \"target_hash\",\n", + " \"symmetric\",\n", + " ]\n", + "]\n", "display(edges.head())" ] }, @@ -507,15 +519,17 @@ "outputs": [], "source": [ "nodetypes = {\n", - " 'material': os.path.join(BASE_DIR, 'unzipped_data', 'material.jsonl'),\n", - " 'property': os.path.join(BASE_DIR, 'unzipped_data', 'property.jsonl')\n", + " \"material\": os.path.join(BASE_DIR, \"unzipped_data\", \"material.jsonl\"),\n", + " \"property\": os.path.join(BASE_DIR, \"unzipped_data\", \"property.jsonl\"),\n", "}\n", "\n", "for nodetype in nodetypes:\n", " data = jsonl2df(nodetypes[nodetype])\n", " hetero_kg[nodetype].x = torch.eye(data.shape[0])\n", - " hetero_kg[nodetype]['_hash'] = dict((_hash, _idx) for _idx, _hash in enumerate(data['_hash'].to_list()))\n", - " hetero_kg[nodetype]['_name'] = data['_name'].to_list()" + " hetero_kg[nodetype][\"_hash\"] = dict(\n", + " (_hash, _idx) for _idx, _hash in enumerate(data[\"_hash\"].to_list())\n", + " )\n", + " hetero_kg[nodetype][\"_name\"] = data[\"_name\"].to_list()" ] }, { @@ -536,17 +550,26 @@ "outputs": [], "source": [ "# Find the relevant edges\n", - "edges = jsonl2df(os.path.join(BASE_DIR, 'unzipped_data', '_edges.jsonl'))\n", - "edges = edges[(edges.source_collection == 'material') & (edges.target_collection == 'property')]\n", - "edges = [edges['source_hash'].to_list(), edges['target_hash'].to_list()]\n", + "edges = jsonl2df(os.path.join(BASE_DIR, \"unzipped_data\", \"_edges.jsonl\"))\n", + "edges = edges[\n", + " (edges.source_collection == \"material\") & (edges.target_collection == \"property\")\n", + "]\n", + "edges = 
[edges[\"source_hash\"].to_list(), edges[\"target_hash\"].to_list()]\n", "\n", "# Create the edge index\n", "edge_index = []\n", "for hash_mat, hash_prop in zip(*edges):\n", - " edge_index.append([hetero_kg['material']['_hash'][hash_mat], hetero_kg['property']['_hash'][hash_prop]])\n", + " edge_index.append(\n", + " [\n", + " hetero_kg[\"material\"][\"_hash\"][hash_mat],\n", + " hetero_kg[\"property\"][\"_hash\"][hash_prop],\n", + " ]\n", + " )\n", "\n", "# Add edge index to the KG\n", - "hetero_kg['material', 'mat2prop', 'property'].edge_index = torch.tensor(edge_index).long().t()\n", + "hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index = (\n", + " torch.tensor(edge_index).long().t()\n", + ")\n", "\n", "# Make the graph undirected\n", "hetero_kg = ToUndirected()(hetero_kg)" @@ -584,15 +607,15 @@ ], "source": [ "# Summarize the KG\n", - "print('Number of nodes')\n", + "print(\"Number of nodes\")\n", "for node_type in hetero_kg.node_types:\n", - " print(f'\\t{node_type} -> {hetero_kg[node_type].num_nodes}')\n", - "print(f'Total number of nodes: {hetero_kg.num_nodes}')\n", + " print(f\"\\t{node_type} -> {hetero_kg[node_type].num_nodes}\")\n", + "print(f\"Total number of nodes: {hetero_kg.num_nodes}\")\n", "\n", - "print('\\nNumber of edges')\n", + "print(\"\\nNumber of edges\")\n", "for edge_type in hetero_kg.edge_types:\n", - " print(f'\\t{edge_type} -> {hetero_kg[edge_type].num_edges}')\n", - "print(f'Total number of edges: {hetero_kg.num_edges}')" + " print(f\"\\t{edge_type} -> {hetero_kg[edge_type].num_edges}\")\n", + "print(f\"Total number of edges: {hetero_kg.num_edges}\")" ] }, { @@ -704,25 +727,34 @@ ], "source": [ "# Select materials to display\n", - "materials = ['perovskite/Si', 'O(2) Ti(1)', 'A(1) I(3) M(1) Pb(1)', 'O(1) Zn(1)']\n", - "mat_idx = [hetero_kg['material']['_name'].index(mat) for mat in materials]\n", + "materials = [\"perovskite/Si\", \"O(2) Ti(1)\", \"A(1) I(3) M(1) Pb(1)\", \"O(1) Zn(1)\"]\n", + "mat_idx = [hetero_kg[\"material\"][\"_name\"].index(mat) for mat in materials]\n", "\n", "# Get properties corresponding to each material\n", "properties = dict()\n", "for m_idx, material in zip(mat_idx, materials):\n", - " current_edges = hetero_kg['material', 'mat2prop', 'property'].edge_index[0, :] == m_idx\n", - " prop_idx = hetero_kg['material', 'mat2prop', 'property'].edge_index[1, current_edges]\n", - " properties[material] = [hetero_kg['property']['_name'][idx] for idx in prop_idx.tolist()]\n", - " \n", + " current_edges = (\n", + " hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index[0, :] == m_idx\n", + " )\n", + " prop_idx = hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index[\n", + " 1, current_edges\n", + " ]\n", + " properties[material] = [\n", + " hetero_kg[\"property\"][\"_name\"][idx] for idx in prop_idx.tolist()\n", + " ]\n", + "\n", "# Show up to four randomly chosen properties for each material\n", "df = pd.DataFrame()\n", "for mat, prop in properties.items():\n", " # Restrict to four properties\n", " if len(prop) > 4:\n", " prop = [prop[idx] for idx in torch.randperm(len(prop)).tolist()[:4]]\n", - " \n", + "\n", " # Add the row to the dataframe\n", - " curr_dict = dict([('material', [mat])] + [(f'Property{p_idx}', [p]) for p_idx, p in enumerate(prop)])\n", + " curr_dict = dict(\n", + " [(\"material\", [mat])]\n", + " + [(f\"Property{p_idx}\", [p]) for p_idx, p in enumerate(prop)]\n", + " )\n", " curr_df = pd.DataFrame(curr_dict)\n", " df = pd.concat([df, curr_df]).reset_index(drop=True)\n", "\n", @@ -754,13 +786,17 @@ 
], "source": [ "# Find properties linked to perovskite/Si\n", - "m_idx = hetero_kg['material']['_name'].index('perovskite/Si')\n", - "perovskite_edges = hetero_kg['material', 'mat2prop', 'property'].edge_index[0, :] == m_idx\n", - "prop_idx = hetero_kg['material', 'mat2prop', 'property'].edge_index[1, perovskite_edges]\n", - "properties = [hetero_kg['property']['_name'][idx] for idx in prop_idx.tolist()]\n", + "m_idx = hetero_kg[\"material\"][\"_name\"].index(\"perovskite/Si\")\n", + "perovskite_edges = (\n", + " hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index[0, :] == m_idx\n", + ")\n", + "prop_idx = hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index[1, perovskite_edges]\n", + "properties = [hetero_kg[\"property\"][\"_name\"][idx] for idx in prop_idx.tolist()]\n", "\n", "# Check if the desired properties are linked\n", - "print(f'Is perovskite/Si linked to power conversion efficiency: {\"power conversion efficiency\" in properties}')\n", + "print(\n", + " f'Is perovskite/Si linked to power conversion efficiency: {\"power conversion efficiency\" in properties}'\n", + ")\n", "print(f'Is perovskite/Si linked to band gap: {\"band gap\" in properties}')" ] }, diff --git a/examples/nlp_for_materials/nlp_for_materials.ipynb b/examples/nlp_for_materials/nlp_for_materials.ipynb index 5d57cef..189d0cf 100644 --- a/examples/nlp_for_materials/nlp_for_materials.ipynb +++ b/examples/nlp_for_materials/nlp_for_materials.ipynb @@ -49,7 +49,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456" ] @@ -151,11 +151,11 @@ " api=api,\n", " proj_key=PROJ_KEY,\n", " source_path=f\"../../data/samples/{fname}\",\n", - " progress_bar=True\n", - ") \n", + " progress_bar=True,\n", + ")\n", "documents.download_all(result_dir=output_dir)\n", "info = documents.generate_report(result_dir=output_dir)\n", - "print(info) " + "print(info)" ] }, { @@ -181,23 +181,21 @@ " for name in all_files:\n", " if not name.endswith(\".json\"):\n", " continue\n", - " \n", - " #basename = name.rstrip('.json')\n", + "\n", + " # basename = name.rstrip('.json')\n", " doc_json = json.loads(archive.read(name))\n", - " \n", + "\n", " ofile = output_dir / name\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", " fw.write(json.dumps(doc_json, indent=2))\n", - " \n", + "\n", " doc_md = export_to_markdown(doc_json)\n", "\n", " ofile = output_dir / name.replace(\".json\", \".md\")\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", - " fw.write(doc_md)\n", - "\n", - " " + " fw.write(doc_md)" ] }, { @@ -297,7 +295,7 @@ "\n", "res = model.apply_on_doc(doc)\n", "\n", - "insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])\n" + "insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])" ] }, { @@ -321,9 +319,11 @@ } ], "source": [ - "#print(insts.columns)\n", + "# print(insts.columns)\n", "\n", - "materials = insts[(insts[\"type\"]==\"material\") & (insts[\"subtype\"]==\"complex_chemical\")][[\"type\", \"subtype\", \"name\", \"subj_path\"]]\n", + "materials = insts[\n", + " (insts[\"type\"] == \"material\") & (insts[\"subtype\"] == \"complex_chemical\")\n", + "][[\"type\", \"subtype\", \"name\", \"subj_path\"]]\n", 
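As a quick sanity check before narrowing the results down to complex chemicals, it can help to survey which entity types and subtypes the model actually returned. The snippet below is an illustrative aside (not part of the original notebook) and assumes the `insts` DataFrame built in the cell above.

# Illustrative aside: survey the entity types/subtypes produced by the NLP model
# before filtering on subtype == "complex_chemical".
print(insts[["type", "subtype"]].value_counts())
print(insts["name"].nunique(), "distinct entity names found")
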
"print(materials.to_string())" ] }, @@ -549,16 +549,18 @@ "\n", "\n", "# Input query\n", - "search_query = \"\\\"SUBSTITUTED 6-PHENYLNICOTINIC ACIDS AND THEIR USE\\\"\"\n", - "data_collection = ElasticDataCollectionSource(elastic_id=\"default\", index_key=\"patent-uspto\")\n", + "search_query = '\"SUBSTITUTED 6-PHENYLNICOTINIC ACIDS AND THEIR USE\"'\n", + "data_collection = ElasticDataCollectionSource(\n", + " elastic_id=\"default\", index_key=\"patent-uspto\"\n", + ")\n", "page_size = 50\n", "\n", "# Prepare the data query\n", "query = DataQuery(\n", - " search_query, # The search query to be executed\n", - " #source=[\"description.title\", \"description.authors\", \"identifiers\"], # Which fields of documents we want to fetch\n", - " limit=page_size, # The size of each request page\n", - " coordinates=data_collection # The data collection to be queries\n", + " search_query, # The search query to be executed\n", + " # source=[\"description.title\", \"description.authors\", \"identifiers\"], # Which fields of documents we want to fetch\n", + " limit=page_size, # The size of each request page\n", + " coordinates=data_collection, # The data collection to be queries\n", ")\n", "\n", "\n", @@ -567,7 +569,9 @@ "count_query.paginated_task.parameters[\"limit\"] = 0\n", "count_results = api.queries.run(count_query)\n", "expected_total = count_results.outputs[\"data_count\"]\n", - "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n", + "expected_pages = (\n", + " expected_total + page_size - 1\n", + ") // page_size # this is simply a ceiling formula\n", "\n", "print(f\"#-found documents: \", count_results)\n", "\n", @@ -579,7 +583,7 @@ " for row in result_page.outputs[\"data_outputs\"]:\n", " documents.append(row[\"_source\"])\n", "\n", - "print(f'Finished fetching all data. Total is {len(documents)} records.')" + "print(f\"Finished fetching all data. 
Total is {len(documents)} records.\")" ] }, { @@ -694,27 +698,31 @@ "model = init_nlp_model(\"language;term;material\")\n", "model.set_loglevel(\"INFO\")\n", "\n", - "max_items=5\n", + "max_items = 5\n", "\n", "for doc in documents:\n", "\n", " dname = doc[\"file-info\"][\"filename\"]\n", - " \n", - " for i,item in enumerate(doc[\"main-text\"]):\n", + "\n", + " for i, item in enumerate(doc[\"main-text\"]):\n", "\n", " if \"text\" not in item:\n", " continue\n", "\n", - " if i>max_items:\n", + " if i > max_items:\n", " break\n", - " \n", + "\n", " res = model.apply_on_text(item[\"text\"])\n", "\n", - " insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])\n", + " insts = pd.DataFrame(\n", + " res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"]\n", + " )\n", "\n", - " materials = insts[insts[\"type\"]==\"material\"][[\"type\", \"subtype\", \"name\", \"subj_path\"]]\n", + " materials = insts[insts[\"type\"] == \"material\"][\n", + " [\"type\", \"subtype\", \"name\", \"subj_path\"]\n", + " ]\n", "\n", - " if len(materials)>0:\n", + " if len(materials) > 0:\n", " lines = wrapper.wrap(item[\"text\"])\n", " print(f\"\\n {dname}: text-{i}\\n\")\n", " print(\"\\n\".join(lines), \"\\n\")\n", diff --git a/examples/nlp_for_references/nlp_for_references.ipynb b/examples/nlp_for_references/nlp_for_references.ipynb index f6a5773..4f923f6 100644 --- a/examples/nlp_for_references/nlp_for_references.ipynb +++ b/examples/nlp_for_references/nlp_for_references.ipynb @@ -37,7 +37,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456" ] @@ -164,11 +164,11 @@ " api=api,\n", " proj_key=PROJ_KEY,\n", " source_path=f\"../../data/samples/{fname}\",\n", - " progress_bar=True\n", - ") \n", + " progress_bar=True,\n", + ")\n", "documents.download_all(result_dir=output_dir)\n", "info = documents.generate_report(result_dir=output_dir)\n", - "print(info) " + "print(info)" ] }, { @@ -194,23 +194,21 @@ " for name in all_files:\n", " if not name.endswith(\".json\"):\n", " continue\n", - " \n", - " #basename = name.rstrip('.json')\n", + "\n", + " # basename = name.rstrip('.json')\n", " doc_json = json.loads(archive.read(name))\n", - " \n", + "\n", " ofile = output_dir / name\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", " fw.write(json.dumps(doc_json, indent=2))\n", - " \n", + "\n", " doc_md = export_to_markdown(doc_json)\n", "\n", " ofile = output_dir / name.replace(\".json\", \".md\")\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", - " fw.write(doc_md)\n", - "\n", - " " + " fw.write(doc_md)" ] }, { @@ -241,26 +239,25 @@ "source": [ "def resolve(path, doc):\n", "\n", - " if len(path)>1 and path[0]==\"#\":\n", + " if len(path) > 1 and path[0] == \"#\":\n", " return resolve(path[1:], doc)\n", - " \n", - " if len(path)==1 and isinstance(doc, dict):\n", + "\n", + " if len(path) == 1 and isinstance(doc, dict):\n", " return doc[path[0]]\n", "\n", - " elif len(path)==1 and isinstance(doc, list):\n", + " elif len(path) == 1 and isinstance(doc, list):\n", " ind = int(path[0])\n", " return doc[ind]\n", - " \n", - " elif len(path)>1 and isinstance(doc, dict):\n", + "\n", + " elif len(path) > 1 and isinstance(doc, dict):\n", " return resolve(path[1:], 
doc[path[0]])\n", "\n", - " elif len(path)>1 and isinstance(doc, list):\n", + " elif len(path) > 1 and isinstance(doc, list):\n", " ind = int(path[0])\n", " return resolve(path[1:], doc[ind])\n", "\n", " else:\n", - " return None\n", - " " + " return None" ] }, { @@ -380,22 +377,21 @@ "props = pd.DataFrame(res[\"properties\"][\"data\"], columns=res[\"properties\"][\"headers\"])\n", "insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])\n", "\n", - "refs = props[(props[\"label\"]==\"reference\") & (props[\"confidence\"]>0.8)]\n", + "refs = props[(props[\"label\"] == \"reference\") & (props[\"confidence\"] > 0.8)]\n", "\n", "cnt = 0\n", - "for i,ref in refs.iterrows():\n", - " #print(ref)\n", + "for i, ref in refs.iterrows():\n", + " # print(ref)\n", "\n", " item = resolve(ref[\"subj_path\"].split(\"/\"), res)\n", " print(\"\\n\".join(textwrap.wrap(item[\"text\"], 70)))\n", "\n", - " ents = insts[insts[\"subj_hash\"]==item[\"subj_hash\"]][[\"subtype\", \"name\"]]\n", + " ents = insts[insts[\"subj_hash\"] == item[\"subj_hash\"]][[\"subtype\", \"name\"]]\n", " print(\"\\nentities:\\n\", ents, \"\\n\\n\")\n", "\n", - " \n", - " cnt+=1\n", - " if cnt>5:\n", - " break\n" + " cnt += 1\n", + " if cnt > 5:\n", + " break" ] }, { diff --git a/examples/nlp_on_documents/nlp_on_documents.ipynb b/examples/nlp_on_documents/nlp_on_documents.ipynb index c52e660..507f193 100644 --- a/examples/nlp_on_documents/nlp_on_documents.ipynb +++ b/examples/nlp_on_documents/nlp_on_documents.ipynb @@ -49,7 +49,7 @@ "notebook_settings = ProjectNotebookSettings()\n", "\n", "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456" ] @@ -151,11 +151,11 @@ " api=api,\n", " proj_key=PROJ_KEY,\n", " source_path=f\"../../data/samples/{fname}\",\n", - " progress_bar=True\n", - ") \n", + " progress_bar=True,\n", + ")\n", "documents.download_all(result_dir=output_dir)\n", "info = documents.generate_report(result_dir=output_dir)\n", - "print(info) " + "print(info)" ] }, { @@ -181,23 +181,21 @@ " for name in all_files:\n", " if not name.endswith(\".json\"):\n", " continue\n", - " \n", - " #basename = name.rstrip('.json')\n", + "\n", + " # basename = name.rstrip('.json')\n", " doc_json = json.loads(archive.read(name))\n", - " \n", + "\n", " ofile = output_dir / name\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", " fw.write(json.dumps(doc_json, indent=2))\n", - " \n", + "\n", " doc_md = export_to_markdown(doc_json)\n", "\n", " ofile = output_dir / name.replace(\".json\", \".md\")\n", " print(f\"writing {ofile}\")\n", " with ofile.open(\"w\") as fw:\n", - " fw.write(doc_md)\n", - "\n", - " " + " fw.write(doc_md)" ] }, { @@ -266,33 +264,36 @@ "with open(ifile) as fr:\n", " doc = json.load(fr)\n", "\n", - "terms = [\"METAL\",\n", - " \"COPPER\",\n", - " \"COBALT\",\n", - " \"TUNGSTEN\",\n", - " \"MOLYBDENUM\",\n", - " \"RUTHENIUM\",\n", - " \"Self-assembly material\", \n", - " \"Self-assembly molecular layer\",\n", - " \"surface modification\", \n", - " \"inhibitor\", \n", - " \"corrosion inhibitor\", \n", - " \"adsorption\", \"selectivity\", \n", - " \"Anti-corrosion\", \n", - " \"contact angle\",\n", - " \"Area selective deposition\",\n", - " \"Advanced interconnect metallization\",\n", - " \"Integrated circuits\",\n", - " \"Atomic layer deposition\"]\n", - "\n", 
- "term_hist = [ {\"key\":term, \"count\":0} for term in terms]\n", - "\n", - "for i,item in enumerate(doc[\"main-text\"]):\n", + "terms = [\n", + " \"METAL\",\n", + " \"COPPER\",\n", + " \"COBALT\",\n", + " \"TUNGSTEN\",\n", + " \"MOLYBDENUM\",\n", + " \"RUTHENIUM\",\n", + " \"Self-assembly material\",\n", + " \"Self-assembly molecular layer\",\n", + " \"surface modification\",\n", + " \"inhibitor\",\n", + " \"corrosion inhibitor\",\n", + " \"adsorption\",\n", + " \"selectivity\",\n", + " \"Anti-corrosion\",\n", + " \"contact angle\",\n", + " \"Area selective deposition\",\n", + " \"Advanced interconnect metallization\",\n", + " \"Integrated circuits\",\n", + " \"Atomic layer deposition\",\n", + "]\n", + "\n", + "term_hist = [{\"key\": term, \"count\": 0} for term in terms]\n", + "\n", + "for i, item in enumerate(doc[\"main-text\"]):\n", "\n", " if \"text\" not in item:\n", " continue\n", - " \n", - " for j,term in enumerate(terms):\n", + "\n", + " for j, term in enumerate(terms):\n", " term_hist[j][\"count\"] += item[\"text\"].count(term.lower())\n", "\n", "df = pd.DataFrame(term_hist)\n", @@ -336,7 +337,7 @@ "from tabulate import tabulate\n", "\n", "models = load_pretrained_nlp_models()\n", - "#print(f\"models: {models}\")" + "# print(f\"models: {models}\")" ] }, { @@ -372,17 +373,17 @@ "\n", "model = init_nlp_model(\"language;term\")\n", "\n", - "for i,item in enumerate(doc[\"main-text\"]):\n", + "for i, item in enumerate(doc[\"main-text\"]):\n", "\n", " if \"text\" in item:\n", " res = model.apply_on_text(item[\"text\"])\n", - " #print(res.keys())\n", + " # print(res.keys())\n", "\n", - " #print(item[\"text\"])\n", - " #print(tabulate(res[\"instances\"][\"data\"], \n", + " # print(item[\"text\"])\n", + " # print(tabulate(res[\"instances\"][\"data\"],\n", " # headers=res[\"instances\"][\"headers\"]))\n", "\n", - " if i>10:\n", + " if i > 10:\n", " break\n", "\n", "\n", @@ -390,7 +391,7 @@ "\n", "df = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])\n", "\n", - "terms = df[df[\"type\"]==\"term\"][[\"type\", \"name\", \"subj_path\"]]\n", + "terms = df[df[\"type\"] == \"term\"][[\"type\", \"name\", \"subj_path\"]]\n", "print(terms)" ] }, @@ -625,15 +626,15 @@ ], "source": [ "nodes = read_nodes_in_dataframe(\"./glm/nodes.csv\")\n", - "#print(nodes)\n", + "# print(nodes)\n", "\n", "# Get all terms of the document\n", - "terms = nodes[nodes[\"name\"]==\"term\"][[\"total-count\", \"nodes-text\"]]\n", + "terms = nodes[nodes[\"name\"] == \"term\"][[\"total-count\", \"nodes-text\"]]\n", "print(terms)\n", "\n", "# Get all terms of the document with `composition`\n", "res = expand_terms(glm, \"composition\")\n", - "#show_query_result(res)\n", + "# show_query_result(res)\n", "\n", "last_result = res[\"result\"][-1][\"nodes\"]\n", "expanded_terms = pd.DataFrame(last_result[\"data\"], columns=last_result[\"headers\"])\n", diff --git a/examples/qa_doc_collection/doc_collection_qa.ipynb b/examples/qa_doc_collection/doc_collection_qa.ipynb index 193fde3..c1150b6 100644 --- a/examples/qa_doc_collection/doc_collection_qa.ipynb +++ b/examples/qa_doc_collection/doc_collection_qa.ipynb @@ -1,983 +1,998 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "99f717ef-4cba-4300-b258-0b1c248cb873", - "metadata": {}, - "source": [ - "# RAG and Semantic Retrieval on a Document Collection\n", - "\n", - "Deep Search allows users to interact with the documents using conversational AI, i.e. 
you interact with a virtual assistant which answer your questions using the information in the corpus.\n", - "\n", - "In this example we demonstrate how achive the same interaction programmatically.\n", - "\n", - "### Access required\n", - "\n", - "The content of this notebook requires access to Deep Search capabilities which are not\n", - "available on the public access system.\n", - "\n", - "[Contact us](https://ds4sd.github.io) if you are interested in exploring\n", - "these Deep Search capabilities.\n", - "\n", - "\n", - "### GenAI Integration required\n", - "\n", - "When interacting with the virtual assistant, Deep Search requires a connection to a Generative AI API. Currently, we support connections to [watsonx.ai](https://www.ibm.com/products/watsonx-ai) or the IBM-internal GenAI platform BAM.\n", - "\n", - "Deep Search allows custom GenAI configurations for each project.\n", - "In the following example you will require to work in a project which has such GenAI capabilities activated." - ] - }, - { - "cell_type": "markdown", - "id": "256aef50-71a1-4278-9b22-17cb99a6566e", - "metadata": {}, - "source": [ - "### Set notebook parameters\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "5b244bdd-1b52-41ff-b63e-9a203570d210", - "metadata": {}, - "outputs": [], - "source": [ - "from dsnotebooks.settings import CollQANotebookSettings\n", - "\n", - "# notebooks settings auto-loaded from .env / env vars\n", - "notebook_settings = CollQANotebookSettings()\n", - "\n", - "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", - "INDEX_KEY = notebook_settings.sem_on_idx_key # the collection to use\n", - "\n", - "SKIP_INGESTED_DOCS = notebook_settings.skip_ingested_docs # whether to skip any already semantically ingested docs\n", - "\n", - "RETR_K = notebook_settings.retr_k # the number of search results to retrieve\n", - "TEXT_WEIGHT = notebook_settings.text_weight # the weight of lexical search (0.0: semantic-only, 1.0: lexical-only, anything in between: hybrid search)\n", - "RERANK = notebook_settings.rerank # whether to rerank the search results\n", - "RAISE = notebook_settings.raise_on_sem_err # whether semantic operation errors should raise an exception or be reflected in response fields" - ] - }, - { - "cell_type": "markdown", - "id": "a5269060-bb5f-4fe3-9b64-547202db6714", - "metadata": {}, - "source": [ - "### Import example dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5d236ea0-db1c-4171-8e11-cdd0bad69d66", - "metadata": {}, - "outputs": [], - "source": [ - "# Import standard dependenices\n", - "import pandas as pd\n", - "import rich\n", - "\n", - "# IPython utilities\n", - "from IPython.display import display, Markdown\n", - "\n", - "# Import the deepsearch-toolkit\n", - "from deepsearch.cps.client.api import CpsApi\n", - "from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource\n", - "from deepsearch.cps.queries import DataQuery, RAGQuery, SemanticQuery\n", - "from deepsearch.cps.queries.results import RAGResult, SearchResult, SearchResultItem\n" - ] - }, - { - "cell_type": "markdown", - "id": "293c249b-6018-46f2-b4d8-795f994d4729", - "metadata": {}, - "source": [ - "### Connect to Deep Search" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "9c108432-a285-4c7b-a996-008ac3ff3d7a", - "metadata": {}, - "outputs": [], - "source": [ - "api = CpsApi.from_env(profile_name=PROFILE_NAME)" - ] - }, - { - 
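The `TEXT_WEIGHT` setting above controls how lexical and semantic retrieval are blended. As a rough mental model only (an illustrative sketch of the weighting idea, not the actual Deep Search scoring code), a hybrid score can be thought of as a convex combination of the two scores:

# Illustrative sketch of hybrid-search weighting (assumption, not the service implementation):
# text_weight = 0.0 -> purely semantic, 1.0 -> purely lexical, anything in between -> hybrid.
def hybrid_score(lexical_score: float, semantic_score: float, text_weight: float) -> float:
    return text_weight * lexical_score + (1.0 - text_weight) * semantic_score

print(hybrid_score(lexical_score=0.2, semantic_score=0.9, text_weight=0.0))  # 0.9  (semantic only)
print(hybrid_score(lexical_score=0.2, semantic_score=0.9, text_weight=0.5))  # 0.55 (hybrid)
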
"cell_type": "markdown", - "metadata": {}, - "source": [ - "### Utils" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def render_provenance_url(\n", - " api: CpsApi,\n", - " coords: ElasticProjectDataCollectionSource,\n", - " retr_item: SearchResultItem,\n", - "):\n", - " ## compute URL to the document in the Deep Search UI\n", - " item_index = int(retr_item.main_path[retr_item.main_path.rfind(\".\")+1:])\n", - " doc_url = api.documents.generate_url(\n", - " document_hash=retr_item.doc_hash,\n", - " data_source=coords,\n", - " item_index=item_index,\n", - " )\n", - " display(Markdown(f\"The provenance of the answer can be inspected on the [source document]({doc_url}).\"))" - ] - }, - { - "cell_type": "markdown", - "id": "38cde869-46d1-4833-8eb3-2381b5e5fb68", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Prepare the collection coordinates:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "coll_coords = ElasticProjectDataCollectionSource(\n", - " proj_key=PROJ_KEY,\n", - " index_key=INDEX_KEY,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are using a small collection, so we can just list its documents to get an idea of its contents (for more details on querying, check the [Data Query Quick Start](https://github.com/DS4SD/deepsearch-examples/tree/main/examples/data_query_quick_start))." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "2b38875e-f39c-4dd5-9d42-3ffca5d0bdac", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Finished fetching all data. Total is 10 records.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
<table border=\"1\" class=\"dataframe\">\n",
- "  <thead>\n",
- "    <tr style=\"text-align: right;\">\n",
- "      <th></th>\n",
- "      <th>Filename</th>\n",
- "      <th>DocHash</th>\n",
- "    </tr>\n",
- "  </thead>\n",
- "  <tbody>\n",
- "    <tr><th>0</th><td>natural-language-processing.pdf</td><td>000f892ddcc67f165797a96e94f44fb9e0697c7912a383...</td></tr>\n",
- "    <tr><th>1</th><td>ibm-z.pdf</td><td>07e56eb5a10f725fccad9386d126b7b05bec1fa71b9b3d...</td></tr>\n",
- "    <tr><th>2</th><td>ibm.pdf</td><td>234bc5cf2c860d49574b0ff7191c354b7bbc11472a0997...</td></tr>\n",
- "    <tr><th>3</th><td>ibm-the-great-mind-challenge.pdf</td><td>335120a57b418655196e3315b562a2f9e89cedeaef9318...</td></tr>\n",
- "    <tr><th>4</th><td>turing-award.pdf</td><td>8a7c91a269abc3063df9f4e19f7961ddb8e2393fa0f272...</td></tr>\n",
- "    <tr><th>5</th><td>ibm-research.pdf</td><td>b30bc667a324ae111d025526563b674a8d3fd869bc07c8...</td></tr>\n",
- "    <tr><th>6</th><td>artificial-intelligence.pdf</td><td>b60a87c1d62a59d517f2fd6f2d3ea1a96c58b651332a8b...</td></tr>\n",
- "    <tr><th>7</th><td>machine-learning.pdf</td><td>e470e7b42a92c8e5f25094362361947b9203e0074c2223...</td></tr>\n",
- "    <tr><th>8</th><td>deep-blue-chess-computer.pdf</td><td>fa7ce2f66a7a5e061813d36348425f81d9e7ebc23454d8...</td></tr>\n",
- "    <tr><th>9</th><td>red-hat.pdf</td><td>fb53bb607f2e9642d7fe044585d1dcdb052c57febe1b87...</td></tr>\n",
- "  </tbody>\n",
- "</table>\n",
- "</div>
" - ], - "text/plain": [ - " Filename \\\n", - "0 natural-language-processing.pdf \n", - "1 ibm-z.pdf \n", - "2 ibm.pdf \n", - "3 ibm-the-great-mind-challenge.pdf \n", - "4 turing-award.pdf \n", - "5 ibm-research.pdf \n", - "6 artificial-intelligence.pdf \n", - "7 machine-learning.pdf \n", - "8 deep-blue-chess-computer.pdf \n", - "9 red-hat.pdf \n", - "\n", - " DocHash \n", - "0 000f892ddcc67f165797a96e94f44fb9e0697c7912a383... \n", - "1 07e56eb5a10f725fccad9386d126b7b05bec1fa71b9b3d... \n", - "2 234bc5cf2c860d49574b0ff7191c354b7bbc11472a0997... \n", - "3 335120a57b418655196e3315b562a2f9e89cedeaef9318... \n", - "4 8a7c91a269abc3063df9f4e19f7961ddb8e2393fa0f272... \n", - "5 b30bc667a324ae111d025526563b674a8d3fd869bc07c8... \n", - "6 b60a87c1d62a59d517f2fd6f2d3ea1a96c58b651332a8b... \n", - "7 e470e7b42a92c8e5f25094362361947b9203e0074c2223... \n", - "8 fa7ce2f66a7a5e061813d36348425f81d9e7ebc23454d8... \n", - "9 fb53bb607f2e9642d7fe044585d1dcdb052c57febe1b87... " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Prepare the data query\n", - "query = DataQuery(\n", - " search_query=\"*\", # The search query to be executed\n", - " source=[ # Which fields of documents we want to fetch\n", - " \"file-info.document-hash\",\n", - " \"file-info.filename\",\n", - " # \"description.title\",\n", - " ],\n", - " coordinates=coll_coords, # The data collection to be queries\n", - ")\n", - "\n", - "# Query Deep Search for the documents matching the query\n", - "results = []\n", - "query_results = api.queries.run(query)\n", - "for row in query_results.outputs[\"data_outputs\"]:\n", - " # Add row to results table\n", - " results.append({\n", - " \"Filename\": row[\"_source\"][\"file-info\"][\"filename\"],\n", - " \"DocHash\": row[\"_source\"][\"file-info\"][\"document-hash\"],\n", - " # \"Title\": row[\"_source\"].get(\"description\", {}).get(\"title\"),\n", - " })\n", - "\n", - "print(f'Finished fetching all data. 
Total is {len(results)} records.')\n", - "\n", - "# Visualize the table with all results\n", - "df = pd.json_normalize(results)\n", - "display(df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare source" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from deepsearch.cps.client.components.documents import PrivateDataCollectionSource, PrivateDataDocumentSource, PublicDataDocumentSource\n", - "\n", - "data_source = PrivateDataCollectionSource(\n", - " source=coll_coords,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Ingestion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the cell below we show how to semantically index your collection (indexing of already indexed docs is controlled via param `skip_ingested_docs`):" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/pva/work/github.com/DS4SD/deepsearch-examples/.venv/lib/python3.10/site-packages/pydantic/main.py:314: UserWarning: Pydantic serializer warnings:\n", - " Expected `list[str]` but got `_LiteralGenericAlias` - serialized value may not be as expected\n", - " return self.__pydantic_serializer__.to_python(\n" - ] - }, - { - "data": { - "text/plain": [ - "{'ing_out': {}}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# launch the ingestion of the collection for DocumentQA\n", - "task = api.documents.semantic_ingest(\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " skip_ingested_docs=SKIP_INGESTED_DOCS,\n", - ")\n", - "\n", - "# wait for the ingestion task to finish\n", - "api.tasks.wait_for(PROJ_KEY, task.task_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## RAG" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "ee573e76-98ea-43ce-a2ba-a81f64b3adf3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RAGResult(\n",
-                            "    answers=[\n",
-                            "        RAGAnswerItem(\n",
-                            "            answer='The IBM lab in Zurich is located in Rüschlikon, Switzerland.',\n",
-                            "            grounding=RAGGroundingInfo(\n",
-                            "                retr_items=[\n",
-                            "                    SearchResultItem(\n",
-                            "                        doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "                        chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, \n",
-                            "ZRL) is the European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, \n",
-                            "Switzerland.',\n",
-                            "                        main_path='main-text.70',\n",
-                            "                        path_group=['main-text.69', 'main-text.70'],\n",
-                            "                        source_is_text=True\n",
-                            "                    )\n",
-                            "                ],\n",
-                            "                gen_ctx_paths=[\n",
-                            "                    'main-text.58',\n",
-                            "                    'main-text.59',\n",
-                            "                    'main-text.60',\n",
-                            "                    'main-text.61',\n",
-                            "                    'main-text.62',\n",
-                            "                    'main-text.63',\n",
-                            "                    'main-text.64',\n",
-                            "                    'main-text.65',\n",
-                            "                    'main-text.66',\n",
-                            "                    'main-text.67',\n",
-                            "                    'main-text.68',\n",
-                            "                    'main-text.69',\n",
-                            "                    'main-text.70',\n",
-                            "                    'main-text.71',\n",
-                            "                    'main-text.72',\n",
-                            "                    'main-text.73',\n",
-                            "                    'main-text.74',\n",
-                            "                    'main-text.75',\n",
-                            "                    'main-text.76'\n",
-                            "                ]\n",
-                            "            ),\n",
-                            "            prompt=None\n",
-                            "        )\n",
-                            "    ],\n",
-                            "    search_result_items=[\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
-                            "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
-                            "            main_path='main-text.70',\n",
-                            "            path_group=['main-text.69', 'main-text.70'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
-                            "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
-                            "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
-                            "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
-                            "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
-                            "solutions.',\n",
-                            "            main_path='main-text.71',\n",
-                            "            path_group=['main-text.69', 'main-text.71'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \n",
-                            "scientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \n",
-                            "Industry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.',\n",
-                            "            main_path='main-text.74',\n",
-                            "            path_group=['main-text.69', 'main-text.74'],\n",
-                            "            source_is_text=True\n",
-                            "        )\n",
-                            "    ]\n",
-                            ")\n",
-                            "
\n" - ], - "text/plain": [ - "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswer\u001b[0m=\u001b[32m'The IBM lab in Zurich is located in Rüschlikon, Switzerland.'\u001b[0m,\n", - " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, \u001b[0m\n", - "\u001b[32mZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, \u001b[0m\n", - "\u001b[32mSwitzerland.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.58'\u001b[0m,\n", - " \u001b[32m'main-text.59'\u001b[0m,\n", - " \u001b[32m'main-text.60'\u001b[0m,\n", - " \u001b[32m'main-text.61'\u001b[0m,\n", - " \u001b[32m'main-text.62'\u001b[0m,\n", - " \u001b[32m'main-text.63'\u001b[0m,\n", - " \u001b[32m'main-text.64'\u001b[0m,\n", - " \u001b[32m'main-text.65'\u001b[0m,\n", - " \u001b[32m'main-text.66'\u001b[0m,\n", - " \u001b[32m'main-text.67'\u001b[0m,\n", - " \u001b[32m'main-text.68'\u001b[0m,\n", - " \u001b[32m'main-text.69'\u001b[0m,\n", - " \u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[32m'main-text.72'\u001b[0m,\n", - " \u001b[32m'main-text.73'\u001b[0m,\n", - " \u001b[32m'main-text.74'\u001b[0m,\n", - " \u001b[32m'main-text.75'\u001b[0m,\n", - " \u001b[32m'main-text.76'\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", - "\u001b[32mEuropean branch of IBM Research. 
It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", - "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", - "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", - "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", - "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", - "\u001b[32msolutions.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \u001b[0m\n", - "\u001b[32mscientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \u001b[0m\n", - "\u001b[32mIndustry Solutions and Security Research. 
The lab is currently managed by Alessandro Curioni.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.74'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.74'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"Where is the IBM lab in Zurich?\"\n", - "\n", - "# submit natural-language query on collection\n", - "question_query = RAGQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - "\n", - " ## optional retrieval params\n", - " retr_k=RETR_K,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Additionally, we can generate a provenance URL to the document in the Deep Search UI:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyYjMwYmM2NjdhMzI0YWUxMTFkMDI1NTI2NTYzYjY3NGE4ZDNmZDg2OWJjMDdjOGZkMjA0YWE5NWIwNWQ0MWYwYyU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmIzMGJjNjY3YTMyNGFlMTExZDAyNTUyNjU2M2I2NzRhOGQzZmQ4NjliYzA3YzhmZDIwNGFhOTViMDVkNDFmMGMlMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E3MCU3RCU3RA%3D%3D)." 
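The provenance link above is produced by the `render_provenance_url` helper, which derives the item index from the `main_path` of the retrieved chunk. The snippet below simply traces that derivation for the answer shown above.

# Tracing the index derivation used by render_provenance_url (value taken from the output above):
main_path = "main-text.70"
item_index = int(main_path[main_path.rfind(".") + 1 :])
print(item_index)  # 70 -> the paragraph position passed to api.documents.generate_url(...)
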
- ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "render_provenance_url(api=api, coords=coll_coords, retr_item=rag_result.answers[0].grounding.retr_items[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us try out a different question on our document corpus.\n", - "Here we also include (commented out) various additional parameters the user can optionally set:\n", - "- `retr_k`: number of items to retrieve\n", - "- `text_weight`: weight of lexical search (`0.0`: fully semantic search, `1.0`: fully lexical search, anything in-between: hybrid search)\n", - "- `rerank`: whether to rerank the retrieval results\n", - "- `gen_ctx_extr_method` (Literal[\"window\", \"page\"], optional): method for gen context extraction from document; defaults to \"window\"\n", - "- `gen_ctx_window_size` (int, optional): (relevant only if `gen_ctx_extr_method` is \"window\") max chars to use for extracted gen context (actual extraction quantized on doc item level); defaults to 5000\n", - "- `gen_ctx_window_lead_weight` (float, optional): (relevant only if `gen_ctx_extr_method` is \"window\") weight of leading text for distributing remaining window size after extracting the `main_path`; defaults to 0.5 (centered around `main_path`)\n", - "- `return_prompt` (bool, optional): whether to return the instantiated prompt; defaults to False\n", - "- `gen_timeout` (float, optional): timeout for LLM generation; defaults to None, i.e. determined by system\n", - "\n", - "For more details refer to `deepsearch.cps.queries.RAGQuery`." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "89d95a17-1569-4c90-a983-8ca437b7569d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RAGResult(\n",
-                            "    answers=[\n",
-                            "        RAGAnswerItem(\n",
-                            "            answer='The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in \n",
-                            "the field of computer gaming and artificial intelligence.',\n",
-                            "            grounding=RAGGroundingInfo(\n",
-                            "                retr_items=[\n",
-                            "                    SearchResultItem(\n",
-                            "                        doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-                            "                        chunk='History and relationships to other fields\\nThe term machine learning was coined in \n",
-                            "1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \n",
-                            "$^{[10][11]}$ The synonym self-teaching computers was also used in this time period. [12][13]',\n",
-                            "                        main_path='main-text.6',\n",
-                            "                        path_group=['main-text.5', 'main-text.6'],\n",
-                            "                        source_is_text=True\n",
-                            "                    )\n",
-                            "                ],\n",
-                            "                gen_ctx_paths=[\n",
-                            "                    'main-text.1',\n",
-                            "                    'main-text.2',\n",
-                            "                    'main-text.3',\n",
-                            "                    'main-text.4',\n",
-                            "                    'main-text.5',\n",
-                            "                    'main-text.6',\n",
-                            "                    'main-text.7',\n",
-                            "                    'main-text.8',\n",
-                            "                    'main-text.9',\n",
-                            "                    'main-text.10'\n",
-                            "                ]\n",
-                            "            ),\n",
-                            "            prompt=None\n",
-                            "        )\n",
-                            "    ],\n",
-                            "    search_result_items=[\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-                            "            chunk='History and relationships to other fields\\nThe term machine learning was coined in 1959 by \n",
-                            "Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \n",
-                            "$^{[10][11]}$ The synonym self-teaching computers was also used in this time period. [12][13]',\n",
-                            "            main_path='main-text.6',\n",
-                            "            path_group=['main-text.5', 'main-text.6'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-                            "            chunk=\"Machine learning\\nMachine learning (ML) is an umbrella term for solving problems for which \n",
-                            "development of algorithms by human programmers would be cost-prohibitive, and instead the problems are solved by \n",
-                            "helping machines 'discover ' their 'own ' algorithms, $^{[1]}$ without needing to be explicitly told what to do by \n",
-                            "any human-developed algorithms. $^{[2]}$ Recently, generative artificial neural networks have been able to surpass \n",
-                            "results of many previous approaches. $^{[3][4]}$ Machine-learning approaches have been applied to large language \n",
-                            "models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to \n",
-                            "develop algorithms to perform the needed tasks. [5][6]\",\n",
-                            "            main_path='main-text.2',\n",
-                            "            path_group=['main-text.1', 'main-text.2'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-                            "            chunk='Artificial intelligence\\nMachine learning (ML), reorganized and recognized as its own field, \n",
-                            "started to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling \n",
-                            "solvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from \n",
-                            "AI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory. [24]',\n",
-                            "            main_path='main-text.15',\n",
-                            "            path_group=['main-text.10', 'main-text.15'],\n",
-                            "            source_is_text=True\n",
-                            "        )\n",
-                            "    ]\n",
-                            ")\n",
-                            "
\n" - ], - "text/plain": [ - "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswer\u001b[0m=\u001b[32m'The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in \u001b[0m\n", - "\u001b[32mthe field of computer gaming and artificial intelligence.'\u001b[0m,\n", - " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'History and relationships to other fields\\nThe term machine learning was coined in \u001b[0m\n", - "\u001b[32m1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \u001b[0m\n", - "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ The synonym self-teaching computers was also used in this time period. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m12\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.6'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.6'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.1'\u001b[0m,\n", - " \u001b[32m'main-text.2'\u001b[0m,\n", - " \u001b[32m'main-text.3'\u001b[0m,\n", - " \u001b[32m'main-text.4'\u001b[0m,\n", - " \u001b[32m'main-text.5'\u001b[0m,\n", - " \u001b[32m'main-text.6'\u001b[0m,\n", - " \u001b[32m'main-text.7'\u001b[0m,\n", - " \u001b[32m'main-text.8'\u001b[0m,\n", - " \u001b[32m'main-text.9'\u001b[0m,\n", - " \u001b[32m'main-text.10'\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'History and relationships to other fields\\nThe term machine learning was coined in 1959 by \u001b[0m\n", - "\u001b[32mArthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \u001b[0m\n", - "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ The synonym self-teaching computers was also used in this time period. 
\u001b[0m\u001b[32m[\u001b[0m\u001b[32m12\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.6'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.6'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m\"Machine\u001b[0m\u001b[32m learning\\nMachine learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mML\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is an umbrella term for solving problems for which \u001b[0m\n", - "\u001b[32mdevelopment of algorithms by human programmers would be cost-prohibitive, and instead the problems are solved by \u001b[0m\n", - "\u001b[32mhelping machines 'discover ' their 'own ' algorithms, $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ without needing to be explicitly told what to do by \u001b[0m\n", - "\u001b[32many human-developed algorithms. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Recently, generative artificial neural networks have been able to surpass \u001b[0m\n", - "\u001b[32mresults of many previous approaches. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m3\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Machine-learning approaches have been applied to large language \u001b[0m\n", - "\u001b[32mmodels, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to \u001b[0m\n", - "\u001b[32mdevelop algorithms to perform the needed tasks. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m5\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m6\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\"\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.2'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.1'\u001b[0m, \u001b[32m'main-text.2'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Artificial intelligence\\nMachine learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mML\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, reorganized and recognized as its own field, \u001b[0m\n", - "\u001b[32mstarted to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling \u001b[0m\n", - "\u001b[32msolvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from \u001b[0m\n", - "\u001b[32mAI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory. 
\u001b[0m\u001b[32m[\u001b[0m\u001b[32m24\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.15'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.10'\u001b[0m, \u001b[32m'main-text.15'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"Who came up with the term 'machine learning'?\"\n", - "\n", - "# submit natural-language query on collection\n", - "question_query = RAGQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - "\n", - " ## optional retrieval params\n", - " retr_k=RETR_K,\n", - " # text_weight=TEXT_WEIGHT,\n", - " # rerank=RERANK,\n", - "\n", - " ## optional generation params\n", - " # model_id=\"ibm-mistralai/mixtral-8x7b-instruct-v01-q\",\n", - " # gen_params={\"random_seed\": 42, \"max_new_tokens\": 1024},\n", - " # prompt_template=\"Answer the query based on the context.\\n\\nContext: {{ context }}\\n\\nQuery: {{ query }}\",\n", - "\n", - " # gen_ctx_extr_method=\"window\",\n", - " # gen_ctx_window_size=5000,\n", - " # gen_ctx_window_lead_weight=0.5\n", - " # return_prompt=True,\n", - " # gen_timeout=10.0,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As seen by the returned `doc_hash`, this answer came from a different document than the previous one." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyZTQ3MGU3YjQyYTkyYzhlNWYyNTA5NDM2MjM2MTk0N2I5MjAzZTAwNzRjMjIyMzUwNWI0OTIxOTQwZWMwNzVhMSU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmU0NzBlN2I0MmE5MmM4ZTVmMjUwOTQzNjIzNjE5NDdiOTIwM2UwMDc0YzIyMjM1MDViNDkyMTk0MGVjMDc1YTElMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E2JTdEJTdE)." 
- ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "render_provenance_url(api=api, coords=coll_coords, retr_item=rag_result.answers[0].grounding.retr_items[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Semantic retrieval\n", - "\n", - "Besides RAG, which includes natural language generation, a user may only be interested in\n", - "the semantic retrieval part.\n", - "\n", - "This can be obtained very similarly to RAG, as shown below:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SearchResult(\n",
-                            "    search_result_items=[\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
-                            "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
-                            "            main_path='main-text.70',\n",
-                            "            path_group=['main-text.69', 'main-text.70'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
-                            "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
-                            "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
-                            "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
-                            "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
-                            "solutions.',\n",
-                            "            main_path='main-text.71',\n",
-                            "            path_group=['main-text.69', 'main-text.71'],\n",
-                            "            source_is_text=True\n",
-                            "        ),\n",
-                            "        SearchResultItem(\n",
-                            "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-                            "            chunk='Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \n",
-                            "scientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \n",
-                            "Industry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.',\n",
-                            "            main_path='main-text.74',\n",
-                            "            path_group=['main-text.69', 'main-text.74'],\n",
-                            "            source_is_text=True\n",
-                            "        )\n",
-                            "    ]\n",
-                            ")\n",
-                            "
\n" - ], - "text/plain": [ - "\u001b[1;35mSearchResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", - "\u001b[32mEuropean branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", - "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", - "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", - "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", - "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", - "\u001b[32msolutions.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \u001b[0m\n", - "\u001b[32mscientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \u001b[0m\n", - "\u001b[32mIndustry Solutions and Security Research. 
The lab is currently managed by Alessandro Curioni.'\u001b[0m,\n",
-                            "            \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.74'\u001b[0m,\n",
-                            "            \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.74'\u001b[0m\u001b[1m]\u001b[0m,\n",
-                            "            \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n",
-                            "        \u001b[1m)\u001b[0m\n",
-                            "    \u001b[1m]\u001b[0m\n",
-                            "\u001b[1m)\u001b[0m\n"
-                        ]
-                    },
-                    "metadata": {},
-                    "output_type": "display_data"
-                }
-            ],
-            "source": [
-                "question = \"Where is the IBM lab in Zurich?\"\n",
-                "\n",
-                "# submit natural-language query on collection\n",
-                "question_query = SemanticQuery(\n",
-                "    question=question,\n",
-                "    project=PROJ_KEY,\n",
-                "    data_source=data_source,\n",
-                "\n",
-                "    ## optional params\n",
-                "    retr_k=RETR_K,\n",
-                "    # text_weight=TEXT_WEIGHT,\n",
-                "    # rerank=RERANK,\n",
-                ")\n",
-                "api_output = api.queries.run(question_query)\n",
-                "search_result = SearchResult.from_api_output(api_output, raise_on_error=RAISE)\n",
-                "\n",
-                "rich.print(search_result)"
-            ]
-        }
-    ],
-    "metadata": {
-        "kernelspec": {
-            "display_name": "Python 3 (ipykernel)",
-            "language": "python",
-            "name": "python3"
-        },
-        "language_info": {
-            "codemirror_mode": {
-                "name": "ipython",
-                "version": 3
-            },
-            "file_extension": ".py",
-            "mimetype": "text/x-python",
-            "name": "python",
-            "nbconvert_exporter": "python",
-            "pygments_lexer": "ipython3",
-            "version": "3.10.4"
-        }
-    },
-    "nbformat": 4,
-    "nbformat_minor": 5
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "99f717ef-4cba-4300-b258-0b1c248cb873",
+   "metadata": {},
+   "source": [
+    "# RAG and Semantic Retrieval on a Document Collection\n",
+    "\n",
+    "Deep Search allows users to interact with the documents using conversational AI, i.e. you interact with a virtual assistant which answers your questions using the information in the corpus.\n",
+    "\n",
+    "In this example we demonstrate how to achieve the same interaction programmatically.\n",
+    "\n",
+    "### Access required\n",
+    "\n",
+    "The content of this notebook requires access to Deep Search capabilities which are not\n",
+    "available on the public access system.\n",
+    "\n",
+    "[Contact us](https://ds4sd.github.io) if you are interested in exploring\n",
+    "these Deep Search capabilities.\n",
+    "\n",
+    "\n",
+    "### GenAI Integration required\n",
+    "\n",
+    "When interacting with the virtual assistant, Deep Search requires a connection to a Generative AI API. Currently, we support connections to [watsonx.ai](https://www.ibm.com/products/watsonx-ai) or the IBM-internal GenAI platform BAM.\n",
+    "\n",
+    "Deep Search allows custom GenAI configurations for each project.\n",
+    "In the following example you will need to work in a project which has such GenAI capabilities activated."
+ ] + }, + { + "cell_type": "markdown", + "id": "256aef50-71a1-4278-9b22-17cb99a6566e", + "metadata": {}, + "source": [ + "### Set notebook parameters\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5b244bdd-1b52-41ff-b63e-9a203570d210", + "metadata": {}, + "outputs": [], + "source": [ + "from dsnotebooks.settings import CollQANotebookSettings\n", + "\n", + "# notebooks settings auto-loaded from .env / env vars\n", + "notebook_settings = CollQANotebookSettings()\n", + "\n", + "PROFILE_NAME = notebook_settings.profile # the profile to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "INDEX_KEY = notebook_settings.sem_on_idx_key # the collection to use\n", + "\n", + "SKIP_INGESTED_DOCS = (\n", + " notebook_settings.skip_ingested_docs\n", + ") # whether to skip any already semantically ingested docs\n", + "\n", + "RETR_K = notebook_settings.retr_k # the number of search results to retrieve\n", + "TEXT_WEIGHT = (\n", + " notebook_settings.text_weight\n", + ") # the weight of lexical search (0.0: semantic-only, 1.0: lexical-only, anything in between: hybrid search)\n", + "RERANK = notebook_settings.rerank # whether to rerank the search results\n", + "RAISE = (\n", + " notebook_settings.raise_on_sem_err\n", + ") # whether semantic operation errors should raise an exception or be reflected in response fields" + ] + }, + { + "cell_type": "markdown", + "id": "a5269060-bb5f-4fe3-9b64-547202db6714", + "metadata": {}, + "source": [ + "### Import example dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5d236ea0-db1c-4171-8e11-cdd0bad69d66", + "metadata": {}, + "outputs": [], + "source": [ + "# Import standard dependenices\n", + "import pandas as pd\n", + "import rich\n", + "\n", + "# IPython utilities\n", + "from IPython.display import display, Markdown\n", + "\n", + "# Import the deepsearch-toolkit\n", + "from deepsearch.cps.client.api import CpsApi\n", + "from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource\n", + "from deepsearch.cps.queries import DataQuery, RAGQuery, SemanticQuery\n", + "from deepsearch.cps.queries.results import RAGResult, SearchResult, SearchResultItem" + ] + }, + { + "cell_type": "markdown", + "id": "293c249b-6018-46f2-b4d8-795f994d4729", + "metadata": {}, + "source": [ + "### Connect to Deep Search" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9c108432-a285-4c7b-a996-008ac3ff3d7a", + "metadata": {}, + "outputs": [], + "source": [ + "api = CpsApi.from_env(profile_name=PROFILE_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Utils" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def render_provenance_url(\n", + " api: CpsApi,\n", + " coords: ElasticProjectDataCollectionSource,\n", + " retr_item: SearchResultItem,\n", + "):\n", + " ## compute URL to the document in the Deep Search UI\n", + " item_index = int(retr_item.main_path[retr_item.main_path.rfind(\".\") + 1 :])\n", + " doc_url = api.documents.generate_url(\n", + " document_hash=retr_item.doc_hash,\n", + " data_source=coords,\n", + " item_index=item_index,\n", + " )\n", + " display(\n", + " Markdown(\n", + " f\"The provenance of the answer can be inspected on the [source document]({doc_url}).\"\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "38cde869-46d1-4833-8eb3-2381b5e5fb68", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "Prepare the collection coordinates:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "coll_coords = ElasticProjectDataCollectionSource(\n", + " proj_key=PROJ_KEY,\n", + " index_key=INDEX_KEY,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are using a small collection, so we can just list its documents to get an idea of its contents (for more details on querying, check the [Data Query Quick Start](https://github.com/DS4SD/deepsearch-examples/tree/main/examples/data_query_quick_start))." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2b38875e-f39c-4dd5-9d42-3ffca5d0bdac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished fetching all data. Total is 10 records.\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FilenameDocHash
0natural-language-processing.pdf000f892ddcc67f165797a96e94f44fb9e0697c7912a383...
1ibm-z.pdf07e56eb5a10f725fccad9386d126b7b05bec1fa71b9b3d...
2ibm.pdf234bc5cf2c860d49574b0ff7191c354b7bbc11472a0997...
3ibm-the-great-mind-challenge.pdf335120a57b418655196e3315b562a2f9e89cedeaef9318...
4turing-award.pdf8a7c91a269abc3063df9f4e19f7961ddb8e2393fa0f272...
5ibm-research.pdfb30bc667a324ae111d025526563b674a8d3fd869bc07c8...
6artificial-intelligence.pdfb60a87c1d62a59d517f2fd6f2d3ea1a96c58b651332a8b...
7machine-learning.pdfe470e7b42a92c8e5f25094362361947b9203e0074c2223...
8deep-blue-chess-computer.pdffa7ce2f66a7a5e061813d36348425f81d9e7ebc23454d8...
9red-hat.pdffb53bb607f2e9642d7fe044585d1dcdb052c57febe1b87...
\n", + "
" + ], + "text/plain": [ + " Filename \\\n", + "0 natural-language-processing.pdf \n", + "1 ibm-z.pdf \n", + "2 ibm.pdf \n", + "3 ibm-the-great-mind-challenge.pdf \n", + "4 turing-award.pdf \n", + "5 ibm-research.pdf \n", + "6 artificial-intelligence.pdf \n", + "7 machine-learning.pdf \n", + "8 deep-blue-chess-computer.pdf \n", + "9 red-hat.pdf \n", + "\n", + " DocHash \n", + "0 000f892ddcc67f165797a96e94f44fb9e0697c7912a383... \n", + "1 07e56eb5a10f725fccad9386d126b7b05bec1fa71b9b3d... \n", + "2 234bc5cf2c860d49574b0ff7191c354b7bbc11472a0997... \n", + "3 335120a57b418655196e3315b562a2f9e89cedeaef9318... \n", + "4 8a7c91a269abc3063df9f4e19f7961ddb8e2393fa0f272... \n", + "5 b30bc667a324ae111d025526563b674a8d3fd869bc07c8... \n", + "6 b60a87c1d62a59d517f2fd6f2d3ea1a96c58b651332a8b... \n", + "7 e470e7b42a92c8e5f25094362361947b9203e0074c2223... \n", + "8 fa7ce2f66a7a5e061813d36348425f81d9e7ebc23454d8... \n", + "9 fb53bb607f2e9642d7fe044585d1dcdb052c57febe1b87... " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Prepare the data query\n", + "query = DataQuery(\n", + " search_query=\"*\", # The search query to be executed\n", + " source=[ # Which fields of documents we want to fetch\n", + " \"file-info.document-hash\",\n", + " \"file-info.filename\",\n", + " # \"description.title\",\n", + " ],\n", + " coordinates=coll_coords, # The data collection to be queries\n", + ")\n", + "\n", + "# Query Deep Search for the documents matching the query\n", + "results = []\n", + "query_results = api.queries.run(query)\n", + "for row in query_results.outputs[\"data_outputs\"]:\n", + " # Add row to results table\n", + " results.append(\n", + " {\n", + " \"Filename\": row[\"_source\"][\"file-info\"][\"filename\"],\n", + " \"DocHash\": row[\"_source\"][\"file-info\"][\"document-hash\"],\n", + " # \"Title\": row[\"_source\"].get(\"description\", {}).get(\"title\"),\n", + " }\n", + " )\n", + "\n", + "print(f\"Finished fetching all data. 
Total is {len(results)} records.\")\n", + "\n", + "# Visualize the table with all results\n", + "df = pd.json_normalize(results)\n", + "display(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare source" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from deepsearch.cps.client.components.documents import (\n", + " PrivateDataCollectionSource,\n", + " PrivateDataDocumentSource,\n", + " PublicDataDocumentSource,\n", + ")\n", + "\n", + "data_source = PrivateDataCollectionSource(\n", + " source=coll_coords,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ingestion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the cell below we show how to semantically index your collection (indexing of already indexed docs is controlled via param `skip_ingested_docs`):" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/pva/work/github.com/DS4SD/deepsearch-examples/.venv/lib/python3.10/site-packages/pydantic/main.py:314: UserWarning: Pydantic serializer warnings:\n", + " Expected `list[str]` but got `_LiteralGenericAlias` - serialized value may not be as expected\n", + " return self.__pydantic_serializer__.to_python(\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ing_out': {}}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# launch the ingestion of the collection for DocumentQA\n", + "task = api.documents.semantic_ingest(\n", + " project=PROJ_KEY,\n", + " data_source=data_source,\n", + " skip_ingested_docs=SKIP_INGESTED_DOCS,\n", + ")\n", + "\n", + "# wait for the ingestion task to finish\n", + "api.tasks.wait_for(PROJ_KEY, task.task_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RAG" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ee573e76-98ea-43ce-a2ba-a81f64b3adf3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
RAGResult(\n",
+       "    answers=[\n",
+       "        RAGAnswerItem(\n",
+       "            answer='The IBM lab in Zurich is located in Rüschlikon, Switzerland.',\n",
+       "            grounding=RAGGroundingInfo(\n",
+       "                retr_items=[\n",
+       "                    SearchResultItem(\n",
+       "                        doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "                        chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, \n",
+       "ZRL) is the European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, \n",
+       "Switzerland.',\n",
+       "                        main_path='main-text.70',\n",
+       "                        path_group=['main-text.69', 'main-text.70'],\n",
+       "                        source_is_text=True\n",
+       "                    )\n",
+       "                ],\n",
+       "                gen_ctx_paths=[\n",
+       "                    'main-text.58',\n",
+       "                    'main-text.59',\n",
+       "                    'main-text.60',\n",
+       "                    'main-text.61',\n",
+       "                    'main-text.62',\n",
+       "                    'main-text.63',\n",
+       "                    'main-text.64',\n",
+       "                    'main-text.65',\n",
+       "                    'main-text.66',\n",
+       "                    'main-text.67',\n",
+       "                    'main-text.68',\n",
+       "                    'main-text.69',\n",
+       "                    'main-text.70',\n",
+       "                    'main-text.71',\n",
+       "                    'main-text.72',\n",
+       "                    'main-text.73',\n",
+       "                    'main-text.74',\n",
+       "                    'main-text.75',\n",
+       "                    'main-text.76'\n",
+       "                ]\n",
+       "            ),\n",
+       "            prompt=None\n",
+       "        )\n",
+       "    ],\n",
+       "    search_result_items=[\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
+       "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
+       "            main_path='main-text.70',\n",
+       "            path_group=['main-text.69', 'main-text.70'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
+       "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
+       "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
+       "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
+       "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
+       "solutions.',\n",
+       "            main_path='main-text.71',\n",
+       "            path_group=['main-text.69', 'main-text.71'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \n",
+       "scientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \n",
+       "Industry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.',\n",
+       "            main_path='main-text.74',\n",
+       "            path_group=['main-text.69', 'main-text.74'],\n",
+       "            source_is_text=True\n",
+       "        )\n",
+       "    ]\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33manswer\u001b[0m=\u001b[32m'The IBM lab in Zurich is located in Rüschlikon, Switzerland.'\u001b[0m,\n", + " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, \u001b[0m\n", + "\u001b[32mZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, \u001b[0m\n", + "\u001b[32mSwitzerland.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[32m'main-text.58'\u001b[0m,\n", + " \u001b[32m'main-text.59'\u001b[0m,\n", + " \u001b[32m'main-text.60'\u001b[0m,\n", + " \u001b[32m'main-text.61'\u001b[0m,\n", + " \u001b[32m'main-text.62'\u001b[0m,\n", + " \u001b[32m'main-text.63'\u001b[0m,\n", + " \u001b[32m'main-text.64'\u001b[0m,\n", + " \u001b[32m'main-text.65'\u001b[0m,\n", + " \u001b[32m'main-text.66'\u001b[0m,\n", + " \u001b[32m'main-text.67'\u001b[0m,\n", + " \u001b[32m'main-text.68'\u001b[0m,\n", + " \u001b[32m'main-text.69'\u001b[0m,\n", + " \u001b[32m'main-text.70'\u001b[0m,\n", + " \u001b[32m'main-text.71'\u001b[0m,\n", + " \u001b[32m'main-text.72'\u001b[0m,\n", + " \u001b[32m'main-text.73'\u001b[0m,\n", + " \u001b[32m'main-text.74'\u001b[0m,\n", + " \u001b[32m'main-text.75'\u001b[0m,\n", + " \u001b[32m'main-text.76'\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", + "\u001b[32mEuropean branch of IBM Research. 
It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", + "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", + "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", + "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", + "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", + "\u001b[32msolutions.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \u001b[0m\n", + "\u001b[32mscientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \u001b[0m\n", + "\u001b[32mIndustry Solutions and Security Research. 
The lab is currently managed by Alessandro Curioni.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.74'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.74'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "question = \"Where is the IBM lab in Zurich?\"\n", + "\n", + "# submit natural-language query on collection\n", + "question_query = RAGQuery(\n", + " question=question,\n", + " project=PROJ_KEY,\n", + " data_source=data_source,\n", + " ## optional retrieval params\n", + " retr_k=RETR_K,\n", + ")\n", + "api_output = api.queries.run(question_query)\n", + "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", + "\n", + "rich.print(rag_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Additionally, we can generate a provenance URL to the document in the Deep Search UI:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyYjMwYmM2NjdhMzI0YWUxMTFkMDI1NTI2NTYzYjY3NGE4ZDNmZDg2OWJjMDdjOGZkMjA0YWE5NWIwNWQ0MWYwYyU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmIzMGJjNjY3YTMyNGFlMTExZDAyNTUyNjU2M2I2NzRhOGQzZmQ4NjliYzA3YzhmZDIwNGFhOTViMDVkNDFmMGMlMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E3MCU3RCU3RA%3D%3D)." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render_provenance_url(\n", + " api=api, coords=coll_coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us try out a different question on our document corpus.\n", + "Here we also include (commented out) various additional parameters the user can optionally set:\n", + "- `retr_k`: number of items to retrieve\n", + "- `text_weight`: weight of lexical search (`0.0`: fully semantic search, `1.0`: fully lexical search, anything in-between: hybrid search)\n", + "- `rerank`: whether to rerank the retrieval results\n", + "- `gen_ctx_extr_method` (Literal[\"window\", \"page\"], optional): method for gen context extraction from document; defaults to \"window\"\n", + "- `gen_ctx_window_size` (int, optional): (relevant only if `gen_ctx_extr_method` is \"window\") max chars to use for extracted gen context (actual extraction quantized on doc item level); defaults to 5000\n", + "- `gen_ctx_window_lead_weight` (float, optional): (relevant only if `gen_ctx_extr_method` is \"window\") weight of leading text for distributing remaining window size after extracting the `main_path`; defaults to 0.5 (centered around `main_path`)\n", + "- `return_prompt` (bool, optional): whether to return the instantiated prompt; defaults to False\n", + "- `gen_timeout` (float, optional): timeout for LLM generation; defaults to None, i.e. determined by system\n", + "\n", + "For more details refer to `deepsearch.cps.queries.RAGQuery`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "89d95a17-1569-4c90-a983-8ca437b7569d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
RAGResult(\n",
+       "    answers=[\n",
+       "        RAGAnswerItem(\n",
+       "            answer='The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in \n",
+       "the field of computer gaming and artificial intelligence.',\n",
+       "            grounding=RAGGroundingInfo(\n",
+       "                retr_items=[\n",
+       "                    SearchResultItem(\n",
+       "                        doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
+       "                        chunk='History and relationships to other fields\\nThe term machine learning was coined in \n",
+       "1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \n",
+       "$^{[10][11]}$ The synonym self-teaching computers was also used in this time period. [12][13]',\n",
+       "                        main_path='main-text.6',\n",
+       "                        path_group=['main-text.5', 'main-text.6'],\n",
+       "                        source_is_text=True\n",
+       "                    )\n",
+       "                ],\n",
+       "                gen_ctx_paths=[\n",
+       "                    'main-text.1',\n",
+       "                    'main-text.2',\n",
+       "                    'main-text.3',\n",
+       "                    'main-text.4',\n",
+       "                    'main-text.5',\n",
+       "                    'main-text.6',\n",
+       "                    'main-text.7',\n",
+       "                    'main-text.8',\n",
+       "                    'main-text.9',\n",
+       "                    'main-text.10'\n",
+       "                ]\n",
+       "            ),\n",
+       "            prompt=None\n",
+       "        )\n",
+       "    ],\n",
+       "    search_result_items=[\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
+       "            chunk='History and relationships to other fields\\nThe term machine learning was coined in 1959 by \n",
+       "Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \n",
+       "$^{[10][11]}$ The synonym self-teaching computers was also used in this time period. [12][13]',\n",
+       "            main_path='main-text.6',\n",
+       "            path_group=['main-text.5', 'main-text.6'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
+       "            chunk=\"Machine learning\\nMachine learning (ML) is an umbrella term for solving problems for which \n",
+       "development of algorithms by human programmers would be cost-prohibitive, and instead the problems are solved by \n",
+       "helping machines 'discover ' their 'own ' algorithms, $^{[1]}$ without needing to be explicitly told what to do by \n",
+       "any human-developed algorithms. $^{[2]}$ Recently, generative artificial neural networks have been able to surpass \n",
+       "results of many previous approaches. $^{[3][4]}$ Machine-learning approaches have been applied to large language \n",
+       "models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to \n",
+       "develop algorithms to perform the needed tasks. [5][6]\",\n",
+       "            main_path='main-text.2',\n",
+       "            path_group=['main-text.1', 'main-text.2'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
+       "            chunk='Artificial intelligence\\nMachine learning (ML), reorganized and recognized as its own field, \n",
+       "started to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling \n",
+       "solvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from \n",
+       "AI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory. [24]',\n",
+       "            main_path='main-text.15',\n",
+       "            path_group=['main-text.10', 'main-text.15'],\n",
+       "            source_is_text=True\n",
+       "        )\n",
+       "    ]\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33manswer\u001b[0m=\u001b[32m'The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in \u001b[0m\n", + "\u001b[32mthe field of computer gaming and artificial intelligence.'\u001b[0m,\n", + " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'History and relationships to other fields\\nThe term machine learning was coined in \u001b[0m\n", + "\u001b[32m1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \u001b[0m\n", + "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ The synonym self-teaching computers was also used in this time period. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m12\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.6'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.6'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[32m'main-text.1'\u001b[0m,\n", + " \u001b[32m'main-text.2'\u001b[0m,\n", + " \u001b[32m'main-text.3'\u001b[0m,\n", + " \u001b[32m'main-text.4'\u001b[0m,\n", + " \u001b[32m'main-text.5'\u001b[0m,\n", + " \u001b[32m'main-text.6'\u001b[0m,\n", + " \u001b[32m'main-text.7'\u001b[0m,\n", + " \u001b[32m'main-text.8'\u001b[0m,\n", + " \u001b[32m'main-text.9'\u001b[0m,\n", + " \u001b[32m'main-text.10'\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'History and relationships to other fields\\nThe term machine learning was coined in 1959 by \u001b[0m\n", + "\u001b[32mArthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \u001b[0m\n", + "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ The synonym self-teaching computers was also used in this time period. 
\u001b[0m\u001b[32m[\u001b[0m\u001b[32m12\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.6'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.6'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m\"Machine\u001b[0m\u001b[32m learning\\nMachine learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mML\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is an umbrella term for solving problems for which \u001b[0m\n", + "\u001b[32mdevelopment of algorithms by human programmers would be cost-prohibitive, and instead the problems are solved by \u001b[0m\n", + "\u001b[32mhelping machines 'discover ' their 'own ' algorithms, $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ without needing to be explicitly told what to do by \u001b[0m\n", + "\u001b[32many human-developed algorithms. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Recently, generative artificial neural networks have been able to surpass \u001b[0m\n", + "\u001b[32mresults of many previous approaches. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m3\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Machine-learning approaches have been applied to large language \u001b[0m\n", + "\u001b[32mmodels, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to \u001b[0m\n", + "\u001b[32mdevelop algorithms to perform the needed tasks. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m5\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m6\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\"\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.2'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.1'\u001b[0m, \u001b[32m'main-text.2'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Artificial intelligence\\nMachine learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mML\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, reorganized and recognized as its own field, \u001b[0m\n", + "\u001b[32mstarted to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling \u001b[0m\n", + "\u001b[32msolvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from \u001b[0m\n", + "\u001b[32mAI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory. 
\u001b[0m\u001b[32m[\u001b[0m\u001b[32m24\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.15'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.10'\u001b[0m, \u001b[32m'main-text.15'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "question = \"Who came up with the term 'machine learning'?\"\n", + "\n", + "# submit natural-language query on collection\n", + "question_query = RAGQuery(\n", + " question=question,\n", + " project=PROJ_KEY,\n", + " data_source=data_source,\n", + " ## optional retrieval params\n", + " retr_k=RETR_K,\n", + " # text_weight=TEXT_WEIGHT,\n", + " # rerank=RERANK,\n", + " ## optional generation params\n", + " # model_id=\"ibm-mistralai/mixtral-8x7b-instruct-v01-q\",\n", + " # gen_params={\"random_seed\": 42, \"max_new_tokens\": 1024},\n", + " # prompt_template=\"Answer the query based on the context.\\n\\nContext: {{ context }}\\n\\nQuery: {{ query }}\",\n", + " # gen_ctx_extr_method=\"window\",\n", + " # gen_ctx_window_size=5000,\n", + " # gen_ctx_window_lead_weight=0.5\n", + " # return_prompt=True,\n", + " # gen_timeout=10.0,\n", + ")\n", + "api_output = api.queries.run(question_query)\n", + "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", + "\n", + "rich.print(rag_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As seen by the returned `doc_hash`, this answer came from a different document than the previous one." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyZTQ3MGU3YjQyYTkyYzhlNWYyNTA5NDM2MjM2MTk0N2I5MjAzZTAwNzRjMjIyMzUwNWI0OTIxOTQwZWMwNzVhMSU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmU0NzBlN2I0MmE5MmM4ZTVmMjUwOTQzNjIzNjE5NDdiOTIwM2UwMDc0YzIyMjM1MDViNDkyMTk0MGVjMDc1YTElMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E2JTdEJTdE)." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render_provenance_url(\n", + " api=api, coords=coll_coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic retrieval\n", + "\n", + "Besides RAG, which includes natural language generation, a user may only be interested in\n", + "the semantic retrieval part.\n", + "\n", + "This can be obtained very similarly to RAG, as shown below:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
SearchResult(\n",
+       "    search_result_items=[\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
+       "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
+       "            main_path='main-text.70',\n",
+       "            path_group=['main-text.69', 'main-text.70'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
+       "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
+       "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
+       "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
+       "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
+       "solutions.',\n",
+       "            main_path='main-text.71',\n",
+       "            path_group=['main-text.69', 'main-text.71'],\n",
+       "            source_is_text=True\n",
+       "        ),\n",
+       "        SearchResultItem(\n",
+       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
+       "            chunk='Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \n",
+       "scientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \n",
+       "Industry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.',\n",
+       "            main_path='main-text.74',\n",
+       "            path_group=['main-text.69', 'main-text.74'],\n",
+       "            source_is_text=True\n",
+       "        )\n",
+       "    ]\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mSearchResult\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", + "\u001b[32mEuropean branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", + "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", + "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", + "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", + "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", + "\u001b[32msolutions.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", + " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \u001b[0m\n", + "\u001b[32mscientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \u001b[0m\n", + "\u001b[32mIndustry Solutions and Security Research. 
The lab is currently managed by Alessandro Curioni.'\u001b[0m,\n", + " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.74'\u001b[0m,\n", + " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.74'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + " \u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "question = \"Where is the IBM lab in Zurich?\"\n", + "\n", + "# submit natural-language query on collection\n", + "question_query = SemanticQuery(\n", + " question=question,\n", + " project=PROJ_KEY,\n", + " data_source=data_source,\n", + " ## optional params\n", + " retr_k=RETR_K,\n", + " # text_weight=TEXT_WEIGHT,\n", + " # rerank=RERANK,\n", + ")\n", + "api_output = api.queries.run(question_query)\n", + "search_result = SearchResult.from_api_output(api_output, raise_on_error=RAISE)\n", + "\n", + "rich.print(search_result)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/qa_single_doc/single_doc_qa.ipynb b/examples/qa_single_doc/single_doc_qa.ipynb index 36f3683..90ab125 100644 --- a/examples/qa_single_doc/single_doc_qa.ipynb +++ b/examples/qa_single_doc/single_doc_qa.ipynb @@ -48,8 +48,8 @@ "# notebooks settings auto-loaded from .env / env vars\n", "notebook_settings = DocQANotebookSettings()\n", "\n", - "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "PROFILE_NAME = notebook_settings.profile # the profile to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", "\n", "# index and doc for doc QA from semantically indexed collection\n", "SEM_ON_IDX_KEY = notebook_settings.sem_on_idx_key\n", @@ -59,12 +59,18 @@ "SEM_OFF_IDX_KEY = notebook_settings.sem_off_idx_key\n", "SEM_OFF_IDX_DOC_HASH = notebook_settings.sem_off_idx_doc_hash\n", "\n", - "SKIP_INGESTED_DOCS = notebook_settings.skip_ingested_docs # whether to skip any already semantically ingested docs\n", + "SKIP_INGESTED_DOCS = (\n", + " notebook_settings.skip_ingested_docs\n", + ") # whether to skip any already semantically ingested docs\n", "\n", - "RETR_K = notebook_settings.retr_k # the number of search results to retrieve\n", - "TEXT_WEIGHT = notebook_settings.text_weight # the weight of lexical search (0.0: semantic-only, 1.0: lexical-only, anything in between: hybrid search)\n", - "RERANK = notebook_settings.rerank # whether to rerank the search results\n", - "RAISE = notebook_settings.raise_on_sem_err # whether semantic operation errors should raise an exception or be reflected in response fields" + "RETR_K = notebook_settings.retr_k # the number of search results to retrieve\n", + "TEXT_WEIGHT = (\n", + " notebook_settings.text_weight\n", + ") # the weight of lexical search (0.0: semantic-only, 1.0: lexical-only, anything in between: hybrid search)\n", + "RERANK = notebook_settings.rerank # whether to rerank the search results\n", + "RAISE = (\n", + " notebook_settings.raise_on_sem_err\n", + ") # whether 
semantic operation errors should raise an exception or be reflected in response fields" ] }, { @@ -92,7 +98,7 @@ "from deepsearch.cps.client.api import CpsApi\n", "from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource\n", "from deepsearch.cps.queries import RAGQuery, SemanticQuery\n", - "from deepsearch.cps.queries.results import RAGResult, SearchResult, SearchResultItem\n" + "from deepsearch.cps.queries.results import RAGResult, SearchResult, SearchResultItem" ] }, { @@ -110,7 +116,7 @@ "metadata": {}, "outputs": [], "source": [ - "api = CpsApi.from_env(profile_name=PROFILE_NAME)\n" + "api = CpsApi.from_env(profile_name=PROFILE_NAME)" ] }, { @@ -132,13 +138,17 @@ " retr_item: SearchResultItem,\n", "):\n", " ## compute URL to the document in the Deep Search UI\n", - " item_index = int(retr_item.main_path[retr_item.main_path.rfind(\".\")+1:])\n", + " item_index = int(retr_item.main_path[retr_item.main_path.rfind(\".\") + 1 :])\n", " doc_url = api.documents.generate_url(\n", " document_hash=retr_item.doc_hash,\n", " data_source=coords,\n", " item_index=item_index,\n", " )\n", - " display(Markdown(f\"The provenance of the answer can be inspected on the [source document]({doc_url}).\"))" + " display(\n", + " Markdown(\n", + " f\"The provenance of the answer can be inspected on the [source document]({doc_url}).\"\n", + " )\n", + " )" ] }, { @@ -377,7 +387,6 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional retrieval params\n", " retr_k=RETR_K,\n", ")\n", @@ -413,7 +422,9 @@ } ], "source": [ - "render_provenance_url(api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0])" + "render_provenance_url(\n", + " api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", + ")" ] }, { @@ -693,23 +704,19 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional retrieval params\n", " retr_k=RETR_K,\n", " # text_weight=TEXT_WEIGHT,\n", " rerank=RERANK,\n", - "\n", " ## optional generation params\n", " # model_id=\"ibm-mistralai/mixtral-8x7b-instruct-v01-q\",\n", " # gen_params={\"random_seed\": 42, \"max_new_tokens\": 1024},\n", " # prompt_template=\"Answer the query based on the context.\\n\\nContext: {{ context }}\\n\\nQuery: {{ query }}\",\n", - "\n", " # gen_ctx_extr_method=\"window\",\n", " # gen_ctx_window_size=5000,\n", " # gen_ctx_window_lead_weight=0.5\n", " # return_prompt=True,\n", " # gen_timeout=10.0,\n", - "\n", ")\n", "api_output = api.queries.run(question_query)\n", "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", @@ -736,7 +743,9 @@ } ], "source": [ - "render_provenance_url(api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0])" + "render_provenance_url(\n", + " api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", + ")" ] }, { @@ -844,7 +853,6 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional params\n", " retr_k=RETR_K,\n", " # text_weight=TEXT_WEIGHT,\n", @@ -1150,12 +1158,10 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional retrieval params\n", " retr_k=4,\n", " # text_weight=TEXT_WEIGHT,\n", " rerank=RERANK,\n", - "\n", " ## optional generation params\n", " # model_id=\"ibm-mistralai/mixtral-8x7b-instruct-v01-q\",\n", " # gen_params={\"random_seed\": 42, \"max_new_tokens\": 1024},\n", @@ -1164,7 +1170,7 @@ 
"api_output = api.queries.run(question_query)\n", "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", "\n", - "rich.print(rag_result)\n" + "rich.print(rag_result)" ] }, { @@ -1186,7 +1192,9 @@ } ], "source": [ - "render_provenance_url(api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0])" + "render_provenance_url(\n", + " api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", + ")" ] }, { @@ -1319,7 +1327,6 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional params\n", " retr_k=4,\n", " # text_weight=TEXT_WEIGHT,\n", @@ -1627,7 +1634,6 @@ " question=question,\n", " project=PROJ_KEY,\n", " data_source=data_source,\n", - "\n", " ## optional retrieval params\n", " retr_k=RETR_K,\n", ")\n",