From 51579b68c219a300ae3e634b3032f0655a4f1a0a Mon Sep 17 00:00:00 2001 From: Ben Lasscock Date: Tue, 9 May 2023 13:54:24 -0500 Subject: [PATCH] Redis: propagate redis query syntax though "filter" (#209) * fix: remove the * wildcard from the TAG escape characters. Add notebook to demo behaviour of the filter feature. * fix: remove print * fix: remove parentheses * fix: remove whitespace --------- Co-authored-by: Ben Lasscock --- datastore/providers/redis_datastore.py | 7 +- .../redis/semantic-search-and-filter.ipynb | 511 ++++++++++++++++++ 2 files changed, 515 insertions(+), 3 deletions(-) create mode 100644 examples/providers/redis/semantic-search-and-filter.ipynb diff --git a/datastore/providers/redis_datastore.py b/datastore/providers/redis_datastore.py index 5e28e2f25..669f3fb83 100644 --- a/datastore/providers/redis_datastore.py +++ b/datastore/providers/redis_datastore.py @@ -44,7 +44,8 @@ {"name": "search", "ver": 20600}, {"name": "ReJSON", "ver": 20404} ] -REDIS_DEFAULT_ESCAPED_CHARS = re.compile(r"[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ ]") + +REDIS_DEFAULT_ESCAPED_CHARS = re.compile(r"[,.<>{}\[\]\\\"\':;!@#$%^&()\-+=~\/ ]") # Helper functions def unpack_schema(d: dict): @@ -59,7 +60,7 @@ async def _check_redis_module_exist(client: redis.Redis, modules: List[dict]): installed_modules = {module["name"]: module for module in installed_modules} for module in modules: if module["name"] not in installed_modules or int(installed_modules[module["name"]]["ver"]) < int(module["ver"]): - error_message = "You must add the RediSearch (>= 2.6) and ReJSON (>= 2.4) modules from Redis Stack. " \ + error_message = "You must add the RediSearch (>= 2.6) and ReJSON (>= 2.4) modules from Redis Stack. " \ "Please refer to Redis Stack docs: https://redis.io/docs/stack/" logging.error(error_message) raise AttributeError(error_message) @@ -207,7 +208,7 @@ def _typ_to_str(typ, field, value) -> str: # type: ignore if isinstance(typ, TagField): return f"@{field}:{{{self._escape(value)}}} " elif isinstance(typ, TextField): - return f"@{field}:{self._escape(value)} " + return f"@{field}:{value} " elif isinstance(typ, NumericField): num = to_unix_timestamp(value) match field: diff --git a/examples/providers/redis/semantic-search-and-filter.ipynb b/examples/providers/redis/semantic-search-and-filter.ipynb new file mode 100644 index 000000000..1d7b63ade --- /dev/null +++ b/examples/providers/redis/semantic-search-and-filter.ipynb @@ -0,0 +1,511 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "import requests" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Document retrieval: upsert and query basic usage\n", + "\n", + "In this walkthrough we will see how to use the retrieval API with a Redis datastore for *semantic search / question-answering*. We will also provide a basic demo showing how to use the \"filter\" function.\n", + "\n", + "Before running this notebook you should have already initialized the retrieval API and have it running locally or elsewhere. The full instructions for doing this are found in on the chatgpt-retrieval-plugin page [page](https://github.com/openai/chatgpt-retrieval-plugin#quickstart). Please follow the instructions to start the app with the redis datastore.\n", + "\n", + "Additional examples using the search features can be found [here](https://github.com/openai/chatgpt-retrieval-plugin/blob/main/examples/providers/pinecone/semantic-search.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Document\n", + "\n", + "First we will prepare a collection of documents. From the perspective of the retrieval plugin, a [document](https://github.com/openai/chatgpt-retrieval-plugin/blob/main/models/models.py) this consists\n", + "of an \"id\", \"text\" and a collection of \"metadata\".\n", + "\n", + "The \"metadata\" has \"source\", \"source_id\", \"created_at\", \"url\" and \"author\" fields. Query metadata does not expose the \"url\" field.\n", + "\n", + "The \"source\" field is an Enum and can only be one of (\"file\", \"email\" or \"chat\").\n", + "\n", + "Text is taken from company SEC 10-K filings which are in the public domain.\n", + "\n", + "For demonstration, we will insert some **fake** authors for the documents, see the respective links for the original sources. " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "document_1 = {\n", + " \"id\": \"twtr\",\n", + " \"text\": \"\"\"Postponements, suspensions or cancellations of major events, such as sporting events\n", + " and music festivals, may lead to people perceiving the content on Twitter as less\n", + " relevant or useful or of lower quality, which could negatively affect mDAU growth,\n", + " or may reduce monetization opportunities in connection with such events.\"\"\",\n", + " \"metadata\" : {\n", + " \"source\" : \"file\",\n", + " \"source_id\" : \"test:twtr10k\",\n", + " \"created_at\": \"2020-12-31\",\n", + " \"url\": \"https://www.sec.gov/Archives/edgar/data/1418091/000141809121000031/twtr-20201231.htm\",\n", + " \"author\": 'Elvis Tusk Sr.' \n", + " }\n", + "}\n", + "\n", + "document_2 = {\n", + " \"id\": \"tsla\",\n", + " \"text\": \"\"\"Because we do not have independent dealer networks, we are responsible for delivering\n", + " all of our vehicles to our customers.\"\"\",\n", + " \"metadata\" : {\n", + " \"source\" : \"file\",\n", + " \"source_id\" : \"test:tesla10k\",\n", + " \"created_at\": \"2021-12-31\",\n", + " \"url\": \"https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/tsla-20211231.htm\",\n", + " \"author\": 'Elvis Tusk Jr.' \n", + " } \n", + "}\n", + "\n", + "document_3 = {\n", + " \"id\": \"xom\",\n", + " \"text\": \"\"\"All practical and economically-viable energy sources will need to be pursued to continue\n", + " meeting global energy demand, recognizing the scale and variety of worldwide energy needs\n", + " as well as the importance of expanding access to modern energy to promote better standards\n", + " of living for billions of people.\"\"\",\n", + " \"metadata\" : {\n", + " \"source\" : \"file\",\n", + " \"source_id\" : \"test:xom10k\",\n", + " \"created_at\": \"2020-12-31\",\n", + " \"url\": \"https://www.sec.gov/Archives/edgar/data/34088/000003408821000012/xom-20201231.htm\",\n", + " \"author\": 'Vape Jordan' \n", + " } \n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexing the Docs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We're now ready to begin indexing (or *upserting*) our `documents`. To make these requests to the retrieval app API, we will need to provide authorization in the form of the `BEARER_TOKEN` we set earlier. We do this below:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "BEARER_TOKEN = os.environ.get(\"BEARER_TOKEN\") or \"BEARER_TOKEN_HERE\"\n", + "endpoint_url = 'http://0.0.0.0:8000'\n", + "headers = {\n", + " \"Authorization\": f\"Bearer {BEARER_TOKEN}\"\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the `BEARER_TOKEN` to create our authorization `headers`:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "response = requests.post(\n", + " f\"{endpoint_url}/upsert\",\n", + " headers=headers,\n", + " json={\n", + " \"documents\": [document_1, document_2, document_3]\n", + " }\n", + ")\n", + "response.raise_for_status()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example filter syntax\n", + "In our example data we have tagged each companies 10k documents as a source: test:twtr10k, test:tsla10k, and test:xom10k.\n", + "And we have created **fake** authors of the documents, Elvis Tusk Jr., Elvis Tusk Sr. and Vape Jordan. We will then filter based on these fields." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### TAG Fields\n", + "\n", + "source and source_id are \"TAG\" fields, Redis supports a limited [query syntax](https://redis.io/docs/stack/search/reference/tags/) on TAGS, which includes and \"or\" syntax, i.e. \"test:twtr10k|test:tesla10k\" or a ```*``` wildcard to match a prefix.\n", + "\n", + "In this example we have only two documents that match the filter so only two documents will show.\n", + "\n", + "Gotcha: There cannot be a space between the bar \"|\", i.e. \"test:twtr10k|test:tesla10k\" is valid, \"test:twtr10k | test:tesla10k\" is not." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'results': [{'query': 'How does Tesla deliver cars?',\n", + " 'results': [{'id': 'tsla',\n", + " 'text': 'Because we do not have independent dealer networks, we are responsible for delivering all of our vehicles to our customers.',\n", + " 'metadata': {'source': 'file',\n", + " 'source_id': 'test:tesla10k',\n", + " 'url': 'https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/tsla-20211231.htm',\n", + " 'created_at': '1640908800',\n", + " 'author': 'Elvis Tusk Jr.',\n", + " 'document_id': 'tsla'},\n", + " 'embedding': None,\n", + " 'score': 0.185401830213},\n", + " {'id': 'twtr',\n", + " 'text': 'Postponements, suspensions or cancellations of major events, such as sporting events and music festivals, may lead to people perceiving the content on Twitter as less relevant or useful or of lower quality, which could negatively affect mDAU growth, or may reduce monetization opportunities in connection with such events.',\n", + " 'metadata': {'source': 'file',\n", + " 'source_id': 'test:twtr10k',\n", + " 'url': 'https://www.sec.gov/Archives/edgar/data/1418091/000141809121000031/twtr-20201231.htm',\n", + " 'created_at': '1609372800',\n", + " 'author': 'Elvis Tusk Sr.',\n", + " 'document_id': 'twtr'},\n", + " 'embedding': None,\n", + " 'score': 0.300053447242}]}]}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = {\n", + " \"query\": \"How does Tesla deliver cars?\",\n", + " \"filter\": {\"source_id\": \"test:twtr10k|test:tesla10k\"},\n", + " \"top_k\": 3\n", + "}\n", + "\n", + "response = requests.post(\n", + " f\"{endpoint_url}/query\",\n", + " headers=headers,\n", + " json={\n", + " \"queries\": [query]\n", + " }\n", + ")\n", + "response.raise_for_status()\n", + "\n", + "response.json()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we use a wild card to filter by prefix. There are three documents matching this filter so three results will be printed.\n", + "\n", + "Gotcha, only prefix filtering is supported for redis TAGS, i.e. \"test*\" is valid, where as \"te\\*t\\*\" is not." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'results': [{'query': 'I want information related to car dealerships.',\n", + " 'results': [{'id': 'tsla',\n", + " 'text': 'Because we do not have independent dealer networks, we are responsible for delivering all of our vehicles to our customers.',\n", + " 'metadata': {'source': 'file',\n", + " 'source_id': 'test:tesla10k',\n", + " 'url': 'https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/tsla-20211231.htm',\n", + " 'created_at': '1640908800',\n", + " 'author': 'Elvis Tusk Jr.',\n", + " 'document_id': 'tsla'},\n", + " 'embedding': None,\n", + " 'score': 0.204279193893},\n", + " {'id': 'twtr',\n", + " 'text': 'Postponements, suspensions or cancellations of major events, such as sporting events and music festivals, may lead to people perceiving the content on Twitter as less relevant or useful or of lower quality, which could negatively affect mDAU growth, or may reduce monetization opportunities in connection with such events.',\n", + " 'metadata': {'source': 'file',\n", + " 'source_id': 'test:twtr10k',\n", + " 'url': 'https://www.sec.gov/Archives/edgar/data/1418091/000141809121000031/twtr-20201231.htm',\n", + " 'created_at': '1609372800',\n", + " 'author': 'Elvis Tusk Sr.',\n", + " 'document_id': 'twtr'},\n", + " 'embedding': None,\n", + " 'score': 0.292188997496},\n", + " {'id': 'xom',\n", + " 'text': 'All practical and economically-viable energy sources will need to be pursued to continue meeting global energy demand, recognizing the scale and variety of worldwide energy needs as well as the importance of expanding access to modern energy to promote better standards of living for billions of people.',\n", + " 'metadata': {'source': 'file',\n", + " 'source_id': 'test:xom10k',\n", + " 'url': 'https://www.sec.gov/Archives/edgar/data/34088/000003408821000012/xom-20201231.htm',\n", + " 'created_at': '1609372800',\n", + " 'author': 'Vape Jordan',\n", + " 'document_id': 'xom'},\n", + " 'embedding': None,\n", + " 'score': 0.305264299269}]}]}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = {\n", + " \"query\": \"I want information related to car dealerships.\",\n", + " \"filter\": {\"source_id\": \"test:*\"},\n", + " \"top_k\": 3\n", + "}\n", + "\n", + "response = requests.post(\n", + " f\"{endpoint_url}/query\",\n", + " headers=headers,\n", + " json={\n", + " \"queries\": [query]\n", + " }\n", + ")\n", + "response.raise_for_status()\n", + "\n", + "response.json()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last example we filter by the \"author\" field. The author field is a TextField, and so we have more options for filtering, \n", + "see [here](https://redis.io/docs/stack/search/reference/query_syntax/) for a complete set of examples.\n", + "\n", + "We can select by a specific author, here we only expect to return a single result." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'results': [{'query': 'I want information related to car dealerships.',\n", + " 'results': [{'id': 'xom',\n", + " 'text': 'All practical and economically-viable energy sources will need to be pursued to continue meeting global energy demand, recognizing the scale and variety of worldwide energy needs as well as the importance of expanding access to modern energy to promote better standards of living for billions of people.',\n", + " 'metadata': {'source': 'file',\n", + " 'source_id': 'test:xom10k',\n", + " 'url': 'https://www.sec.gov/Archives/edgar/data/34088/000003408821000012/xom-20201231.htm',\n", + " 'created_at': '1609372800',\n", + " 'author': 'Vape Jordan',\n", + " 'document_id': 'xom'},\n", + " 'embedding': None,\n", + " 'score': 0.305264299269}]}]}" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = {\n", + " \"query\": \"I want information related to car dealerships.\",\n", + " \"filter\": {\"source_id\": \"test:*\", \"author\": \"Vape Jordan\"},\n", + " \"top_k\": 3\n", + "}\n", + "\n", + "response = requests.post(\n", + " f\"{endpoint_url}/query\",\n", + " headers=headers,\n", + " json={\n", + " \"queries\": [query]\n", + " }\n", + ")\n", + "response.raise_for_status()\n", + "\n", + "response.json()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we use the negation \"-\" to select all documents, except those published by an author called Elvis" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'results': [{'query': 'I want information related to car dealerships.',\n", + " 'results': [{'id': 'xom',\n", + " 'text': 'All practical and economically-viable energy sources will need to be pursued to continue meeting global energy demand, recognizing the scale and variety of worldwide energy needs as well as the importance of expanding access to modern energy to promote better standards of living for billions of people.',\n", + " 'metadata': {'source': 'file',\n", + " 'source_id': 'test:xom10k',\n", + " 'url': 'https://www.sec.gov/Archives/edgar/data/34088/000003408821000012/xom-20201231.htm',\n", + " 'created_at': '1609372800',\n", + " 'author': 'Vape Jordan',\n", + " 'document_id': 'xom'},\n", + " 'embedding': None,\n", + " 'score': 0.305264299269}]}]}" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = {\n", + " \"query\": \"I want information related to car dealerships.\",\n", + " \"filter\": {\"source_id\": \"test:*\", \"author\": \"-Elvis\"},\n", + " \"top_k\": 3\n", + "}\n", + "\n", + "response = requests.post(\n", + " f\"{endpoint_url}/query\",\n", + " headers=headers,\n", + " json={\n", + " \"queries\": [query]\n", + " }\n", + ")\n", + "response.raise_for_status()\n", + "\n", + "response.json()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Last example we filter two of the authors:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'results': [{'query': 'I want information related to car dealerships.',\n", + " 'results': [{'id': 'tsla',\n", + " 'text': 'Because we do not have independent dealer networks, we are responsible for delivering all of our vehicles to our customers.',\n", + " 'metadata': {'source': 'file',\n", + " 'source_id': 'test:tesla10k',\n", + " 'url': 'https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/tsla-20211231.htm',\n", + " 'created_at': '1640908800',\n", + " 'author': 'Elvis Tusk Jr.',\n", + " 'document_id': 'tsla'},\n", + " 'embedding': None,\n", + " 'score': 0.204279193893},\n", + " {'id': 'xom',\n", + " 'text': 'All practical and economically-viable energy sources will need to be pursued to continue meeting global energy demand, recognizing the scale and variety of worldwide energy needs as well as the importance of expanding access to modern energy to promote better standards of living for billions of people.',\n", + " 'metadata': {'source': 'file',\n", + " 'source_id': 'test:xom10k',\n", + " 'url': 'https://www.sec.gov/Archives/edgar/data/34088/000003408821000012/xom-20201231.htm',\n", + " 'created_at': '1609372800',\n", + " 'author': 'Vape Jordan',\n", + " 'document_id': 'xom'},\n", + " 'embedding': None,\n", + " 'score': 0.305264299269}]}]}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = {\n", + " \"query\": \"I want information related to car dealerships.\",\n", + " \"filter\": {\"source_id\": \"test:*\", \"author\": \"Elvis*Jr.|Vape\"},\n", + " \"top_k\": 3\n", + "}\n", + "\n", + "response = requests.post(\n", + " f\"{endpoint_url}/query\",\n", + " headers=headers,\n", + " json={\n", + " \"queries\": [query]\n", + " }\n", + ")\n", + "response.raise_for_status()\n", + "\n", + "response.json()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + }, + "vscode": { + "interpreter": { + "hash": "1979a773a5778de9a5fa593a629dff0ab3c80c2563810d3e6a8dfb123dc01c7d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}