diff --git a/README.md b/README.md
index 56653b064..49742a0b2 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,7 @@ This README provides detailed information on how to set up, develop, and deploy
   - [Llama Index](#llamaindex)
   - [Chroma](#chroma)
   - [Azure Cognitive Search](#azure-cognitive-search)
+  - [Azure CosmosDB Mongo vCore](#azure-cosmosdb-mongo-vcore)
   - [Supabase](#supabase)
   - [Postgres](#postgres)
   - [AnalyticDB](#analyticdb)
@@ -154,6 +155,12 @@ Follow these steps to quickly set up and run the ChatGPT Retrieval Plugin:
   export AZURESEARCH_SERVICE=
   export AZURESEARCH_INDEX=
   export AZURESEARCH_API_KEY= (optional, uses key-free managed identity if not set)
+
+  # Azure CosmosDB Mongo vCore
+  export AZCOSMOS_API=
+  export AZCOSMOS_CONNSTR=
+  export AZCOSMOS_DATABASE_NAME=
+  export AZCOSMOS_CONTAINER_NAME=

   # Supabase
   export SUPABASE_URL=
@@ -351,6 +358,9 @@ For detailed setup instructions, refer to [`/docs/providers/llama/setup.md`](/do
 [Azure Cognitive Search](https://azure.microsoft.com/products/search/) is a complete retrieval cloud service that supports vector search, text search, and hybrid (vectors + text combined to yield the best of the two approaches). It also offers an [optional L2 re-ranking step](https://learn.microsoft.com/azure/search/semantic-search-overview) to further improve results quality. For detailed setup instructions, refer to [`/docs/providers/azuresearch/setup.md`](/docs/providers/azuresearch/setup.md)

+#### Azure CosmosDB Mongo vCore
+[Azure CosmosDB Mongo vCore](https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/) supports vector search on embeddings, and can be used to seamlessly integrate your AI-based applications with your data stored in Azure Cosmos DB. For detailed setup instructions, refer to [`/docs/providers/azurecosmosdb/setup.md`](/docs/providers/azurecosmosdb/setup.md)
+
 #### Supabase

 [Supabase](https://supabase.com/blog/openai-embeddings-postgres-vector) offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvector/pgvector) extension for Postgres Database. [You can use Supabase CLI](https://github.com/supabase/cli) to set up a whole Supabase stack locally or in the cloud or you can also use docker-compose, k8s and other options available. For a hosted/managed solution, try [Supabase.com](https://supabase.com/) and unlock the full power of Postgres with built-in authentication, storage, auto APIs, and Realtime features. For detailed setup instructions, refer to [`/docs/providers/supabase/setup.md`](/docs/providers/supabase/setup.md).
diff --git a/datastore/factory.py b/datastore/factory.py
index dd4e9b538..41abb60eb 100644
--- a/datastore/factory.py
+++ b/datastore/factory.py
@@ -36,6 +36,10 @@ async def get_datastore() -> DataStore:
             from datastore.providers.redis_datastore import RedisDataStore

             return await RedisDataStore.init()
+        case "azurecosmosdb":
+            from datastore.providers.azurecosmosdb_datastore import AzureCosmosDBDataStore
+
+            return await AzureCosmosDBDataStore.create()
         case "qdrant":
             from datastore.providers.qdrant_datastore import QdrantDataStore
diff --git a/datastore/providers/azurecosmosdb_datastore.py b/datastore/providers/azurecosmosdb_datastore.py
new file mode 100644
index 000000000..f9d3507d8
--- /dev/null
+++ b/datastore/providers/azurecosmosdb_datastore.py
@@ -0,0 +1,277 @@
+import logging
+import os
+
+from pymongo.mongo_client import MongoClient
+from abc import ABC, abstractmethod
+
+from typing import Any, Dict, List, Optional
+from datetime import datetime
+from datastore.datastore import DataStore
+from models.models import (
+    DocumentChunk,
+    DocumentMetadataFilter,
+    DocumentChunkWithScore,
+    QueryResult,
+    QueryWithEmbedding,
+)
+
+
+# Read environment variables for CosmosDB Mongo vCore
+AZCOSMOS_API = os.environ.get("AZCOSMOS_API", "mongo-vcore")
+AZCOSMOS_CONNSTR = os.environ.get("AZCOSMOS_CONNSTR")
+AZCOSMOS_DATABASE_NAME = os.environ.get("AZCOSMOS_DATABASE_NAME")
+AZCOSMOS_CONTAINER_NAME = os.environ.get("AZCOSMOS_CONTAINER_NAME")
+assert AZCOSMOS_API is not None
+assert AZCOSMOS_CONNSTR is not None
+assert AZCOSMOS_DATABASE_NAME is not None
+assert AZCOSMOS_CONTAINER_NAME is not None
+
+# OpenAI Ada Embeddings Dimension
+VECTOR_DIMENSION = 1536
+
+
+# Abstract class similar to the original data store that allows API level abstraction
+class AzureCosmosDBStoreApi(ABC):
+    @abstractmethod
+    async def ensure(self, num_lists, similarity):
+        raise NotImplementedError
+
+    @abstractmethod
+    async def upsert_core(self, docId: str, chunks: List[DocumentChunk]) -> List[str]:
+        raise NotImplementedError
+
+    @abstractmethod
+    async def query_core(self, query: QueryWithEmbedding) -> List[DocumentChunkWithScore]:
+        raise NotImplementedError
+
+    @abstractmethod
+    async def drop_container(self):
+        raise NotImplementedError
+
+    @abstractmethod
+    async def delete_filter(self, filter: DocumentMetadataFilter):
+        raise NotImplementedError
+
+    @abstractmethod
+    async def delete_ids(self, ids: List[str]):
+        raise NotImplementedError
+
+    @abstractmethod
+    async def delete_document_ids(self, documentIds: List[str]):
+        raise NotImplementedError
+
+
+class MongoStoreApi(AzureCosmosDBStoreApi):
+    def __init__(self, mongoClient: MongoClient):
+        self.mongoClient = mongoClient
+
+    @staticmethod
+    def _get_metadata_filter(filter: DocumentMetadataFilter) -> dict:
+        returnedFilter: dict = {}
+        if filter.document_id is not None:
+            returnedFilter["document_id"] = filter.document_id
+        if filter.author is not None:
+            returnedFilter["metadata.author"] = filter.author
+        # Merge both date bounds into a single range filter so that setting
+        # end_date does not overwrite a previously set start_date.
+        createdAtFilter: dict = {}
+        if filter.start_date is not None:
+            createdAtFilter["$gt"] = datetime.fromisoformat(filter.start_date)
+        if filter.end_date is not None:
+            createdAtFilter["$lt"] = datetime.fromisoformat(filter.end_date)
+        if createdAtFilter:
+            returnedFilter["metadata.created_at"] = createdAtFilter
+        if filter.source is not None:
+            returnedFilter["metadata.source"] = filter.source
+        if filter.source_id is not None:
+            returnedFilter["metadata.source_id"] = filter.source_id
+        return returnedFilter
+
+    async def ensure(self, num_lists, similarity):
+        assert self.mongoClient.is_mongos
+        self.collection = self.mongoClient[AZCOSMOS_DATABASE_NAME][AZCOSMOS_CONTAINER_NAME]
+
+        indexes = self.collection.index_information()
+        if indexes.get("embedding_cosmosSearch") is None:
+            # Ensure the vector index exists.
+            indexDefs: List[Any] = [
+                {
+                    "name": "embedding_cosmosSearch",
+                    "key": {"embedding": "cosmosSearch"},
+                    "cosmosSearchOptions": {
+                        "kind": "vector-ivf",
+                        "numLists": num_lists,
+                        "similarity": similarity,
+                        "dimensions": VECTOR_DIMENSION,
+                    },
+                }
+            ]
+            self.mongoClient[AZCOSMOS_DATABASE_NAME].command(
+                "createIndexes", AZCOSMOS_CONTAINER_NAME, indexes=indexDefs
+            )
+
+    async def upsert_core(self, docId: str, chunks: List[DocumentChunk]) -> List[str]:
+        # Until nested doc embedding support is done, treat each chunk as a separate doc.
+        doc_ids: List[str] = []
+        for chunk in chunks:
+            finalDocChunk: dict = {
+                "_id": f"doc:{docId}:chunk:{chunk.id}",
+                "document_id": docId,
+                "embedding": chunk.embedding,
+                "text": chunk.text,
+                "metadata": chunk.metadata.__dict__
+            }
+
+            if chunk.metadata.created_at is not None:
+                finalDocChunk["metadata"]["created_at"] = datetime.fromisoformat(chunk.metadata.created_at)
+            # Replace with upsert=True so that re-upserting an existing chunk id
+            # updates the document instead of raising a duplicate-key error.
+            self.collection.replace_one({"_id": finalDocChunk["_id"]}, finalDocChunk, upsert=True)
+            doc_ids.append(finalDocChunk["_id"])
+        return doc_ids
+
+    async def query_core(self, query: QueryWithEmbedding) -> List[DocumentChunkWithScore]:
+        pipeline = [
+            {
+                "$search": {
+                    "cosmosSearch": {
+                        "vector": query.embedding,
+                        "path": "embedding",
+                        "k": query.top_k,
+                    },
+                    "returnStoredSource": True,
+                }
+            },
+            {
+                "$project": {
+                    "similarityScore": {"$meta": "searchScore"},
+                    "document": "$$ROOT",
+                }
+            },
+        ]
+
+        # TODO: Add in match filter (once it can be satisfied).
+        # Perform vector search
+        query_results: List[DocumentChunkWithScore] = []
+        for aggResult in self.collection.aggregate(pipeline):
+            finalMetadata = aggResult["document"]["metadata"]
+            if finalMetadata.get("created_at") is not None:
+                finalMetadata["created_at"] = datetime.isoformat(finalMetadata["created_at"])
+            result = DocumentChunkWithScore(
+                id=aggResult["_id"],
+                score=aggResult["similarityScore"],
+                text=aggResult["document"]["text"],
+                metadata=finalMetadata
+            )
+            query_results.append(result)
+        return query_results
+
+    async def drop_container(self):
+        self.collection.drop()
+
+    async def delete_filter(self, filter: DocumentMetadataFilter):
+        delete_filter = self._get_metadata_filter(filter)
+        self.collection.delete_many(delete_filter)
+
+    async def delete_ids(self, ids: List[str]):
+        self.collection.delete_many({"_id": {"$in": ids}})
+
+    async def delete_document_ids(self, documentIds: List[str]):
+        self.collection.delete_many({"document_id": {"$in": documentIds}})
+
+
+# Datastore implementation.
+class AzureCosmosDBDataStore(DataStore):
+    """
+    A memory store backed by Azure CosmosDB; currently only Mongo vCore is supported.
+    """
+
+    def __init__(self, cosmosStore: AzureCosmosDBStoreApi):
+        self.cosmosStore = cosmosStore
+
+    @staticmethod
+    async def create(num_lists: int = 100, similarity: str = "COS") -> DataStore:
+        """
+        Creates a new datastore based on the Cosmos API provided in the environment variables;
+        only Mongo vCore is supported for now.
+
+        Args:
+            num_lists (int) : The number of clusters that the inverted file (IVF) index
+                              uses to group the vector data. We recommend that num_lists is set to
+                              documentCount/1000 for up to 1 million documents and to sqrt(documentCount)
+                              for more than 1 million documents. Using a num_lists value of 1 is akin to
+                              performing brute-force search, which has limited performance. Defaults to 100
+                              so that the factory can call create() without arguments; tune it to your
+                              document count as described above.
+            similarity (str) : Similarity metric to use with the IVF index. Possible options are COS (cosine distance),
+                               L2 (Euclidean distance), and IP (inner product).
+        """
+        # Create underlying data store based on the API definition.
+        # Right now this only supports Mongo, but set up to support more.
+        apiStore: Optional[AzureCosmosDBStoreApi] = None
+        if AZCOSMOS_API == "mongo-vcore":
+            mongoClient = MongoClient(AZCOSMOS_CONNSTR)
+            apiStore = MongoStoreApi(mongoClient)
+        else:
+            raise NotImplementedError(f"Azure CosmosDB API '{AZCOSMOS_API}' is not supported yet.")
+
+        await apiStore.ensure(num_lists, similarity)
+        store = AzureCosmosDBDataStore(apiStore)
+        return store
+
+    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
+        """
+        Takes in a dict mapping document ids to their lists of chunks and inserts them into the database.
+        Returns a list of the stored chunk ids.
+        """
+        # Initialize a list of ids to return
+        doc_ids: List[str] = []
+        for doc_id, chunk_list in chunks.items():
+            returnedIds = await self.cosmosStore.upsert_core(doc_id, chunk_list)
+            for returnedId in returnedIds:
+                doc_ids.append(returnedId)
+        return doc_ids
+
+    async def _query(
+        self,
+        queries: List[QueryWithEmbedding],
+    ) -> List[QueryResult]:
+        """
+        Takes in a list of queries with embeddings and filters and
+        returns a list of query results with matching document chunks and scores.
+        """
+        # Prepare query responses and results object
+        results: List[QueryResult] = []
+
+        # Gather query results in a pipeline
+        logging.info(f"Gathering {len(queries)} query results")
+        for query in queries:
+            logging.info(f"Query: {query.query}")
+            query_results = await self.cosmosStore.query_core(query)
+
+            # Add to overall results
+            results.append(QueryResult(query=query.query, results=query_results))
+        return results
+
+    async def delete(
+        self,
+        ids: Optional[List[str]] = None,
+        filter: Optional[DocumentMetadataFilter] = None,
+        delete_all: Optional[bool] = None,
+    ) -> bool:
+        """
+        Removes vectors by ids, filter, or everything in the datastore.
+        Returns whether the operation was successful.
+        """
+        if delete_all:
+            # fast path - truncate/delete all items.
+            await self.cosmosStore.drop_container()
+            return True
+
+        if filter:
+            if filter.document_id is not None:
+                await self.cosmosStore.delete_document_ids([filter.document_id])
+            else:
+                await self.cosmosStore.delete_filter(filter)
+
+        if ids:
+            await self.cosmosStore.delete_ids(ids)
+
+        return True
diff --git a/docs/providers/azurecosmosdb/setup.md b/docs/providers/azurecosmosdb/setup.md
new file mode 100644
index 000000000..33d8caa22
--- /dev/null
+++ b/docs/providers/azurecosmosdb/setup.md
@@ -0,0 +1,20 @@
+# Azure Cosmos DB
+
+[Azure Cosmos DB](https://azure.microsoft.com/en-us/products/cosmos-db/) is a fully managed NoSQL and relational database for modern app development. Using Azure Cosmos DB for MongoDB vCore, you can store vector embeddings in your documents and perform [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) on a fully managed MongoDB-compatible database service.
+
+Learn more about Azure Cosmos DB for MongoDB vCore [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/). If you don't have an Azure account, you can start setting one up [here](https://azure.microsoft.com/).
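+
+For example, you can export the variables documented in the table below before starting the app. The values here are placeholders; substitute your own:
+
+```bash
+export DATASTORE=azurecosmosdb
+export BEARER_TOKEN=<your_bearer_token>
+export OPENAI_API_KEY=<your_openai_api_key>
+export AZCOSMOS_API=mongo-vcore
+export AZCOSMOS_CONNSTR=<your_connection_string>
+export AZCOSMOS_DATABASE_NAME=<your_database_name>
+export AZCOSMOS_CONTAINER_NAME=<your_container_name>
+```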
+
+## Environment variables
+
+| Name                      | Required | Description                                                                      | Default |
+| ------------------------- | -------- | -------------------------------------------------------------------------------- | ------- |
+| `DATASTORE`               | Yes      | Datastore name, set to `azurecosmosdb`                                           |         |
+| `BEARER_TOKEN`            | Yes      | Secret token                                                                     |         |
+| `OPENAI_API_KEY`          | Yes      | OpenAI API key                                                                   |         |
+| `AZCOSMOS_API`            | Yes      | Name of the API you're connecting to. Currently only `mongo-vcore` is supported |         |
+| `AZCOSMOS_CONNSTR`        | Yes      | The connection string to your account                                            |         |
+| `AZCOSMOS_DATABASE_NAME`  | Yes      | The database where the data is stored/queried                                    |         |
+| `AZCOSMOS_CONTAINER_NAME` | Yes      | The container where the data is stored/queried                                   |         |
+
+## Indexing
+On the first insert, the datastore will create the collection and, if necessary, a vector index on the `embedding` field. Hybrid search is not yet supported.
diff --git a/examples/providers/azurecosmosdb/semantic-search.ipynb b/examples/providers/azurecosmosdb/semantic-search.ipynb
new file mode 100644
index 000000000..226a96fdd
--- /dev/null
+++ b/examples/providers/azurecosmosdb/semantic-search.ipynb
@@ -0,0 +1,228 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "de02cdc9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import requests"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7e5d60e1",
+   "metadata": {},
+   "source": [
+    "# Document retrieval: upsert and basic query usage\n",
+    "\n",
+    "In this walkthrough we will go over the Retrieval API with an Azure CosmosDB Mongo vCore datastore for semantic search.\n",
+    "\n",
+    "Before running the notebook, please initialize the retrieval API and have it running locally. Follow the instructions to start the Retrieval API provided [here](https://github.com/openai/chatgpt-retrieval-plugin#quickstart). \n",
+    "\n",
+    "[Azure Cosmos DB](https://azure.microsoft.com/en-us/products/cosmos-db/) is a fully managed NoSQL and relational database for modern app development. Using Azure Cosmos DB for MongoDB vCore, you can store vector embeddings in your documents and perform [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) on a fully managed MongoDB-compatible database service.\n",
+    "\n",
+    "Learn more about Azure Cosmos DB for MongoDB vCore [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/). If you don't have an Azure account, you can start setting one up [here](https://azure.microsoft.com/)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "80988348",
+   "metadata": {},
+   "source": [
+    "## Documents\n",
+    "\n",
+    "First we will create a list of documents. From the perspective of the retrieval plugin, a [document](https://github.com/openai/chatgpt-retrieval-plugin/blob/main/models/models.py) consists of an \"id\", \"text\", an optional \"embedding\" and a collection of \"metadata\". The \"metadata\" has \"source\", \"source_id\", \"created_at\", \"url\" and \"author\" fields. Query metadata does not expose the \"url\" field.\n",
+    "\n",
+    "For this example we have taken some data about a few dog breeds."
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "52829ffc", + "metadata": {}, + "outputs": [], + "source": [ + "document_1 = {\n", + " \"id\": \"Siberian Husky\",\n", + " \"text\": \"Siberian Huskies are strikingly beautiful and energetic Arctic breed dogs known for their captivating blue eyes and remarkable endurance in cold climates.\"\n", + "}\n", + "\n", + "document_2 = {\n", + " \"id\": \"Alaskan Malamute\",\n", + " \"text\": \"The Alaskan Malamute is a powerful and friendly Arctic sled dog breed known for its strength, endurance, and affectionate nature.\"\n", + "}\n", + "\n", + "document_3 = {\n", + " \"id\": \"Samoyed\",\n", + " \"text\": \"The Samoyed is a cheerful and fluffy Arctic breed, renowned for its smile and gentle disposition, originally used for herding reindeer and pulling sleds in Siberia.\"\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "6af96f59", + "metadata": {}, + "source": [ + "## Indexing the Docs\n", + "\n", + "On the first insert, the datastore will create the collection and index if necessary on the field `embedding`. Currently hybrid search is not yet supported. \n", + "\n", + "To make these requests to the retrieval app API, we will need to provide authorization in the form of the BEARER_TOKEN we set earlier. We do this below:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d68e796e", + "metadata": {}, + "outputs": [], + "source": [ + "BEARER_TOKEN_HERE = \"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkFheXVzaCBLYXRhcmlhIiwiaWF0IjoxNTE2MjM5MDIyfQ.VHEVK_IdThXZJr8aQsfjVQ-_n4raepdpqsC5gYDsubE\"\n", + "endpoint_url = 'http://0.0.0.0:8000'\n", + "headers = {\n", + " \"Authorization\": f\"Bearer {BEARER_TOKEN_HERE}\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "954a09da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ids': ['doc:Siberian Husky:chunk:Siberian Husky_0',\n", + " 'doc:Alaskan Malamute:chunk:Alaskan Malamute_0',\n", + " 'doc:Samoyed:chunk:Samoyed_0']}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response = requests.post(\n", + " f\"{endpoint_url}/upsert\",\n", + " headers=headers,\n", + " json={\"documents\": [document_1, document_2, document_3]\n", + " }\n", + ")\n", + "\n", + "response.json()" + ] + }, + { + "cell_type": "markdown", + "id": "431a8616", + "metadata": {}, + "source": [ + "## Querying the datastore\n", + "Let's query the data store for dogs based on the place of their origin." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "23441d46",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "queries = [\n",
+    "    {\n",
+    "        \"query\": \"I want dog breeds from Siberia.\",\n",
+    "        \"top_k\": 2\n",
+    "    },\n",
+    "    {\n",
+    "        \"query\": \"I want a dog breed from Alaska.\",\n",
+    "        \"top_k\": 1\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "response = requests.post(\n",
+    "    f\"{endpoint_url}/query\",\n",
+    "    headers=headers,\n",
+    "    json={\"queries\": queries}\n",
+    ")\n",
+    "\n",
+    "response.json()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "705181ee",
+   "metadata": {},
+   "source": [
+    "## Deleting the data from the datastore\n",
+    "You can either delete all the data or provide a list of document chunk ids to delete"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b15513ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.delete(\n",
+    "    f\"{endpoint_url}/delete\",\n",
+    "    headers=headers,\n",
+    "    json={\"ids\": [\"doc:Siberian Husky:chunk:Siberian Husky_0\"]}\n",
+    ")\n",
+    "\n",
+    "response.json()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc748e50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.delete(\n",
+    "    f\"{endpoint_url}/delete\",\n",
+    "    headers=headers,\n",
+    "    json={\"delete_all\": True}\n",
+    ")\n",
+    "\n",
+    "response.json()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19531965",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/poetry.lock b/poetry.lock
index b60c492e6..fe1d21c99 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1177,6 +1177,7 @@ files = [
    {file = "greenlet-2.0.2-cp27-cp27m-win32.whl", hash = "sha256:6c3acb79b0bfd4fe733dff8bc62695283b57949ebcca05ae5c129eb606ff2d74"},
    {file = "greenlet-2.0.2-cp27-cp27m-win_amd64.whl", hash = "sha256:283737e0da3f08bd637b5ad058507e578dd462db259f7f6e4c5c365ba4ee9343"},
    {file = "greenlet-2.0.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:d27ec7509b9c18b6d73f2f5ede2622441de812e7b1a80bbd446cb0633bd3d5ae"},
+   {file = "greenlet-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d967650d3f56af314b72df7089d96cda1083a7fc2da05b375d2bc48c82ab3f3c"},
    {file = "greenlet-2.0.2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:30bcf80dda7f15ac77ba5af2b961bdd9dbc77fd4ac6105cee85b0d0a5fcf74df"},
    {file = "greenlet-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26fbfce90728d82bc9e6c38ea4d038cba20b7faf8a0ca53a9c07b67318d46088"},
    {file = "greenlet-2.0.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9190f09060ea4debddd24665d6804b995a9c122ef5917ab26e1566dcc712ceeb"},
@@ -1185,6 +1186,7 @@ files = [
    {file = "greenlet-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:76ae285c8104046b3a7f06b42f29c7b73f77683df18c49ab5af7983994c2dd91"},
    {file = "greenlet-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:2d4686f195e32d36b4d7cf2d166857dbd0ee9f3d20ae349b6bf8afc8485b3645"},
    {file = "greenlet-2.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash =
"sha256:c4302695ad8027363e96311df24ee28978162cdcdd2006476c43970b384a244c"}, + {file = "greenlet-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d4606a527e30548153be1a9f155f4e283d109ffba663a15856089fb55f933e47"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c48f54ef8e05f04d6eff74b8233f6063cb1ed960243eacc474ee73a2ea8573ca"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a1846f1b999e78e13837c93c778dcfc3365902cfb8d1bdb7dd73ead37059f0d0"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a06ad5312349fec0ab944664b01d26f8d1f05009566339ac6f63f56589bc1a2"}, @@ -1214,6 +1216,7 @@ files = [ {file = "greenlet-2.0.2-cp37-cp37m-win32.whl", hash = "sha256:3f6ea9bd35eb450837a3d80e77b517ea5bc56b4647f5502cd28de13675ee12f7"}, {file = "greenlet-2.0.2-cp37-cp37m-win_amd64.whl", hash = "sha256:7492e2b7bd7c9b9916388d9df23fa49d9b88ac0640db0a5b4ecc2b653bf451e3"}, {file = "greenlet-2.0.2-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b864ba53912b6c3ab6bcb2beb19f19edd01a6bfcbdfe1f37ddd1778abfe75a30"}, + {file = "greenlet-2.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1087300cf9700bbf455b1b97e24db18f2f77b55302a68272c56209d5587c12d1"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:ba2956617f1c42598a308a84c6cf021a90ff3862eddafd20c3333d50f0edb45b"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc3a569657468b6f3fb60587e48356fe512c1754ca05a564f11366ac9e306526"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8eab883b3b2a38cc1e050819ef06a7e6344d4a990d24d45bc6f2cf959045a45b"}, @@ -1222,6 +1225,7 @@ files = [ {file = "greenlet-2.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b0ef99cdbe2b682b9ccbb964743a6aca37905fda5e0452e5ee239b1654d37f2a"}, {file = "greenlet-2.0.2-cp38-cp38-win32.whl", hash = "sha256:b80f600eddddce72320dbbc8e3784d16bd3fb7b517e82476d8da921f27d4b249"}, {file = "greenlet-2.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:4d2e11331fc0c02b6e84b0d28ece3a36e0548ee1a1ce9ddde03752d9b79bba40"}, + {file = "greenlet-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8512a0c38cfd4e66a858ddd1b17705587900dd760c6003998e9472b77b56d417"}, {file = "greenlet-2.0.2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:88d9ab96491d38a5ab7c56dd7a3cc37d83336ecc564e4e8816dbed12e5aaefc8"}, {file = "greenlet-2.0.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:561091a7be172ab497a3527602d467e2b3fbe75f9e783d8b8ce403fa414f71a6"}, {file = "greenlet-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:971ce5e14dc5e73715755d0ca2975ac88cfdaefcaab078a284fea6cfabf866df"}, @@ -2730,6 +2734,107 @@ pandas = ">=1.2.4" protobuf = ">=3.20.0" ujson = ">=2.0.0" +[[package]] +name = "pymongo" +version = "4.5.0" +description = "Python driver for MongoDB " +optional = false +python-versions = ">=3.7" +files = [ + {file = "pymongo-4.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2d4fa1b01fa7e5b7bb8d312e3542e211b320eb7a4e3d8dc884327039d93cb9e0"}, + {file = "pymongo-4.5.0-cp310-cp310-manylinux1_i686.whl", hash = "sha256:dfcd2b9f510411de615ccedd47462dae80e82fdc09fe9ab0f0f32f11cf57eeb5"}, + {file = "pymongo-4.5.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:3e33064f1984db412b34d51496f4ea785a9cff621c67de58e09fb28da6468a52"}, + {file = 
"pymongo-4.5.0-cp310-cp310-manylinux2014_i686.whl", hash = "sha256:33faa786cc907de63f745f587e9879429b46033d7d97a7b84b37f4f8f47b9b32"}, + {file = "pymongo-4.5.0-cp310-cp310-manylinux2014_ppc64le.whl", hash = "sha256:76a262c41c1a7cbb84a3b11976578a7eb8e788c4b7bfbd15c005fb6ca88e6e50"}, + {file = "pymongo-4.5.0-cp310-cp310-manylinux2014_s390x.whl", hash = "sha256:0f4b125b46fe377984fbaecf2af40ed48b05a4b7676a2ff98999f2016d66b3ec"}, + {file = "pymongo-4.5.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:40d5f6e853ece9bfc01e9129b228df446f49316a4252bb1fbfae5c3c9dedebad"}, + {file = "pymongo-4.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:152259f0f1a60f560323aacf463a3642a65a25557683f49cfa08c8f1ecb2395a"}, + {file = "pymongo-4.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6d64878d1659d2a5bdfd0f0a4d79bafe68653c573681495e424ab40d7b6d6d41"}, + {file = "pymongo-4.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1bb3a62395ffe835dbef3a1cbff48fbcce709c78bd1f52e896aee990928432b"}, + {file = "pymongo-4.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe48f50fb6348511a3268a893bfd4ab5f263f5ac220782449d03cd05964d1ae7"}, + {file = "pymongo-4.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7591a3beea6a9a4fa3080d27d193b41f631130e3ffa76b88c9ccea123f26dc59"}, + {file = "pymongo-4.5.0-cp310-cp310-win32.whl", hash = "sha256:3a7166d57dc74d679caa7743b8ecf7dc3a1235a9fd178654dddb2b2a627ae229"}, + {file = "pymongo-4.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:21b953da14549ff62ea4ae20889c71564328958cbdf880c64a92a48dda4c9c53"}, + {file = "pymongo-4.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ead4f19d0257a756b21ac2e0e85a37a7245ddec36d3b6008d5bfe416525967dc"}, + {file = "pymongo-4.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9aff6279e405dc953eeb540ab061e72c03cf38119613fce183a8e94f31be608f"}, + {file = "pymongo-4.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd4c8d6aa91d3e35016847cbe8d73106e3d1c9a4e6578d38e2c346bfe8edb3ca"}, + {file = "pymongo-4.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08819da7864f9b8d4a95729b2bea5fffed08b63d3b9c15b4fea47de655766cf5"}, + {file = "pymongo-4.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a253b765b7cbc4209f1d8ee16c7287c4268d3243070bf72d7eec5aa9dfe2a2c2"}, + {file = "pymongo-4.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8027c9063579083746147cf401a7072a9fb6829678076cd3deff28bb0e0f50c8"}, + {file = "pymongo-4.5.0-cp311-cp311-win32.whl", hash = "sha256:9d2346b00af524757576cc2406414562cced1d4349c92166a0ee377a2a483a80"}, + {file = "pymongo-4.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:c3c3525ea8658ee1192cdddf5faf99b07ebe1eeaa61bf32821126df6d1b8072b"}, + {file = "pymongo-4.5.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e5a27f348909235a106a3903fc8e70f573d89b41d723a500869c6569a391cff7"}, + {file = "pymongo-4.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9a9a39b7cac81dca79fca8c2a6479ef4c7b1aab95fad7544cc0e8fd943595a2"}, + {file = "pymongo-4.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:496c9cbcb4951183d4503a9d7d2c1e3694aab1304262f831d5e1917e60386036"}, + {file = 
"pymongo-4.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23cc6d7eb009c688d70da186b8f362d61d5dd1a2c14a45b890bd1e91e9c451f2"}, + {file = "pymongo-4.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fff7d17d30b2cd45afd654b3fc117755c5d84506ed25fda386494e4e0a3416e1"}, + {file = "pymongo-4.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6422b6763b016f2ef2beedded0e546d6aa6ba87910f9244d86e0ac7690f75c96"}, + {file = "pymongo-4.5.0-cp312-cp312-win32.whl", hash = "sha256:77cfff95c1fafd09e940b3fdcb7b65f11442662fad611d0e69b4dd5d17a81c60"}, + {file = "pymongo-4.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:e57d859b972c75ee44ea2ef4758f12821243e99de814030f69a3decb2aa86807"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:2b0176f9233a5927084c79ff80b51bd70bfd57e4f3d564f50f80238e797f0c8a"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:89b3f2da57a27913d15d2a07d58482f33d0a5b28abd20b8e643ab4d625e36257"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:5caee7bd08c3d36ec54617832b44985bd70c4cbd77c5b313de6f7fce0bb34f93"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:1d40ad09d9f5e719bc6f729cc6b17f31c0b055029719406bd31dde2f72fca7e7"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:076afa0a4a96ca9f77fec0e4a0d241200b3b3a1766f8d7be9a905ecf59a7416b"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:3fa3648e4f1e63ddfe53563ee111079ea3ab35c3b09cd25bc22dadc8269a495f"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:44ee985194c426ddf781fa784f31ffa29cb59657b2dba09250a4245431847d73"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b33c17d9e694b66d7e96977e9e56df19d662031483efe121a24772a44ccbbc7e"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3d79ae3bb1ff041c0db56f138c88ce1dfb0209f3546d8d6e7c3f74944ecd2439"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d67225f05f6ea27c8dc57f3fa6397c96d09c42af69d46629f71e82e66d33fa4f"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41771b22dd2822540f79a877c391283d4e6368125999a5ec8beee1ce566f3f82"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a1f26bc1f5ce774d99725773901820dfdfd24e875028da4a0252a5b48dcab5c"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3236cf89d69679eaeb9119c840f5c7eb388a2110b57af6bb6baf01a1da387c18"}, + {file = "pymongo-4.5.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:e1f61355c821e870fb4c17cdb318669cfbcf245a291ce5053b41140870c3e5cc"}, + {file = "pymongo-4.5.0-cp37-cp37m-win32.whl", hash = "sha256:49dce6957598975d8b8d506329d2a3a6c4aee911fa4bbcf5e52ffc6897122950"}, + {file = "pymongo-4.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:f2227a08b091bd41df5aadee0a5037673f691e2aa000e1968b1ea2342afc6880"}, + {file = "pymongo-4.5.0-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:435228d3c16a375274ac8ab9c4f9aef40c5e57ddb8296e20ecec9e2461da1017"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux1_i686.whl", hash = 
"sha256:8e559116e4128630ad3b7e788e2e5da81cbc2344dee246af44471fa650486a70"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:840eaf30ccac122df260b6005f9dfae4ac287c498ee91e3e90c56781614ca238"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:b4fe46b58010115514b842c669a0ed9b6a342017b15905653a5b1724ab80917f"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:a8127437ebc196a6f5e8fddd746bd0903a400dc6b5ae35df672dd1ccc7170a2a"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:2988ef5e6b360b3ff1c6d55c53515499de5f48df31afd9f785d788cdacfbe2d3"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:e249190b018d63c901678053b4a43e797ca78b93fb6d17633e3567d4b3ec6107"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:1240edc1a448d4ada4bf1a0e55550b6292420915292408e59159fd8bbdaf8f63"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b6d2a56fc2354bb6378f3634402eec788a8f3facf0b3e7d468db5f2b5a78d763"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a0aade2b11dc0c326ccd429ee4134d2d47459ff68d449c6d7e01e74651bd255"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:74c0da07c04d0781490b2915e7514b1adb265ef22af039a947988c331ee7455b"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3754acbd7efc7f1b529039fcffc092a15e1cf045e31f22f6c9c5950c613ec4d"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:631492573a1bef2f74f9ac0f9d84e0ce422c251644cd81207530af4aa2ee1980"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e2654d1278384cff75952682d17c718ecc1ad1d6227bb0068fd826ba47d426a5"}, + {file = "pymongo-4.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:168172ef7856e20ec024fe2a746bfa895c88b32720138e6438fd765ebd2b62dd"}, + {file = "pymongo-4.5.0-cp38-cp38-win32.whl", hash = "sha256:b25f7bea162b3dbec6d33c522097ef81df7c19a9300722fa6853f5b495aecb77"}, + {file = "pymongo-4.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:b520aafc6cb148bac09ccf532f52cbd31d83acf4d3e5070d84efe3c019a1adbf"}, + {file = "pymongo-4.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8543253adfaa0b802bfa88386db1009c6ebb7d5684d093ee4edc725007553d21"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:bc5d8c3647b8ae28e4312f1492b8f29deebd31479cd3abaa989090fb1d66db83"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:505f8519c4c782a61d94a17b0da50be639ec462128fbd10ab0a34889218fdee3"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:53f2dda54d76a98b43a410498bd12f6034b2a14b6844ca08513733b2b20b7ad8"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:9c04b9560872fa9a91251030c488e0a73bce9321a70f991f830c72b3f8115d0d"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:58a63a26a1e3dc481dd3a18d6d9f8bd1d576cd1ffe0d479ba7dd38b0aeb20066"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:f076b779aa3dc179aa3ed861be063a313ed4e48ae9f6a8370a9b1295d4502111"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:1b1d7d9aabd8629a31d63cd106d56cca0e6420f38e50563278b520f385c0d86e"}, + 
{file = "pymongo-4.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37df8f6006286a5896d1cbc3efb8471ced42e3568d38e6cb00857277047b0d63"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:56320c401f544d762fc35766936178fbceb1d9261cd7b24fbfbc8fb6f67aa8a5"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bbd705d5f3c3d1ff2d169e418bb789ff07ab3c70d567cc6ba6b72b04b9143481"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80a167081c75cf66b32f30e2f1eaee9365af935a86dbd76788169911bed9b5d5"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c42748ccc451dfcd9cef6c5447a7ab727351fd9747ad431db5ebb18a9b78a4d"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cf62da7a4cdec9a4b2981fcbd5e08053edffccf20e845c0b6ec1e77eb7fab61d"}, + {file = "pymongo-4.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b5bbb87fa0511bd313d9a2c90294c88db837667c2bda2ea3fa7a35b59fd93b1f"}, + {file = "pymongo-4.5.0-cp39-cp39-win32.whl", hash = "sha256:465fd5b040206f8bce7016b01d7e7f79d2fcd7c2b8e41791be9632a9df1b4999"}, + {file = "pymongo-4.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:63d8019eee119df308a075b8a7bdb06d4720bf791e2b73d5ab0e7473c115d79c"}, + {file = "pymongo-4.5.0.tar.gz", hash = "sha256:681f252e43b3ef054ca9161635f81b730f4d8cadd28b3f2b2004f5a72f853982"}, +] + +[package.dependencies] +dnspython = ">=1.16.0,<3.0.0" + +[package.extras] +aws = ["pymongo-auth-aws (<2.0.0)"] +encryption = ["certifi", "pymongo[aws]", "pymongocrypt (>=1.6.0,<2.0.0)"] +gssapi = ["pykerberos", "winkerberos (>=0.5.0)"] +ocsp = ["certifi", "cryptography (>=2.5)", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identity (>=18.1.0)"] +snappy = ["python-snappy"] +zstd = ["zstandard"] + [[package]] name = "pypdf2" version = "3.0.1" @@ -4218,4 +4323,4 @@ postgresql = ["psycopg2cffi"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "39179f3602509004d328d1fb7a48068c86f8e93ab0660cf18c7d7f85018cacf7" +content-hash = "75528d93a802f01a02594d42fe33c96146fbf6c6e35edade1b6c86afce50f9e1" diff --git a/pyproject.toml b/pyproject.toml index 814eeaa65..1ba7bf7cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ pgvector = "^0.1.7" psycopg2cffi = {version = "^2.9.0", optional = true} loguru = "^0.7.0" elasticsearch = "8.8.2" +pymongo = "^4.3.3" [tool.poetry.scripts] start = "server.main:start" diff --git a/tests/datastore/providers/azurecosmosdb/test_azurecosmosdb_datastore.py b/tests/datastore/providers/azurecosmosdb/test_azurecosmosdb_datastore.py new file mode 100644 index 000000000..7b238e4d5 --- /dev/null +++ b/tests/datastore/providers/azurecosmosdb/test_azurecosmosdb_datastore.py @@ -0,0 +1,182 @@ +import pytest +from typing import Dict, List +from dotenv import dotenv_values + +from datastore.datastore import DataStore +from datastore.providers.azurecosmosdb_datastore import AzureCosmosDBDataStore +from models.models import ( + DocumentChunk, + DocumentChunkMetadata, + QueryWithEmbedding, +) + + +num_lists = 1 +similarity = "COS" + + +def create_embedding(non_zero_pos: int) -> List[float]: + # create a vector with a single non-zero value of dimension 1536 + vector = [0.0] * 1536 + vector[non_zero_pos - 1] = 1.0 + return vector + + +@pytest.fixture +def azure_cosmos_db_settings_from_dot_env() 
-> dict:
+    """
+    Reads the Azure CosmosDB environment variables from the .env file.
+
+    Returns:
+        dict: The Azure CosmosDB environment variables
+    """
+    config = dotenv_values(".env")
+    env_variables = {
+        "DATASTORE": "azurecosmosdb",
+        "AZCOSMOS_API": config.get("AZCOSMOS_API"),  # Right now CosmosDB only supports vector search in Mongo vCore.
+        "AZCOSMOS_CONNSTR": config.get("AZCOSMOS_CONNSTR"),
+        "AZCOSMOS_DATABASE_NAME": config.get("AZCOSMOS_DATABASE_NAME"),
+        "AZCOSMOS_CONTAINER_NAME": config.get("AZCOSMOS_CONTAINER_NAME"),
+    }
+
+    return env_variables
+
+
+@pytest.fixture
+def initial_document_chunks() -> Dict[str, List[DocumentChunk]]:
+    first_doc_chunks = [
+        DocumentChunk(
+            id=f"first-doc-{i}",
+            text=f"Lorem ipsum {i}",
+            metadata=DocumentChunkMetadata(),
+            embedding=create_embedding(i),
+        )
+        for i in range(4, 7)
+    ]
+    return {
+        "first-doc": first_doc_chunks,
+    }
+
+
+@pytest.fixture
+def queries() -> List[QueryWithEmbedding]:
+    queries = [
+        QueryWithEmbedding(
+            query="Query 1",
+            top_k=1,
+            embedding=create_embedding(4),
+        ),
+        QueryWithEmbedding(
+            query="Query 2",
+            top_k=2,
+            embedding=create_embedding(5),
+        ),
+    ]
+    return queries
+
+
+@pytest.fixture
+async def azurecosmosdb_datastore() -> DataStore:
+    return await AzureCosmosDBDataStore.create(num_lists=num_lists, similarity=similarity)
+
+
+@pytest.mark.asyncio
+async def test_upsert(
+    azurecosmosdb_datastore: AzureCosmosDBDataStore,
+    initial_document_chunks: Dict[str, List[DocumentChunk]],
+) -> None:
+    """Test basic upsert."""
+    doc_ids = await azurecosmosdb_datastore._upsert(initial_document_chunks)
+    assert doc_ids == [f"doc:{doc_id}:chunk:{chunk.id}" for doc_id, chunk_list in initial_document_chunks.items()
+                       for chunk in chunk_list]
+
+
+@pytest.mark.asyncio
+async def test_query(
+    azurecosmosdb_datastore: AzureCosmosDBDataStore,
+    initial_document_chunks: Dict[str, List[DocumentChunk]],
+    queries: List[QueryWithEmbedding],
+) -> None:
+    """Test basic query."""
+    await azurecosmosdb_datastore.delete(delete_all=True)
+    # insert to prepare for the test
+    await azurecosmosdb_datastore._upsert(initial_document_chunks)
+
+    query_results = await azurecosmosdb_datastore._query(queries)
+    assert len(query_results) == len(queries)
+
+    query_0_results = query_results[0].results
+    query_1_results = query_results[1].results
+
+    assert len(query_0_results) == 1
+    assert len(query_1_results) == 2
+
+    # NOTE: this ordering is expected - the chunk whose embedding exactly
+    # matches the query vector should rank first.
+    assert query_0_results[0].id == "doc:first-doc:chunk:first-doc-4"
+    assert query_1_results[0].id == "doc:first-doc:chunk:first-doc-5"
+    assert query_1_results[1].id == "doc:first-doc:chunk:first-doc-4"
+
+
+@pytest.mark.asyncio
+async def test_delete(azurecosmosdb_datastore: AzureCosmosDBDataStore) -> None:
+    await azurecosmosdb_datastore.delete(delete_all=True)
+    chunk1 = DocumentChunk(
+        id="deleteChunk1",
+        text="delete text 1",
+        embedding=[1] * 1536,
+        metadata=DocumentChunkMetadata(),
+    )
+    chunk2 = DocumentChunk(
+        id="deleteChunk2",
+        text="delete text 2",
+        embedding=[1] * 1536,
+        metadata=DocumentChunkMetadata(),
+    )
+    # insert to prepare for test
+    await azurecosmosdb_datastore._upsert({"deleteDoc1": [chunk1], "deleteDoc2": [chunk2]})
+
+    query_embedding = [1] * 1536
+    query = QueryWithEmbedding(
+        query="Query for delete",
+        embedding=query_embedding,
+    )
+    results = await azurecosmosdb_datastore._query([query])
+
+    assert len(results[0].results) == 2
+    assert results[0].results[0].id == "doc:deleteDoc1:chunk:deleteChunk1"
+    assert results[0].results[1].id == "doc:deleteDoc2:chunk:deleteChunk2"
+
+    await azurecosmosdb_datastore.delete(ids=["doc:deleteDoc1:chunk:deleteChunk1"])
+    results_after_delete = await azurecosmosdb_datastore._query([query])
+
+    assert len(results_after_delete[0].results) == 1
+    assert results_after_delete[0].results[0].id == "doc:deleteDoc2:chunk:deleteChunk2"
+
+
+@pytest.mark.asyncio
+async def test_delete_all(azurecosmosdb_datastore: AzureCosmosDBDataStore) -> None:
+    await azurecosmosdb_datastore.delete(delete_all=True)
+    chunk = DocumentChunk(
+        id="deleteChunk",
+        text="delete text",
+        embedding=[1] * 1536,
+        metadata=DocumentChunkMetadata(),
+    )
+    await azurecosmosdb_datastore._upsert({"deleteDoc": [chunk]})
+
+    query_embedding = [1] * 1536
+    query = QueryWithEmbedding(
+        query="delete query",
+        embedding=query_embedding,
+        top_k=1,
+    )
+    results = await azurecosmosdb_datastore._query([query])
+
+    assert len(results) == 1
+    assert len(results[0].results) == 1
+    assert results[0].results[0].id == "doc:deleteDoc:chunk:deleteChunk"
+
+    await azurecosmosdb_datastore.delete(delete_all=True)
+    results_after_delete = await azurecosmosdb_datastore._query([query])
+
+    assert len(results_after_delete[0].results) == 0