forked from openai/chatgpt-retrieval-plugin
* First draft of ChromaDataStore
* Add embeddings to the invocation of collection.add
* Resolve uncertainty over embedding_function by using outer embeddings
* Add CHROMA_IN_MEMORY config variable
* Update poetry.lock
* Fix import error
* Fix default collection name to pass validation
* Add empty scaffolding for integration tests
* Fix error: add should return document ids
* Add first pass at integration tests using same fixtures as from Zilliz integration tests
* Fix created_at handling and pass test test_query_filter
* Fix more tests
* Fix host/port initialization
* Fix NotEnoughElementsException
* Fix filter handling for source
* Fix deletion tests
* Change defaults, upsert method
* Clients, metadata handling
* Update tests
* Docstrings
* Update README
* Updated tests and docs
* Updated poetry.lock
* Updated tests
* Cleanup after tests
* Fix docs path
* Remove embeddings return
* Reference

---------

Co-authored-by: Chelsea Voss <[email protected]>
Showing 7 changed files with 2,425 additions and 597 deletions.
@@ -0,0 +1,249 @@
"""
Chroma datastore support for the ChatGPT retrieval plugin.
Consult the Chroma docs and GitHub repo for more information:
- https://docs.trychroma.com/usage-guide?lang=py
- https://github.com/chroma-core/chroma
- https://www.trychroma.com/
"""

import os
from datetime import datetime
from typing import Dict, List, Optional

import chromadb

from datastore.datastore import DataStore
from models.models import (
    Document,
    DocumentChunk,
    DocumentChunkMetadata,
    DocumentChunkWithScore,
    DocumentMetadataFilter,
    QueryResult,
    QueryWithEmbedding,
    Source,
)
from services.chunks import get_document_chunks

# Environment variables are always strings, so parse CHROMA_IN_MEMORY into a
# bool here; passing the raw string through would make any value (even
# "False") truthy.
CHROMA_IN_MEMORY = os.environ.get("CHROMA_IN_MEMORY", "True") == "True"
CHROMA_PERSISTENCE_DIR = os.environ.get("CHROMA_PERSISTENCE_DIR", "openai")
CHROMA_HOST = os.environ.get("CHROMA_HOST", "http://127.0.0.1")
CHROMA_PORT = os.environ.get("CHROMA_PORT", "8000")
CHROMA_COLLECTION = os.environ.get("CHROMA_COLLECTION", "openaiembeddings")

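# The datastore can run against three kinds of Chroma clients: an explicitly
# supplied client, an in-memory (optionally persisted) instance, or a remote
# instance reached over Chroma's REST API. Embeddings are computed by the
# plugin and passed in explicitly, so the collection's own embedding_function
# is left unset.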
class ChromaDataStore(DataStore):
    def __init__(
        self,
        in_memory: bool = CHROMA_IN_MEMORY,
        persistence_dir: Optional[str] = CHROMA_PERSISTENCE_DIR,
        collection_name: str = CHROMA_COLLECTION,
        host: str = CHROMA_HOST,
        port: str = CHROMA_PORT,
        client: Optional[chromadb.Client] = None,
    ):
        if client:
            self._client = client
        else:
            if in_memory:
                settings = (
                    chromadb.config.Settings(
                        chroma_db_impl="duckdb+parquet",
                        persist_directory=persistence_dir,
                    )
                    if persistence_dir
                    else chromadb.config.Settings()
                )

                self._client = chromadb.Client(settings=settings)
            else:
                self._client = chromadb.Client(
                    settings=chromadb.config.Settings(
                        chroma_api_impl="rest",
                        chroma_server_host=host,
                        chroma_server_http_port=port,
                    )
                )
        self._collection = self._client.get_or_create_collection(
            name=collection_name,
            embedding_function=None,
        )

    async def upsert(
        self, documents: List[Document], chunk_token_size: Optional[int] = None
    ) -> List[str]:
        """
        Takes in a list of documents and inserts them into the database. If an id already exists, the document is updated.
        Returns a list of document ids.
        """

        chunks = get_document_chunks(documents, chunk_token_size)

        # Chroma has a true upsert, so we don't need to delete first
        return await self._upsert(chunks)

    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
        """
        Takes in a dict mapping document ids to lists of document chunks and inserts them into the database.
        Returns a list of document ids.
        """

        self._collection.upsert(
            ids=[chunk.id for chunk_list in chunks.values() for chunk in chunk_list],
            embeddings=[
                chunk.embedding
                for chunk_list in chunks.values()
                for chunk in chunk_list
            ],
            documents=[
                chunk.text for chunk_list in chunks.values() for chunk in chunk_list
            ],
            metadatas=[
                self._process_metadata_for_storage(chunk.metadata)
                for chunk_list in chunks.values()
                for chunk in chunk_list
            ],
        )
        return list(chunks.keys())

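    # Translates a DocumentMetadataFilter into a Chroma `where` clause.
    # Exact-match fields (e.g. document_id, author) are copied through
    # directly; `source` is unwrapped to its enum value; start_date/end_date
    # become $gte/$lte range conditions on the integer `created_at` timestamp.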
    def _where_from_query_filter(self, query_filter: DocumentMetadataFilter) -> Dict:
        output = {
            k: v
            for (k, v) in query_filter.dict().items()
            if v is not None and k not in ("start_date", "end_date", "source")
        }
        if query_filter.source:
            output["source"] = query_filter.source.value
        if query_filter.start_date and query_filter.end_date:
            output["$and"] = [
                {
                    "created_at": {
                        "$gte": int(
                            datetime.fromisoformat(query_filter.start_date).timestamp()
                        )
                    }
                },
                {
                    "created_at": {
                        "$lte": int(
                            datetime.fromisoformat(query_filter.end_date).timestamp()
                        )
                    }
                },
            ]
        elif query_filter.start_date:
            output["created_at"] = {
                "$gte": int(datetime.fromisoformat(query_filter.start_date).timestamp())
            }
        elif query_filter.end_date:
            output["created_at"] = {
                "$lte": int(datetime.fromisoformat(query_filter.end_date).timestamp())
            }

        return output

    def _process_metadata_for_storage(self, metadata: DocumentChunkMetadata) -> Dict:
        stored_metadata = {}
        if metadata.source:
            stored_metadata["source"] = metadata.source.value
        if metadata.source_id:
            stored_metadata["source_id"] = metadata.source_id
        if metadata.url:
            stored_metadata["url"] = metadata.url
        if metadata.created_at:
            stored_metadata["created_at"] = int(
                datetime.fromisoformat(metadata.created_at).timestamp()
            )
        if metadata.author:
            stored_metadata["author"] = metadata.author
        if metadata.document_id:
            stored_metadata["document_id"] = metadata.document_id

        return stored_metadata

    def _process_metadata_from_storage(self, metadata: Dict) -> DocumentChunkMetadata:
        return DocumentChunkMetadata(
            source=Source(metadata["source"]) if "source" in metadata else None,
            source_id=metadata.get("source_id", None),
            url=metadata.get("url", None),
            created_at=datetime.fromtimestamp(metadata["created_at"]).isoformat()
            if "created_at" in metadata
            else None,
            author=metadata.get("author", None),
            document_id=metadata.get("document_id", None),
        )

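    # Note: Chroma returns distances, where smaller means more similar; they
    # are passed through unchanged as the scores on the returned chunks.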
    async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:
        """
        Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores.
        """
        results = [
            self._collection.query(
                query_embeddings=[query.embedding],
                include=["documents", "distances", "metadatas"],  # embeddings
                n_results=min(query.top_k, self._collection.count()),
                where=(
                    self._where_from_query_filter(query.filter) if query.filter else {}
                ),
            )
            for query in queries
        ]

        output = []
        for query, result in zip(queries, results):
            inner_results = []
            (ids,) = result["ids"]
            # (embeddings,) = result["embeddings"]
            (documents,) = result["documents"]
            (metadatas,) = result["metadatas"]
            (distances,) = result["distances"]
            for id_, text, metadata, distance in zip(
                ids,
                documents,
                metadatas,
                distances,  # embeddings (https://github.com/openai/chatgpt-retrieval-plugin/pull/59#discussion_r1154985153)
            ):
                inner_results.append(
                    DocumentChunkWithScore(
                        id=id_,
                        text=text,
                        metadata=self._process_metadata_from_storage(metadata),
                        # embedding=embedding,
                        score=distance,
                    )
                )
            output.append(QueryResult(query=query.query, results=inner_results))

        return output

    async def delete(
        self,
        ids: Optional[List[str]] = None,
        filter: Optional[DocumentMetadataFilter] = None,
        delete_all: Optional[bool] = None,
    ) -> bool:
        """
        Removes vectors by ids, filter, or everything in the datastore.
        Multiple parameters can be used at once.
        Returns whether the operation was successful.
        """
        if delete_all:
            self._collection.delete()
            return True

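        # Note: the ids passed here are document ids, matched against the
        # document_id metadata field, not the ids of individual Chroma chunks.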
        if ids and len(ids) > 0:
            if len(ids) > 1:
                where_clause = {"$or": [{"document_id": id_} for id_ in ids]}
            else:
                (id_,) = ids
                where_clause = {"document_id": id_}

            if filter:
                where_clause = {
                    "$and": [self._where_from_query_filter(filter), where_clause]
                }
        elif filter:
            where_clause = self._where_from_query_filter(filter)
        else:
            # No ids or filter provided, and delete_all was handled above,
            # so there is nothing to delete.
            return True

        self._collection.delete(where=where_clause)
        return True
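To tie the pieces together, here is a minimal usage sketch. It assumes an in-memory, non-persisted instance, a set `OPENAI_API_KEY` (upsert computes chunk embeddings through the plugin's embedding service), and a hypothetical module path for the new file:

```python
import asyncio

# Hypothetical module path for the file added in this commit.
from datastore.providers.chroma_datastore import ChromaDataStore
from models.models import Document, DocumentMetadata, Source


async def main() -> None:
    # Pure in-memory instance: no persistence, no CHROMA_* env vars needed.
    datastore = ChromaDataStore(in_memory=True, persistence_dir=None)

    doc = Document(
        id="doc1",
        text="Chroma is an AI-native open-source embedding database.",
        metadata=DocumentMetadata(source=Source.file),
    )

    # upsert() chunks the document, embeds the chunks, and writes ids,
    # embeddings, texts, and metadata into the Chroma collection.
    ids = await datastore.upsert([doc])
    print(ids)  # ["doc1"]


asyncio.run(main())
```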
@@ -0,0 +1,29 @@
[Chroma](https://trychroma.com) is an AI-native open-source embedding database designed to make it easy to work with embeddings. Chroma runs in-memory, or in a client-server setup.

Install Chroma by running `pip install chromadb`. Once installed, the core API consists of four essential commands for creating collections, adding embeddings, documents, and metadata, and querying embeddings to find similar documents. Get started with Chroma by visiting the [Getting Started](https://docs.trychroma.com) page on their documentation website, or explore the open-source code on their [GitHub repository](https://github.com/chroma-core/chroma).
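For orientation, a minimal sketch of that core API; the collection name, documents, and four-dimensional embedding values are illustrative placeholders, not realistic model output:

```python
import chromadb

client = chromadb.Client()  # in-memory instance

# Create (or fetch) a collection to hold embeddings, documents, and metadata.
collection = client.get_or_create_collection(name="demo")

# Add records; embeddings are supplied explicitly here, as the plugin does.
collection.add(
    ids=["a", "b"],
    embeddings=[[0.1, 0.2, 0.3, 0.4], [0.4, 0.3, 0.2, 0.1]],
    documents=["first document", "second document"],
    metadatas=[{"source": "file"}, {"source": "email"}],
)

# Query by embedding to find the most similar documents.
results = collection.query(query_embeddings=[[0.1, 0.2, 0.3, 0.4]], n_results=1)
print(results["documents"])  # [["first document"]]
```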
**Chroma Environment Variables**

To set up Chroma and start using it as your vector database provider, you need to define some environment variables to connect to your Chroma instance.

**Chroma Datastore Environment Variables**

Chroma runs _in-memory_ by default, with local persistence. It can also run in [self-hosted](https://docs.trychroma.com/usage-guide#running-chroma-in-clientserver-mode) client-server mode, with a fully managed hosted version coming soon.

| Name                     | Required | Description                                                                                        | Default            |
| ------------------------ | -------- | -------------------------------------------------------------------------------------------------- | ------------------ |
| `DATASTORE`              | Yes      | Datastore name. Set this to `chroma`                                                               |                    |
| `BEARER_TOKEN`           | Yes      | Your secret token for authenticating requests to the API                                           |                    |
| `OPENAI_API_KEY`         | Yes      | Your OpenAI API key for generating embeddings                                                      |                    |
| `CHROMA_COLLECTION`      | Optional | Your chosen Chroma collection name to store your embeddings                                        | `openaiembeddings` |
| `CHROMA_IN_MEMORY`       | Optional | If set to `True`, ignore `CHROMA_HOST` and `CHROMA_PORT` and just use an in-memory Chroma instance | `True`             |
| `CHROMA_PERSISTENCE_DIR` | Optional | If set, and `CHROMA_IN_MEMORY` is set, persist to and load from this directory                     | `openai`           |
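As a concrete illustration, the default in-memory setup could be configured like this before the plugin starts (shown via Python's `os.environ` to keep the example self-contained; in practice these are typically exported in the shell or a `.env` file, and the token and key values are placeholders):

```python
import os

os.environ["DATASTORE"] = "chroma"
os.environ["BEARER_TOKEN"] = "<your-bearer-token>"      # placeholder
os.environ["OPENAI_API_KEY"] = "<your-openai-api-key>"  # placeholder
os.environ["CHROMA_IN_MEMORY"] = "True"
os.environ["CHROMA_PERSISTENCE_DIR"] = "openai"         # default persistence dir
os.environ["CHROMA_COLLECTION"] = "openaiembeddings"    # default collection name
```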
To run Chroma in self-hosted client-server mode, set the following variables:

| Name          | Required | Description                                         | Default            |
| ------------- | -------- | --------------------------------------------------- | ------------------ |
| `CHROMA_HOST` | Optional | Your Chroma instance host address (see notes below) | `http://127.0.0.1` |
| `CHROMA_PORT` | Optional | Your Chroma port number                             | `8000`             |

> For **self-hosted instances**, if your instance is not at 127.0.0.1:8000, set `CHROMA_HOST` and `CHROMA_PORT` accordingly. For example: `CHROMA_HOST=http://localhost/` and `CHROMA_PORT=8080`.
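When `CHROMA_IN_MEMORY` is not `True`, the datastore builds a REST client from these two variables, roughly as in this sketch (mirroring the `__init__` above; the host and port shown are just the defaults):

```python
import chromadb

# Client-server mode: connect to a self-hosted Chroma instance over REST.
client = chromadb.Client(
    settings=chromadb.config.Settings(
        chroma_api_impl="rest",
        chroma_server_host="http://127.0.0.1",  # CHROMA_HOST default
        chroma_server_http_port="8000",         # CHROMA_PORT default
    )
)
```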