Skip to content

Commit

Permalink
KXI-28991 initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Charli Posner committed Aug 25, 2023
1 parent f6a7995 commit 808f96b
Show file tree
Hide file tree
Showing 122 changed files with 18,334 additions and 0 deletions.
459 changes: 459 additions & 0 deletions ChatGPT_Retrieval_Plugin_QA.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions ChatGPT_Retrieval_Plugin_QA.ipynb:Zone.Identifier
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[ZoneTransfer]
ZoneId=3
HostUrl=https://files.slack.com/files-pri/T0HLFPUJE-F05J0S9UA8P/download/chatgpt_retrieval_plugin_qa.ipynb?origin_team=T0HLFPUJE
24 changes: 24 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

# Stage 1: use Poetry only to turn the project's dependency spec into a plain
# requirements.txt that pip can consume in the final image.
FROM python:3.10 as requirements-stage

WORKDIR /tmp

RUN pip install poetry

# The trailing "*" makes poetry.lock a glob pattern -- presumably so the build
# still works when no lock file has been committed (confirm against repo usage).
COPY ./pyproject.toml ./poetry.lock* /tmp/


# Export the resolved dependencies, without hashes, to /tmp/requirements.txt.
RUN poetry export -f requirements.txt --output requirements.txt --without-hashes

# Stage 2: the runtime image. Only requirements.txt is taken from stage 1, so
# Poetry itself is not installed here.
FROM python:3.10

WORKDIR /code

COPY --from=requirements-stage /tmp/requirements.txt /code/requirements.txt

# Dependencies are installed before the application source is copied in, so a
# source-only change does not invalidate this layer.
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . /code/

# Heroku uses PORT, Azure App Services uses WEBSITES_PORT, Fly.io uses 8080 by default
# Port resolution order: $PORT, then $WEBSITES_PORT, then the literal 8080.
CMD ["sh", "-c", "uvicorn server.main:app --host 0.0.0.0 --port ${PORT:-${WEBSITES_PORT:-8080}}"]
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 OpenAI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
14 changes: 14 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Heroku
# make heroku-login
# make heroku-push

# Name of the Heroku app to build for and release to; replace the placeholder
# with the real app name before running any target.
HEROKU_APP = <your app name>

# These targets are commands, not files. Declaring them phony keeps make from
# skipping them if a file named "heroku-push" or "heroku-login" ever exists.
.PHONY: heroku-push heroku-login

# Build a linux/amd64 image, tag it for the Heroku container registry, push it,
# and release it as the app's "web" process.
heroku-push:
	docker buildx build --platform linux/amd64 -t ${HEROKU_APP} .
	docker tag ${HEROKU_APP} registry.heroku.com/${HEROKU_APP}/web
	docker push registry.heroku.com/${HEROKU_APP}/web
	heroku container:release web -a ${HEROKU_APP}

# Log in to the Heroku container registry (run this before heroku-push).
heroku-login:
	heroku container:login
607 changes: 607 additions & 0 deletions README.md

Large diffs are not rendered by default.

Empty file added datastore/__init__.py
Empty file.
Binary file added datastore/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file added datastore/__pycache__/datastore.cpython-310.pyc
Binary file not shown.
Binary file added datastore/__pycache__/factory.cpython-310.pyc
Binary file not shown.
86 changes: 86 additions & 0 deletions datastore/datastore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
import asyncio

from models.models import (
Document,
DocumentChunk,
DocumentMetadataFilter,
Query,
QueryResult,
QueryWithEmbedding,
)
from services.chunks import get_document_chunks
from services.openai import get_embeddings


class DataStore(ABC):
    """
    Abstract base class for vector-database backends.

    ``upsert`` and ``query`` perform the provider-independent preparation
    (clearing old vectors and chunking, or embedding the query text) and then
    hand off to the ``_upsert`` / ``_query`` hooks. Concrete providers
    implement those two hooks plus ``delete``.
    """

    async def upsert(
        self, documents: List[Document], chunk_token_size: Optional[int] = None
    ) -> List[str]:
        """
        Store ``documents`` in the database and return a list of document ids.

        Before inserting, a filtered ``delete`` is issued for every document
        that carries an id, so vectors previously stored under that id are
        replaced rather than duplicated. The documents are then split into
        chunks by ``get_document_chunks`` (using ``chunk_token_size``) and
        passed to the provider's ``_upsert``.
        """
        # Documents without an id have nothing stored under them yet, so only
        # the ones with an id need a delete request.
        pending_deletes = [
            self.delete(
                filter=DocumentMetadataFilter(document_id=doc.id),
                delete_all=False,
            )
            for doc in documents
            if doc.id
        ]
        # Run all per-document deletes concurrently and wait for them to finish
        # before any new chunk is written.
        await asyncio.gather(*pending_deletes)

        chunks_by_document = get_document_chunks(documents, chunk_token_size)
        return await self._upsert(chunks_by_document)

    @abstractmethod
    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
        """
        Provider hook: insert the given document chunks into the database.

        ``chunks`` maps a string key (presumably the document id -- confirm
        against ``get_document_chunks``) to that document's list of chunks.
        Return a list of document ids.
        """
        raise NotImplementedError

    async def query(self, queries: List[Query]) -> List[QueryResult]:
        """
        Run ``queries`` (query text plus filters) and return, for each, the
        matching document chunks and their scores.

        NOTE(review): ``get_embeddings`` is called without ``await``; if it
        performs network I/O it will block the event loop -- confirm.
        """
        # Pull out just the query text of every Query ...
        texts = [item.query for item in queries]
        vectors = get_embeddings(texts)
        # ... then rebuild each Query with its embedding attached.
        embedded_queries = [
            QueryWithEmbedding(embedding=vector, **item.dict())
            for item, vector in zip(queries, vectors)
        ]
        return await self._query(embedded_queries)

    @abstractmethod
    async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:
        """
        Provider hook: given queries that already carry their embeddings (and
        filters), return the matching document chunks and scores per query.
        """
        raise NotImplementedError

    @abstractmethod
    async def delete(
        self,
        ids: Optional[List[str]] = None,
        filter: Optional[DocumentMetadataFilter] = None,
        delete_all: Optional[bool] = None,
    ) -> bool:
        """
        Provider hook: remove vectors selected by ``ids``, by ``filter``, or
        everything in the datastore (``delete_all``).

        Several of the parameters may be supplied in the same call.
        Returns whether the operation was successful.
        """
        raise NotImplementedError
72 changes: 72 additions & 0 deletions datastore/factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from datastore.datastore import DataStore
import os


async def get_datastore() -> DataStore:
datastore = os.environ.get("DATASTORE")
assert datastore is not None

match datastore:
case "chroma":
from datastore.providers.chroma_datastore import ChromaDataStore

return ChromaDataStore()
case "llama":
from datastore.providers.llama_datastore import LlamaDataStore

return LlamaDataStore()

case "pinecone":
from datastore.providers.pinecone_datastore import PineconeDataStore

return PineconeDataStore()
case "weaviate":
from datastore.providers.weaviate_datastore import WeaviateDataStore

return WeaviateDataStore()
case "milvus":
from datastore.providers.milvus_datastore import MilvusDataStore

return MilvusDataStore()
case "zilliz":
from datastore.providers.zilliz_datastore import ZillizDataStore

return ZillizDataStore()
case "redis":
from datastore.providers.redis_datastore import RedisDataStore

return await RedisDataStore.init()
case "qdrant":
from datastore.providers.qdrant_datastore import QdrantDataStore

return QdrantDataStore()
case "azuresearch":
from datastore.providers.azuresearch_datastore import AzureSearchDataStore

return AzureSearchDataStore()
case "supabase":
from datastore.providers.supabase_datastore import SupabaseDataStore

return SupabaseDataStore()
case "postgres":
from datastore.providers.postgres_datastore import PostgresDataStore

return PostgresDataStore()
case "analyticdb":
from datastore.providers.analyticdb_datastore import AnalyticDBDataStore

return AnalyticDBDataStore()
case "elasticsearch":
from datastore.providers.elasticsearch_datastore import (
ElasticsearchDataStore,
)
return ElasticsearchDataStore()
case "kdbai":
from datastore.providers.kdbai_datastore import KDBAIDataStore

return KDBAIDataStore()
case _:
raise ValueError(
f"Unsupported vector database: {datastore}. "
f"Try one of the following: llama, elasticsearch, pinecone, weaviate, milvus, zilliz, redis, or qdrant"
)
Empty file added datastore/providers/__init__.py
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit 808f96b

Please sign in to comment.