diff --git a/.gitignore b/.gitignore index ea09c3ecf..5ab5d45ac 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc .DS_Store backup -chroma \ No newline at end of file +chroma +venv diff --git a/README.md b/README.md index 6af3a5251..4af16f6cf 100644 --- a/README.md +++ b/README.md @@ -1 +1,14 @@ # rag-tutorial-v2 + +## Getting Started +Before running, ensure that Ollama has been installed. If not, please install it from [here](https://ollama.com/download). After that, you **must** run `ollama serve` and, in a separate terminal, pull the embedding model with `ollama pull mxbai-embed-large`. + +1. `python -m venv venv` +2. `source venv/bin/activate` +3. `pip install -r requirements.txt` +4. `python ./populate_database.py` +5. `python ./query_data.py "How much does each player get for each round in Monopoly?"` + +## Update document +1. Place the PDF file inside the `data` folder +2. `python ./populate_database.py` diff --git a/get_embedding_function.py b/get_embedding_function.py index 79d04113b..079867376 100644 --- a/get_embedding_function.py +++ b/get_embedding_function.py @@ -3,8 +3,8 @@ def get_embedding_function(): - embeddings = BedrockEmbeddings( - credentials_profile_name="default", region_name="us-east-1" - ) - # embeddings = OllamaEmbeddings(model="nomic-embed-text") + # embeddings = BedrockEmbeddings( + # credentials_profile_name="default", region_name="us-east-1" + # ) + embeddings = OllamaEmbeddings(model="mxbai-embed-large") return embeddings diff --git a/populate_database.py b/populate_database.py index 3d2a1ab8a..966643f92 100644 --- a/populate_database.py +++ b/populate_database.py @@ -1,11 +1,11 @@ import argparse import os import shutil -from langchain.document_loaders.pdf import PyPDFDirectoryLoader +from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain.schema.document import Document from get_embedding_function import get_embedding_function -from langchain.vectorstores.chroma import Chroma +from langchain_chroma import Chroma CHROMA_PATH 
= "chroma" @@ -67,7 +67,6 @@ def add_to_chroma(chunks: list[Document]): print(f"👉 Adding new documents: {len(new_chunks)}") new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks] db.add_documents(new_chunks, ids=new_chunk_ids) - db.persist() else: print("✅ No new documents to add") diff --git a/query_data.py b/query_data.py index 33299e582..482ef57a6 100644 --- a/query_data.py +++ b/query_data.py @@ -1,5 +1,5 @@ import argparse -from langchain.vectorstores.chroma import Chroma +from langchain_chroma import Chroma from langchain.prompts import ChatPromptTemplate from langchain_community.llms.ollama import Ollama diff --git a/requirements.txt b/requirements.txt index b290b554c..d5c081c86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ pypdf langchain -chromadb # Vector storage +langchain_community +langchain_chroma pytest boto3