diff --git a/README.md b/README.md
index 6af3a5251..55dc54032 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,196 @@
-# rag-tutorial-v2
+# RAG Tutorial v2
+
+This project demonstrates a Retrieval-Augmented Generation (RAG) pipeline using LangChain, ChromaDB, and Ollama for local LLM and embedding inference. It features both a command-line interface and a **minimalistic web-based chat UI** built with Streamlit.
+
+## 🚀 Quick Start - Web Interface
+
+1. Make sure you have followed all setup steps in the sections below.
+2. Start the Ollama server (if not already running):
+   ```
+   ollama serve
+   ```
+3. Launch the minimalistic chat interface:
+   ```
+   streamlit run app.py
+   ```
+4. Open your browser to the provided URL (usually `http://localhost:8501`)
+5. Upload PDF documents directly through the web interface or use existing files
+6. Start chatting with your documents!
+
+## 🖥️ Command Line Interface
+
+For traditional command-line usage:
+
+1. (Optional) Add or update PDF files in the `data/` directory.
+2. Populate the database:
+   ```
+   python populate_database.py
+   ```
+3. Run a query:
+   ```
+   python query_data.py "Your question here"
+   ```
+
+## ✨ Features
+
+### Web Interface (app.py)
+- **Minimalistic Design**: Clean, distraction-free chat interface
+- **File Upload**: Drag-and-drop PDF upload with real-time processing
+- **Interactive Chat**: Conversation history with user-friendly message bubbles
+- **Collapsible Upload**: Hide/show document upload section to focus on chat
+- **Real-time Processing**: Instant document ingestion and querying
+- **Settings Panel**: Clear chat history and reset database
+- **Responsive Design**: Works well on desktop and mobile devices
+
+### Core RAG Pipeline
+- **Document Processing**: Automatic PDF text extraction and chunking
+- **Vector Database**: ChromaDB for efficient similarity search
+- **Local LLM**: Mistral model via Ollama (no API keys required)
+- **Local Embeddings**: Nomic-embed-text model for text embeddings
+- **Smart Retrieval**: Context-aware document retrieval for accurate answers
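+
+These pieces come together in a single retrieve-then-generate pass. The sketch below mirrors what `query_data.py` in this repo does; the prompt wording, the `k` value, and the context separator are illustrative rather than the exact ones used here:
+
+```python
+from langchain_community.vectorstores import Chroma
+from langchain_community.llms.ollama import Ollama
+from get_embedding_function import get_embedding_function
+
+# Open the persisted vector store with the same embedding model used at ingest time.
+db = Chroma(persist_directory="chroma", embedding_function=get_embedding_function())
+
+# Retrieve the chunks most similar to the question.
+question = "How much total money does a player start with in Monopoly?"
+results = db.similarity_search_with_score(question, k=5)
+context = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
+
+# Ask the local LLM to answer using only the retrieved context.
+prompt = f"Answer the question based only on the following context:\n{context}\n\nQuestion: {question}"
+print(Ollama(model="mistral").invoke(prompt))
+```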
+
+## 📁 Project Structure
+
+```
+rag-tutorial-v2/
+├── app.py                      # Streamlit web interface (minimalistic design)
+├── query_data.py               # Command-line querying
+├── populate_database.py        # Database population utilities
+├── get_embedding_function.py   # Embedding model configuration
+├── test_rag.py                 # Testing utilities
+├── requirements.txt            # Python dependencies
+├── data/                       # PDF documents directory
+│   └── ath.pdf                 # Example document
+└── chroma/                     # ChromaDB vector store
+    └── chroma.sqlite3          # Vector database file
+```
+
+## 🎨 UI Design Features
+
+The Streamlit interface follows a **minimalistic design philosophy**:
+
+- **Clean Layout**: Centered layout with optimal reading width
+- **Minimal Visual Clutter**: Hidden Streamlit branding and unnecessary elements
+- **Modern Chat Bubbles**: Distinct styling for user and assistant messages
+- **Smooth Interactions**: Hover effects and transitions for better UX
+- **Collapsible Sections**: Upload area can be hidden to focus on conversation
+- **Color Scheme**: Subtle grays and blues for a professional appearance
+- **Typography**: Clean, readable fonts with proper spacing
+- **Responsive**: Adapts to different screen sizes
+
+## Prerequisites
+- Python 3.10+
+- [Ollama](https://ollama.com/) installed and running locally
+- (Optional) AWS credentials if using Bedrock embeddings
+
+## Setup Instructions
+
+### 1. Clone the Repository
+```
+git clone <repository-url>
+cd rag-tutorial-v2
+```
+
+### 2. Install Python Dependencies
+```
+pip install -r requirements.txt
+```
+
+### 3. Download Ollama Models
+You need the following models:
+- `nomic-embed-text` (for embeddings)
+- `mistral` (for the LLM)
+
+Pull them using:
+```
+ollama pull nomic-embed-text
+ollama pull mistral
+```
+
+### 4. Add Your PDF Files
+Place your PDF files in the `data/` directory. Example:
+```
+data/
+  monopoly.pdf
+  ticket_to_ride.pdf
+  your_file.pdf
+```
+
+### 5. Populate the Vector Database
+This step processes all PDFs and creates the Chroma vector store:
+```
+python populate_database.py
+```
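+
+Under the hood, the ingestion code extracts text with PyPDF and splits it into roughly 800-character chunks with 80 characters of overlap before writing them to Chroma (see `ingest_file` in `populate_database.py`). A quick, standalone way to sanity-check how one of your PDFs will be chunked; the file name is just an example:
+
+```python
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+# Load one PDF and split it with the same settings the ingestion code uses.
+documents = PyPDFLoader("data/ath.pdf").load()
+splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80)
+chunks = splitter.split_documents(documents)
+
+print(f"{len(chunks)} chunks; first chunk starts with:")
+print(chunks[0].page_content[:200])
+```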
+
+### 6. Query the Data
+Ask questions using the RAG pipeline:
+```
+python query_data.py "Your question here"
+```
+Example:
+```
+python query_data.py "How much total money does a player start with in Monopoly?"
+```
+
+## Usage Examples
+
+### Web Interface
+1. **Upload a Document**: Click "Add Document" and select a PDF file
+2. **Ask Questions**: Type your question in the chat input
+3. **View Responses**: See AI responses with context from your documents
+4. **Manage Chat**: Use the sidebar to clear the chat or reset the database
+
+### Command Line
+```bash
+# Process documents
+python populate_database.py
+
+# Ask questions
+python query_data.py "What is the main topic of the document?"
+python query_data.py "How much money does each player start with?"
+python query_data.py "What are the rules for passing GO?"
+```
+
+## Updating Data
+
+### Web Interface
+- Simply upload new PDF files through the web interface
+- Documents are automatically processed and added to the vector database
+- No manual database population needed
+
+### Command Line
+If you add new PDFs to the `data/` directory, run:
+```bash
+python populate_database.py
+```
+
+## Notes
+- If you see deprecation warnings, consider updating imports as suggested in the warnings.
+- To use AWS Bedrock embeddings, update `get_embedding_function.py` and configure your AWS credentials.
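+
+For reference, the Bedrock variant of `get_embedding_function()` looks roughly like this (assuming the usual `langchain_community` import path; adjust the profile and region to your AWS setup):
+
+```python
+from langchain_community.embeddings import BedrockEmbeddings
+
+def get_embedding_function():
+    # Requires valid AWS credentials for the named profile and access to a Bedrock embedding model.
+    return BedrockEmbeddings(credentials_profile_name="default", region_name="us-east-1")
+```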
+
+## Troubleshooting
+
+### Common Issues
+- **Ollama Connection**: Ensure Ollama is running (`ollama serve`)
+- **Model Not Found**: Pull the required models (`ollama pull mistral` and `ollama pull nomic-embed-text`)
+- **Streamlit Port**: If port 8501 is busy, Streamlit will suggest an alternative
+- **File Upload**: Ensure PDF files are valid and not password-protected
+- **Memory Issues**: For large documents, consider splitting them into smaller files (see the sketch below)
+
+### Performance Tips
+- Use the web interface for a better user experience
+- Upload smaller PDF files for faster processing
+- Clear the database periodically if it becomes too large
+- Close the upload section to focus on the chat interface
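+
+One way to split an oversized PDF before ingesting it, using the `pypdf` package already listed in `requirements.txt` (a minimal sketch; file names are placeholders):
+
+```python
+from pypdf import PdfReader, PdfWriter
+
+# Split a large PDF into two halves so each part can be ingested separately.
+reader = PdfReader("data/big_document.pdf")
+mid = len(reader.pages) // 2
+
+for name, page_range in [("part1.pdf", range(0, mid)), ("part2.pdf", range(mid, len(reader.pages)))]:
+    writer = PdfWriter()
+    for i in page_range:
+        writer.add_page(reader.pages[i])
+    with open(f"data/{name}", "wb") as out:
+        writer.write(out)
+```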
+
+## Tech Stack
+
+- **Frontend**: Streamlit with custom CSS for minimalistic design
+- **Backend**: Python with the LangChain framework
+- **Vector Database**: ChromaDB for similarity search
+- **LLM**: Mistral via Ollama (local inference)
+- **Embeddings**: Nomic-embed-text (local embeddings)
+- **Document Processing**: PyPDF for PDF text extraction
+
+## License
+MIT
diff --git a/app.py b/app.py
new file mode 100644
index 000000000..32aaf9e88
--- /dev/null
+++ b/app.py
@@ -0,0 +1,192 @@
+
+import streamlit as st
+import os
+import tempfile
+from populate_database import ingest_file, clear_database
+from query_data import query_rag
+
+# Minimalistic page config
+st.set_page_config(
+    page_title="RAG Chat",
+    page_icon="💬",
+    layout="centered",
+    initial_sidebar_state="collapsed"
+)
+
+# Custom CSS for minimalistic design
+st.markdown("""
+<style>
+/* Minimalistic styling: hide default Streamlit chrome and style the header, upload box, and chat bubbles. */
+</style>
+""", unsafe_allow_html=True)
+
+# Initialize session state
+if 'chat_history' not in st.session_state:
+    st.session_state['chat_history'] = []
+if 'show_upload' not in st.session_state:
+    st.session_state['show_upload'] = True
+
+# App title - minimalistic
+st.markdown("""
+<h1>💬 RAG Chat</h1>
+""", unsafe_allow_html=True)
+
+# Toggle upload section
+col1, col2, col3 = st.columns([1, 2, 1])
+with col2:
+    if st.button("📄 " + ("Hide Upload" if st.session_state['show_upload'] else "Add Document"),
+                 use_container_width=True, type="secondary"):
+        st.session_state['show_upload'] = not st.session_state['show_upload']
+
+# Upload section - collapsible
+if st.session_state['show_upload']:
+    with st.container():
+        st.markdown('<div class="upload-section">', unsafe_allow_html=True)
+        st.markdown("##### 📄 Upload PDF Document")
+        uploaded_file = st.file_uploader("", type=["pdf"], label_visibility="collapsed")
+
+        if uploaded_file is not None:
+            temp_dir = tempfile.gettempdir()
+            file_path = os.path.join(temp_dir, uploaded_file.name)
+            with open(file_path, "wb") as f:
+                f.write(uploaded_file.getbuffer())
+
+            with st.spinner("Processing document..."):
+                result = ingest_file(file_path)
+            st.success(f"✅ {uploaded_file.name} added successfully")
+            # Auto-hide upload section after successful upload
+            st.session_state['show_upload'] = False
+            st.rerun()
+
+        st.markdown('</div>', unsafe_allow_html=True)
+
+# Chat Interface
+st.markdown("---")
+
+# Chat input at the bottom
+with st.form(key="chat_form", clear_on_submit=True):
+    col1, col2 = st.columns([4, 1])
+    with col1:
+        user_input = st.text_input("", placeholder="Ask a question about your documents...", label_visibility="collapsed")
+    with col2:
+        submit_button = st.form_submit_button("Send", use_container_width=True)
+
+if submit_button and user_input:
+    with st.spinner("Thinking..."):
+        try:
+            answer = query_rag(user_input)
+        except Exception as e:
+            answer = f"I encountered an error: {str(e)}"
+
+    st.session_state['chat_history'].append((user_input, answer))
+    st.rerun()
+
+# Display chat history with minimalistic design
+if st.session_state['chat_history']:
+    st.markdown("### Conversation")
+
+    # Reverse order to show latest messages first
+    for i, (question, answer) in enumerate(reversed(st.session_state['chat_history'])):
+        # User message
+        st.markdown(f"""
+        <div class="user-message">
+            <strong>You:</strong> {question}
+        </div>
+        """, unsafe_allow_html=True)
+
+        # Assistant message
+        st.markdown(f"""
+        <div class="assistant-message">
+            <strong>Assistant:</strong> {answer}
+        </div>
+        """, unsafe_allow_html=True)
+
+        if i < len(st.session_state['chat_history']) - 1:
+            st.markdown("<hr>", unsafe_allow_html=True)
", unsafe_allow_html=True) + +# Settings in sidebar for advanced users +with st.sidebar: + st.markdown("### ⚙️ Settings") + + if st.button("🗑️ Clear Chat", use_container_width=True): + st.session_state['chat_history'] = [] + st.rerun() + + st.markdown("---") + + if st.button("�️ Clear Database", use_container_width=True, type="secondary"): + with st.spinner("Clearing database..."): + clear_database() + st.session_state['chat_history'] = [] + st.success("Database cleared!") + st.rerun() + + st.markdown("---") + st.markdown("*Made with Streamlit*") diff --git a/data/ath.pdf b/data/ath.pdf new file mode 100644 index 000000000..d12ee114c Binary files /dev/null and b/data/ath.pdf differ diff --git a/data/monopoly.pdf b/data/monopoly.pdf deleted file mode 100644 index 62700472c..000000000 Binary files a/data/monopoly.pdf and /dev/null differ diff --git a/data/ticket_to_ride.pdf b/data/ticket_to_ride.pdf deleted file mode 100644 index 35f5a87fa..000000000 Binary files a/data/ticket_to_ride.pdf and /dev/null differ diff --git a/get_embedding_function.py b/get_embedding_function.py index 79d04113b..88e2da4af 100644 --- a/get_embedding_function.py +++ b/get_embedding_function.py @@ -3,8 +3,8 @@ def get_embedding_function(): - embeddings = BedrockEmbeddings( - credentials_profile_name="default", region_name="us-east-1" - ) - # embeddings = OllamaEmbeddings(model="nomic-embed-text") + # embeddings = BedrockEmbeddings( + # credentials_profile_name="default", region_name="us-east-1" + # ) + embeddings = OllamaEmbeddings(model="nomic-embed-text") return embeddings diff --git a/populate_database.py b/populate_database.py index 3d2a1ab8a..77898bc7f 100644 --- a/populate_database.py +++ b/populate_database.py @@ -1,3 +1,33 @@ +def ingest_file(file_path): + """Ingest a single PDF file into the vector database.""" + from langchain_community.document_loaders import PyPDFLoader + from langchain_text_splitters import RecursiveCharacterTextSplitter + from langchain.vectorstores.chroma import Chroma + from get_embedding_function import get_embedding_function + + loader = PyPDFLoader(file_path) + documents = loader.load() + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=800, + chunk_overlap=80, + length_function=len, + is_separator_regex=False, + ) + chunks = text_splitter.split_documents(documents) + db = Chroma( + persist_directory=CHROMA_PATH, embedding_function=get_embedding_function() + ) + chunks_with_ids = calculate_chunk_ids(chunks) + existing_items = db.get(include=[]) + existing_ids = set(existing_items["ids"]) + new_chunks = [chunk for chunk in chunks_with_ids if chunk.metadata["id"] not in existing_ids] + if new_chunks: + new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks] + db.add_documents(new_chunks, ids=new_chunk_ids) + db.persist() + return f"Added {len(new_chunks)} new chunks from {os.path.basename(file_path)}" + else: + return f"No new content to add from {os.path.basename(file_path)}" import argparse import os import shutil diff --git a/query_data.py b/query_data.py index 33299e582..e35af3f7f 100644 --- a/query_data.py +++ b/query_data.py @@ -1,7 +1,10 @@ + import argparse -from langchain.vectorstores.chroma import Chroma from langchain.prompts import ChatPromptTemplate from langchain_community.llms.ollama import Ollama +from langchain_community.vectorstores import Chroma +from langchain_ollama import OllamaEmbeddings, OllamaLLM +from langchain_chroma import Chroma from get_embedding_function import get_embedding_function @@ -18,13 +21,14 @@ """ + def main(): # Create 
diff --git a/query_data.py b/query_data.py
index 33299e582..e35af3f7f 100644
--- a/query_data.py
+++ b/query_data.py
@@ -1,7 +1,8 @@
+
 import argparse
-from langchain.vectorstores.chroma import Chroma
 from langchain.prompts import ChatPromptTemplate
 from langchain_community.llms.ollama import Ollama
+from langchain_community.vectorstores import Chroma
 
 from get_embedding_function import get_embedding_function
 
@@ -18,13 +19,14 @@
 """
 
 
+
 def main():
     # Create CLI.
     parser = argparse.ArgumentParser()
     parser.add_argument("query_text", type=str, help="The query text.")
     args = parser.parse_args()
     query_text = args.query_text
-    query_rag(query_text)
+    print(query_rag(query_text))
 
 
 def query_rag(query_text: str):
@@ -40,7 +42,8 @@ def query_rag(query_text: str):
     prompt = prompt_template.format(context=context_text, question=query_text)
     # print(prompt)
 
-    model = Ollama(model="mistral")
+    # model = Ollama(model="mistral")
+    model = Ollama(model="mistral:7b-instruct-q4_0")  # Smaller, faster model
     response_text = model.invoke(prompt)
 
     sources = [doc.metadata.get("id", None) for doc, _score in results]
diff --git a/requirements.txt b/requirements.txt
index b290b554c..23ea9ad1b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+# Streamlit for web UI
+streamlit
 pypdf
 langchain
 chromadb # Vector storage