Merged
27 changes: 25 additions & 2 deletions ai_chatbot_backend/.env.example
@@ -30,13 +30,36 @@ DATA_DIR=data

# LLM Configuration
# Options: local, remote, mock
# local = use local model (requires GPU and model files)
# remote = use remote API endpoint
# local = connect to external vLLM servers (OpenAI-compatible API)
# remote = use legacy remote API endpoint
# mock = use mock responses for testing
llm_mode=mock
# URL for remote model API (used when llm_mode=remote)
remote_model_url=https://tai.berkeley.edu/api/chat

# vLLM Server Configuration (used when llm_mode=local)
# These settings configure connections to external vLLM servers running OpenAI-compatible APIs
# The backend server can run on a different machine from the vLLM servers

# vLLM Chat Server (main LLM for chat completions)
VLLM_CHAT_URL=http://localhost:8001/v1
VLLM_CHAT_MODEL=cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit

# vLLM Embedding Server (for RAG retrieval)
VLLM_EMBEDDING_URL=http://localhost:8002/v1
VLLM_EMBEDDING_MODEL=Qwen/Qwen3-Embedding-4B

# vLLM Whisper Server (for speech-to-text)
VLLM_WHISPER_URL=http://localhost:8003/v1
VLLM_WHISPER_MODEL=openai/whisper-large-v3

# vLLM TTS Server (for text-to-speech, optional)
VLLM_TTS_URL=http://localhost:8004/v1
VLLM_TTS_MODEL=

# API key for all vLLM servers (use 'EMPTY' if no authentication required)
VLLM_API_KEY=EMPTY

# MongoDB Configuration
# MongoDB connection URI for cloud data storage
MONGODB_URI=mongodb+srv://username:[email protected]/?retryWrites=true&w=majority&appName=yourapp
83 changes: 70 additions & 13 deletions ai_chatbot_backend/README.md
@@ -21,20 +21,24 @@ make db-init
make dev
```

### Production Setup (Linux with NVIDIA GPU)
### Production Setup

The backend connects to external vLLM servers for AI model inference.

```bash
# Install with GPU dependencies (vLLM)
make install-gpu
# 1. Start vLLM servers (follow instructions in docs/vllm-setup.md)

# Configure environment for production
# 2. Install backend dependencies
make install

# 3. Configure environment for production
cp .env.example .env
# Edit .env: set environment=production, configure MongoDB URI
# Edit .env: set environment=production, configure MongoDB URI and vLLM server URLs

# Initialize database
# 4. Initialize database
make db-init

# Start production server
# 5. Start production server
make server
```

Expand All @@ -44,9 +48,26 @@ Visit `http://localhost:8000` for API documentation.

- **Framework**: FastAPI with SQLAlchemy ORM
- **Database**: Hybrid SQLite (local) + MongoDB (cloud backup)
- **AI Models**: BGE-M3 embeddings, vLLM (production), OpenAI/Remote APIs
- **AI Models**: Qwen3-Embedding, vLLM servers (OpenAI-compatible API)
- **Authentication**: JWT token-based with Google OAuth support

### Distributed Architecture

The backend is designed to run separately from the AI model servers:

```
┌─────────────────────┐ ┌─────────────────────────────────┐
│ Backend Server │ │ GPU Server (vLLM) │
│ (FastAPI) │────▶│ - Chat Model (:8001) │
│ - API endpoints │ │ - Embedding Model (:8002) │
│ - RAG pipeline │ │ - Whisper Model (:8003) │
│ - File services │ │ - TTS Model (:8004, optional) │
└─────────────────────┘ └─────────────────────────────────┘
```

This allows running the backend on a lightweight server while GPU-intensive
model inference runs on dedicated hardware.
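The split above can be expressed as a small sketch (hedged example: the URLs are the defaults from `.env.example`, and `with_gpu_host` is a hypothetical helper name used only for illustration, not part of the backend):

```python
# Default vLLM endpoints, mirroring .env.example; adjust hosts/ports
# to match your deployment.
DEFAULT_VLLM_SERVICES = {
    "chat": "http://localhost:8001/v1",
    "embedding": "http://localhost:8002/v1",
    "whisper": "http://localhost:8003/v1",
    "tts": "http://localhost:8004/v1",  # optional
}

def with_gpu_host(services: dict, host: str) -> dict:
    """Point every vLLM endpoint at a remote GPU server instead of localhost."""
    return {name: url.replace("localhost", host) for name, url in services.items()}
```

For example, `with_gpu_host(DEFAULT_VLLM_SERVICES, "10.0.0.5")["chat"]` yields `http://10.0.0.5:8001/v1`, which is exactly the edit you would make by hand in `.env`.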

## 📦 Installation Options

```bash
Expand Down Expand Up @@ -74,15 +95,38 @@ MONGODB_URI=mongodb+srv://user:[email protected]/
MONGODB_ENABLED=true

# LLM Configuration
llm_mode=remote # local (vLLM), remote (API), mock (dev)
remote_model_url=https://your-api-endpoint.com
llm_mode=local # local (vLLM servers), remote (legacy API), mock (dev)

# Server
HOST=127.0.0.1
PORT=8000
RELOAD=true # false for production
```

### vLLM Server Configuration

When `llm_mode=local`, the backend connects to external vLLM servers via OpenAI-compatible APIs.
Configure the server URLs in your `.env` file:

```bash
# vLLM Server URLs (default: localhost, change IP for remote GPU servers)
VLLM_CHAT_URL=http://localhost:8001/v1 # Main chat model
VLLM_EMBEDDING_URL=http://localhost:8002/v1 # Embedding model for RAG
VLLM_WHISPER_URL=http://localhost:8003/v1 # Speech-to-text
VLLM_TTS_URL=http://localhost:8004/v1 # Text-to-speech (optional)

# API key for vLLM servers (use 'EMPTY' if no auth required)
VLLM_API_KEY=EMPTY

# Model IDs (must match models loaded on vLLM servers)
VLLM_CHAT_MODEL=cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit
VLLM_EMBEDDING_MODEL=Qwen/Qwen3-Embedding-4B
VLLM_WHISPER_MODEL=openai/whisper-large-v3
```

**Running on separate machines:** Replace `localhost` with the GPU server's IP address.
See [docs/vllm-setup.md](docs/vllm-setup.md) for instructions on starting the vLLM servers.
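Because the vLLM servers expose OpenAI-compatible APIs, any OpenAI-style client can talk to them. As a hedged sketch, the raw wire format of a chat request looks like this (the request is built but not sent, so no server is needed; the env-var names and defaults match `.env.example`):

```python
import json
import os
import urllib.request

# Read the same variables the backend uses, with the .env.example defaults.
base_url = os.environ.get("VLLM_CHAT_URL", "http://localhost:8001/v1")
api_key = os.environ.get("VLLM_API_KEY", "EMPTY")
model = os.environ.get("VLLM_CHAT_MODEL", "cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit")

# OpenAI-compatible chat completions payload.
payload = {
    "model": model,
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": False,
}
req = urllib.request.Request(
    f"{base_url}/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
)
# urllib.request.urlopen(req)  # uncomment once a vLLM chat server is running
```

In practice the backend uses a proper client library rather than raw `urllib`, but the endpoint path (`/v1/chat/completions`), bearer token, and model ID are the same.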

## 🛠️ Development Commands

```bash
Expand Down Expand Up @@ -130,12 +174,18 @@ python scripts/initialize_db_and_files.py --force

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/chat/completions` | POST | Chat completions with RAG (streaming) |
| `/api/chat/tts` | POST | Text-to-speech conversion |
| `/api/chat/voice_to_text` | POST | Speech-to-text transcription |
| `/api/courses` | GET | List courses with pagination |
| `/api/files` | GET | List files with filtering |
| `/api/files/{file_id}/download` | GET | Download file by UUID |
| `/api/completions` | POST | Generate chat completions with RAG |
| `/api/files/{file_id}/extra_info` | GET | File sections and concepts |
| `/api/files/browse` | GET | Directory browser |
| `/api/problems` | GET | List problems by file |
| `/admin/` | GET | Database administration interface |
| `/health` | GET | Health check |
| `/database-status` | GET | Database initialization status |
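A minimal client for the streaming chat endpoint can be sketched as follows (hedged: the request body's field names are an assumption inferred from the table above — consult the interactive docs at `http://localhost:8000` for the authoritative schema; the request is only constructed, not sent):

```python
import json
import os
import urllib.request

# Backend base URL; BACKEND_URL is a hypothetical variable for this sketch.
backend = os.environ.get("BACKEND_URL", "http://localhost:8000")

# Assumed request shape for /api/chat/completions (verify against the API docs).
body = {
    "messages": [{"role": "user", "content": "What is covered in lecture 1?"}],
    "stream": True,  # the endpoint streams responses
}
req = urllib.request.Request(
    f"{backend}/api/chat/completions",
    data=json.dumps(body).encode(),
    headers={"Content-Type": "application/json"},
)
# with urllib.request.urlopen(req) as resp:  # uncomment with a running backend
#     for chunk in resp:
#         print(chunk.decode(), end="")
```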

## 🚀 Production Deployment

Expand Down Expand Up @@ -167,12 +217,18 @@ poetry run gunicorn -w 4 -k uvicorn.workers.UvicornWorker main:app
```bash
# Required for production
environment=production
llm_mode=local # Uses vLLM for local inference
llm_mode=local # Connects to vLLM servers
MONGODB_URI=mongodb+srv://...
DATA_DIR=/path/to/course/files
RELOAD=false
LOG_LEVEL=info
HOST=0.0.0.0

# vLLM servers (change localhost to remote IP if servers are on different machines)
VLLM_CHAT_URL=http://localhost:8001/v1
VLLM_EMBEDDING_URL=http://localhost:8002/v1
VLLM_WHISPER_URL=http://localhost:8003/v1
VLLM_API_KEY=your-secure-api-key
```

## 🔧 Troubleshooting
Expand Down Expand Up @@ -213,6 +269,7 @@ python -c "import torch; print(torch.cuda.is_available())"

## 📚 Documentation

- [TAI Project Overview](../../README.md)
- [TAI Project Overview](../README.md)
- [vLLM Server Setup Guide](docs/vllm-setup.md)
- [FastAPI Documentation](https://fastapi.tiangolo.com/)
- [Development Commands Reference](./Makefile)
19 changes: 7 additions & 12 deletions ai_chatbot_backend/app/api/routes/completions.py
@@ -155,18 +155,16 @@ async def voice_to_text(
"""
Endpoint for converting voice messages to text.
"""
# Get the pre-initialized Whisper model engine
whisper_engine = get_whisper_engine()

# Convert audio message to text
if params.stream:
stream = await audio_to_text(params.audio, whisper_engine, stream=params.stream, sample_rate=24000)
# Use async streaming transcription
return StreamingResponse(
audio_stream_parser(stream), media_type="text/event-stream"
audio_stream_parser(params.audio, sample_rate=24000), media_type="text/event-stream"
)
else:
transcription = audio_to_text(
params.audio, whisper_engine, stream=params.stream, sample_rate=24000)
# Use synchronous transcription
whisper_engine = get_whisper_engine()
transcription = audio_to_text(params.audio, whisper_engine, sample_rate=24000)
return JSONResponse(AudioTranscript(text=transcription).model_dump_json(exclude_unset=True))


@@ -206,17 +204,14 @@ async def create_or_update_memory_synopsis(
JSON response with memory_synopsis_sid if successful, error message if failed
"""
try:
# Get the pre-initialized pipeline
# Get the pre-initialized pipeline (OpenAI client)
engine = get_model_engine()

# Import TOKENIZER from rag_generation
from app.services.rag_generation import TOKENIZER

# Initialize memory synopsis service
service = MemorySynopsisService()

# Create or update memory synopsis
memory_synopsis_sid = await service.create_or_update_memory(sid, format_chat_msg(messages), engine, TOKENIZER)
memory_synopsis_sid = await service.create_or_update_memory(sid, format_chat_msg(messages), engine)

if memory_synopsis_sid:
return JSONResponse({
47 changes: 47 additions & 0 deletions ai_chatbot_backend/app/config.py
@@ -37,6 +37,53 @@ class Settings(BaseSettings):
)
remote_model_url: str = Field(description="URL for remote model API")

# vLLM Server Configuration
vllm_chat_url: str = Field(
default="http://localhost:8001/v1",
description="vLLM server URL for chat/responses API",
alias="VLLM_CHAT_URL"
)
vllm_whisper_url: str = Field(
default="http://localhost:8003/v1",
description="vLLM server URL for Whisper transcription API",
alias="VLLM_WHISPER_URL"
)
vllm_embedding_url: str = Field(
default="http://localhost:8002/v1",
description="vLLM server URL for embeddings API",
alias="VLLM_EMBEDDING_URL"
)
vllm_api_key: str = Field(
default="EMPTY",
description="API key for vLLM servers (set 'EMPTY' if no auth required)",
alias="VLLM_API_KEY"
)
vllm_tts_url: str = Field(
default="http://localhost:8004/v1",
description="vLLM server URL for TTS (audio generation) API",
alias="VLLM_TTS_URL"
)
vllm_tts_model: str = Field(
default="",
description="Model ID for vLLM TTS server (leave empty to auto-detect)",
alias="VLLM_TTS_MODEL"
)
vllm_chat_model: str = Field(
default="cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit",
description="Model ID for vLLM chat server",
alias="VLLM_CHAT_MODEL"
)
vllm_whisper_model: str = Field(
default="openai/whisper-large-v3",
description="Model ID for vLLM Whisper server",
alias="VLLM_WHISPER_MODEL"
)
vllm_embedding_model: str = Field(
default="Qwen/Qwen3-Embedding-4B",
description="Model ID for vLLM embedding server",
alias="VLLM_EMBEDDING_MODEL"
)

admin_token: str = Field(
description="Admin token required for course management endpoints. Must be set in .env file."
)