Merged
26 changes: 26 additions & 0 deletions CLAUDE.md
@@ -34,6 +34,32 @@ Backend runs on port 8000, frontend on port 5173. If port 8000 is in use:
```bash
lsof -ti:8000 | xargs kill -9  # Kill process on port 8000
```

## Code Validation

**IMPORTANT**: After making code changes, always validate that the Docker image still builds successfully:

```bash
docker compose build
```

This command:
- Validates frontend code compiles (Vite build)
- Validates backend dependencies install correctly
- Ensures production build artifacts are created
- Catches build-time errors before deployment

Common build failures:
- CSS class name errors (invalid Tailwind utilities)
- TypeScript/JavaScript import errors
- Python dependency conflicts
- Missing environment variables in build process

If the build fails, fix the errors before committing. A successful build should complete with:
```
✓ built in XXXms
bjeans/multi-ai-chat:latest Built
```

## Architecture

### Backend Request Flow
4 changes: 4 additions & 0 deletions backend/.env.example
@@ -2,3 +2,7 @@ LITELLM_PROXY_URL=http://localhost:4000
LITELLM_API_KEY=your-api-key-here
DATABASE_URL=sqlite+aiosqlite:///./database.db
CORS_ORIGINS=http://localhost:5173,http://localhost:3000

# Cache TTL for server groups (in seconds)
# Default: 120 (2 minutes)
CACHE_TTL_SECONDS=120
80 changes: 78 additions & 2 deletions backend/api/config.py
@@ -1,12 +1,41 @@
from fastapi import APIRouter
from typing import List
from typing import List, Optional
from pydantic import BaseModel
import asyncio
import time
import os
import logging

from models.schemas import ModelInfo
from models.schemas import ModelInfo, ServerGroup, SelectionAnalysis
from services.litellm_client import LiteLLMClient
from services.model_processor import process_models_with_health, analyze_selection

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/config", tags=["config"])

# Global cache for server groups with expiration
_server_groups_cache: Optional[List[ServerGroup]] = None
_cache_timestamp: Optional[float] = None
_cache_lock = asyncio.Lock()

# Parse cache TTL with error handling
def _get_cache_ttl() -> int:
    """Get cache TTL from environment with validation"""
    default_ttl = 120
    try:
        ttl_str = os.getenv("CACHE_TTL_SECONDS", str(default_ttl))
        ttl = int(ttl_str)
        if ttl <= 0:
            logger.warning(f"Invalid CACHE_TTL_SECONDS={ttl_str} (must be > 0), using default {default_ttl}")
            return default_ttl
        return ttl
    except ValueError:
        logger.warning(f"Invalid CACHE_TTL_SECONDS={os.getenv('CACHE_TTL_SECONDS')} (must be integer), using default {default_ttl}")
        return default_ttl

CACHE_TTL_SECONDS = _get_cache_ttl()


class TestModelRequest(BaseModel):
    model_id: str
@@ -39,3 +68,50 @@ async def test_model(request: TestModelRequest):
        model_id=request.model_id,
        available=available,
    )


@router.get("/models/by-server", response_model=List[ServerGroup])
async def get_models_by_server():
    """
    Get models grouped by Ollama server with health status and size info
    """
    global _server_groups_cache, _cache_timestamp

    # Use lock to prevent race conditions on cache access
    async with _cache_lock:
        now = time.time()

        # Return cached data if valid and not expired
        if _server_groups_cache is not None and _cache_timestamp is not None:
            if now - _cache_timestamp < CACHE_TTL_SECONDS:
                return _server_groups_cache

        # Fetch fresh data
        client = LiteLLMClient()
        raw_data = await client.get_model_info()

        server_groups = await process_models_with_health(raw_data)

        # Update cache with timestamp
        _server_groups_cache = server_groups
        _cache_timestamp = now
Copilot AI (Dec 20, 2025), on lines +81 to +97:
The race condition prevention using a global lock is insufficient because the cache variables (_server_groups_cache, _cache_timestamp) are accessed and modified outside the lock in the fast-path return. If multiple requests arrive simultaneously after cache expiration, they could see inconsistent state. Consider keeping all cache reads and writes within the lock, or use a more robust caching library.
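One way to make the invariant the reviewer asks for structural is to keep the value, timestamp, and lock inside a single helper, so no call site can touch the cache state outside the lock. A minimal sketch; the `AsyncTTLCache` name and API are hypothetical, not part of this PR:

```python
import asyncio
import time
from typing import Awaitable, Callable, Generic, Optional, TypeVar

T = TypeVar("T")


class AsyncTTLCache(Generic[T]):
    """Single-value async cache; all reads and writes happen under one lock."""

    def __init__(self, ttl_seconds: float):
        self._ttl = ttl_seconds
        self._value: Optional[T] = None
        self._stamp: Optional[float] = None
        self._lock = asyncio.Lock()

    async def get(self, fetch: Callable[[], Awaitable[T]]) -> T:
        async with self._lock:
            now = time.time()
            # Serve from cache while the entry is fresh
            if self._value is not None and self._stamp is not None and now - self._stamp < self._ttl:
                return self._value
            # Refresh while still holding the lock, so concurrent callers
            # after expiration can never observe half-updated state
            self._value = await fetch()
            self._stamp = time.time()
            return self._value
```

The endpoint could then hold one module-level `AsyncTTLCache` and pass its fetch-and-process coroutine to `get()`, replacing the two globals entirely.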

        return server_groups


class AnalyzeSelectionRequest(BaseModel):
    model_ids: List[str]


@router.post("/models/analyze-selection", response_model=SelectionAnalysis)
async def analyze_model_selection(request: AnalyzeSelectionRequest):
    """
    Analyze selected models for resource conflicts and provide recommendations
    """
    # Get current server groups
    server_groups = await get_models_by_server()

    # Analyze selection
    analysis = await analyze_selection(request.model_ids, server_groups)

    return analysis
Copilot AI (Dec 20, 2025), on lines +106 to +117:
The analyze_selection endpoint doesn't validate that the model_ids in the request actually exist in the current server groups before processing. While the code handles missing models gracefully by filtering them out, this could hide issues where clients send stale or incorrect model IDs. Consider adding validation or returning information about invalid IDs.
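Following that suggestion, a sketch of the validation step. The helper name and the 422 error shape are hypothetical; the field access in the commented usage assumes the `ServerGroup`/`ModelInfoDetailed` schemas introduced in this PR:

```python
from typing import List, Set


def find_unknown_model_ids(requested: List[str], known_ids: Set[str]) -> List[str]:
    """Return requested IDs that don't match any known model, preserving request order."""
    return [mid for mid in requested if mid not in known_ids]


# Possible use inside analyze_model_selection (sketch, not part of the PR):
# known = {m.id for group in server_groups for m in group.models}
# unknown = find_unknown_model_ids(request.model_ids, known)
# if unknown:
#     raise HTTPException(status_code=422, detail=f"Unknown model IDs: {unknown}")
```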
90 changes: 90 additions & 0 deletions backend/models/schemas.py
@@ -16,6 +16,96 @@ class ModelInfo(BaseModel):
    provider: Optional[str] = None


class ModelHealth(BaseModel):
    status: str  # "healthy", "unhealthy", "unknown"
    healthy_count: int
    unhealthy_count: int
    response_time_ms: float
    last_checked: Optional[datetime]
    error_message: Optional[str] = None


class ModelSize(BaseModel):
    parameters: str  # "70B", "32B", "3B", etc.
    parameters_billions: float  # Numeric for sorting
    estimated_memory_gb: int  # Rough estimate for warnings
    size_tier: str  # "tiny" (<2B), "small" (2-10B), "medium" (10-30B), "large" (30B+)
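The tier boundaries documented on `size_tier` can be expressed as a small mapping function. A sketch; the `size_tier_for` name is hypothetical, and the behavior at exact boundary values is an assumption the schema comment doesn't pin down:

```python
def size_tier_for(parameters_billions: float) -> str:
    """Map a parameter count (in billions) to the tiers documented on ModelSize.

    Follows the field comment: tiny (<2B), small (2-10B),
    medium (10-30B), large (30B+).
    """
    if parameters_billions < 2:
        return "tiny"
    if parameters_billions < 10:
        return "small"
    if parameters_billions < 30:
        return "medium"
    return "large"
```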


class OllamaServerInfo(BaseModel):
    api_base: str
    host: str
    tpm: int
    rpm: int
    performance_tier: str
    health_status: str = "unknown"
    model_count: int = 0
    selected_model_count: int = 0
    total_selected_memory_gb: int = 0


class ModelInfoDetailed(BaseModel):
    id: str
    display_name: str
    base_model: str
    actual_tag: Optional[str] = None
    is_latest_alias: bool = False
    resolves_to: Optional[str] = None

    # Server information
    api_base: str
    server_host: str
    server_tpm: int
    server_rpm: int

    # Metadata
    provider: str
    model_family: str
    model_category: str

    # Model specs
    size: ModelSize
    health: Optional[ModelHealth] = None
    max_tokens: int = 4096
    supports_function_calling: bool = False

    # Duplication info
    is_duplicate: bool = False
    better_server: Optional[str] = None
    duplicate_count: int = 1


class ServerGroup(BaseModel):
    server: OllamaServerInfo
    models: List[ModelInfoDetailed]
    warnings: List[str] = []
    recommendations: List[str] = []


class SelectionWarning(BaseModel):
    severity: str  # "high", "medium", "info"
    server: str
    message: str
    models: Optional[List[str]] = None
    estimated_total_memory: Optional[str] = None


class SelectionRecommendation(BaseModel):
    type: str
    model: str
    from_server: str
    to_server: str
    reason: str


class SelectionAnalysis(BaseModel):
    warnings: List[SelectionWarning]
    recommendations: List[SelectionRecommendation]
    total_models_selected: int
    servers_used: int
    diversity_score: int


class ResponseSchema(BaseModel):
    id: int
    decision_id: int
28 changes: 28 additions & 0 deletions backend/services/litellm_client.py
@@ -52,6 +52,34 @@ async def get_available_models(self) -> List[Dict[str, any]]:
            print(f"Error fetching models: {e}")
            return []

    async def get_model_info(self) -> dict:
        """Fetch detailed model info from /v1/model/info endpoint"""
        try:
            async with httpx.AsyncClient(timeout=self.timeout, verify=False) as client:
Copilot AI (Dec 18, 2025):
SSL verification is disabled with verify=False, which is a security risk as it allows man-in-the-middle attacks. If this is for development with self-signed certificates, consider adding a configuration option to control this behavior rather than hardcoding it, or use a certificate bundle for internal certificates.
                response = await client.get(
                    f"{self.base_url}/v1/model/info",
                    headers=self._get_headers()
                )
                response.raise_for_status()
                return response.json()
        except Exception as e:
            print(f"Error fetching model info: {e}")
            return {"data": []}

    async def get_health_status(self) -> dict:
        """Fetch health status from /health/latest endpoint"""
        try:
            async with httpx.AsyncClient(timeout=self.timeout, verify=False) as client:
Copilot AI (Dec 18, 2025):
SSL verification is disabled with verify=False, which is a security risk as it allows man-in-the-middle attacks. If this is for development with self-signed certificates, consider adding a configuration option to control this behavior rather than hardcoding it, or use a certificate bundle for internal certificates.
                response = await client.get(
                    f"{self.base_url}/health/latest",
                    headers=self._get_headers()
                )
                response.raise_for_status()
                return response.json()
        except Exception as e:
            print(f"Error fetching health status: {e}")
            return {"latest_health_checks": {}, "total_models": 0}
Copilot AI (Dec 20, 2025), on lines +55 to +81:
The error messages from failed API calls are only printed to console. In production environments, these should be properly logged using a logging framework with appropriate log levels. Console prints may not be captured in production log aggregation systems.

    async def test_model(self, model_id: str) -> bool:
        """Test if a model is available and responding"""
        try: