diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 37c06a7..98d9d05 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,6 +25,7 @@ jobs: run: | cd backend python -m pip install --upgrade pip + pip install --no-cache-dir torch==2.4.1+cpu torchaudio==2.4.1+cpu --index-url https://download.pytorch.org/whl/cpu pip install -r requirements.txt - name: Run tests diff --git a/.gitignore b/.gitignore index 993dad7..ed0cd39 100644 --- a/.gitignore +++ b/.gitignore @@ -162,11 +162,6 @@ docker-compose.override.yml # ------------------------- .DS_Store -backend/storage/* -!backend/storage/.gitkeep - -backend/models/* -!backend/models/.gitkeep - -backend/whisper/* -!backend/whisper/.gitkeep +# Faiss index and metadata files +backend/storage/faiss.index +backend/storage/metadata.json diff --git a/backend/app/embeddings/generate.py b/backend/app/embeddings/generate.py index fb5ee98..9fe5211 100644 --- a/backend/app/embeddings/generate.py +++ b/backend/app/embeddings/generate.py @@ -72,63 +72,61 @@ def _get_clip_model() -> torch.nn.Module: local_path = next((p for p in [env_path, repo_local, alt_repo_local] if p and os.path.exists(p)), None) if local_path: - # Check if it's a TorchScript model or state dict - # TorchScript models may have hardcoded CUDA references, so we need to be careful + # Check file size to detect TorchScript (they're usually small serialized format) + # TorchScript models may have hardcoded CUDA references, so we skip them by default + import stat try: - # Try loading as TorchScript first (always force CPU to avoid CUDA issues) - model = torch.jit.load(local_path, map_location="cpu") - # For TorchScript, we still need preprocess - _, preprocess, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained=False) - # Force model to CPU (TorchScript models may have hardcoded CUDA) - model = model.cpu() - model.eval() - except (Exception, RuntimeError) as e: - error_str = str(e) - # Check if error is CUDA-related - if so, skip this file entirely - is_cuda_error = "CUDA" in error_str or "cuda" in error_str.lower() + file_size = os.path.getsize(local_path) + is_likely_torchscript = file_size < 10 * 1024 * 1024 # Less than 10MB is likely TorchScript - if is_cuda_error: - # TorchScript file has hardcoded CUDA - skip it and create fresh model - print(f"Warning: TorchScript model file has hardcoded CUDA references. Skipping file and creating fresh CPU-compatible model.") + if is_likely_torchscript: + print(f"Warning: Detected TorchScript model file ({file_size} bytes). 
Skipping to avoid CUDA hardcoding.") model, preprocess, _ = open_clip.create_model_and_transforms( "ViT-B-32", pretrained="openai" ) model = model.to(device) else: - # Fall back to state dict loading or create fresh model - print(f"Loading model from state dict (TorchScript failed: {error_str[:100]})") + # Larger files might be state dicts try: - model, preprocess, _ = open_clip.create_model_and_transforms( - "ViT-B-32", pretrained=False - ) checkpoint = torch.load(local_path, map_location="cpu", weights_only=False) if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: + # State dict checkpoint + model, preprocess, _ = open_clip.create_model_and_transforms( + "ViT-B-32", pretrained=False + ) model.load_state_dict(checkpoint['state_dict']) + model = model.to(device) + print(f"Loaded model from state dict checkpoint") elif isinstance(checkpoint, dict): + # Attempt to load dict as state + model, preprocess, _ = open_clip.create_model_and_transforms( + "ViT-B-32", pretrained=False + ) model.load_state_dict(checkpoint) + model = model.to(device) + print(f"Loaded model from state dict") else: - # If it's not a dict, it might be a TorchScript model object - # Check if it has TorchScript attributes - if hasattr(checkpoint, 'encode_image') and hasattr(checkpoint, 'graph'): - # It's a TorchScript model - skip it - print("Warning: Model file is TorchScript. Creating fresh model for CPU compatibility.") - model, preprocess, _ = open_clip.create_model_and_transforms( - "ViT-B-32", pretrained="openai" - ) - else: - model = checkpoint - _, preprocess, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained=False) - # Move to device (will be CPU) - model = model.to(device) - except Exception as e2: - # If state dict loading also fails, create a fresh model - print(f"Warning: Could not load model from file. Creating fresh model. Error: {str(e2)[:100]}") + # Unknown format - create fresh model + print(f"Unknown checkpoint format. Creating fresh model.") + model, preprocess, _ = open_clip.create_model_and_transforms( + "ViT-B-32", pretrained="openai" + ) + model = model.to(device) + except Exception as e: + print(f"Warning: Could not load checkpoint. Creating fresh model. Error: {str(e)[:100]}") model, preprocess, _ = open_clip.create_model_and_transforms( "ViT-B-32", pretrained="openai" ) model = model.to(device) + except Exception as e: + print(f"Warning: Error checking local model. Creating fresh model. Error: {str(e)[:100]}") + model, preprocess, _ = open_clip.create_model_and_transforms( + "ViT-B-32", pretrained="openai" + ) + model = model.to(device) else: # Download from openai (requires internet on first run) + print("No local CLIP model found. Downloading from OpenAI...") model, preprocess, _ = open_clip.create_model_and_transforms( "ViT-B-32", pretrained="openai" ) @@ -215,7 +213,11 @@ def embed_text(text: Union[str, List[str]]) -> np.ndarray: def embed_image(path: str) -> np.ndarray: - """Generate embeddings for image with strict CPU-only mode.""" + """Generate embeddings for image with strict CPU-only mode. + + Returns: + numpy array of shape (1, 512) containing normalized CLIP image embedding. 
+ """ global _clip_model, _clip_preprocess # Force CPU-only environment @@ -242,14 +244,22 @@ def embed_image(path: str) -> np.ndarray: try: # Encode image and normalize feats = model.encode_image(batch) + # Normalize: feats shape is (1, 512), normalize per-sample feats = feats / feats.norm(dim=-1, keepdim=True) - return feats.cpu().numpy().astype(np.float32) + result = feats.cpu().numpy().astype(np.float32) + + # Ensure shape is (1, 512) + if result.ndim == 1: + result = result.reshape(1, -1) + + assert result.shape == (1, 512), f"Expected shape (1, 512), got {result.shape}" + return result + except RuntimeError as e: error_msg = str(e) if "CUDA" in error_msg or "cuda" in error_msg.lower(): print("Warning: CUDA operation attempted. Reinitializing in CPU-only mode...") # Reset model to force CPU reinitialization - global _clip_model, _clip_preprocess _clip_model = None _clip_preprocess = None # Retry with fresh CPU model @@ -258,38 +268,15 @@ def embed_image(path: str) -> np.ndarray: batch = torch.stack([image]).cpu() feats = model.encode_image(batch) feats = feats / feats.norm(dim=-1, keepdim=True) - return feats.cpu().numpy().astype(np.float32) + result = feats.cpu().numpy().astype(np.float32) + if result.ndim == 1: + result = result.reshape(1, -1) + return result raise # Re-raise if it's not a CUDA error + except Exception as e: print(f"Error processing image {path}: {str(e)}") raise - - try: - with torch.no_grad(): - feats = model.encode_image(batch) - feats = feats / feats.norm(dim=-1, keepdim=True) - except RuntimeError as e: - if "CUDA" in str(e) or "cuda" in str(e).lower(): - # Model has CUDA hardcoded in TorchScript - need to recreate model - print(f"Warning: TorchScript model has hardcoded CUDA references. Recreating model for CPU compatibility.") - # Reset global model to force recreation - _clip_model = None - _clip_preprocess = None - - # Recreate model (will be CPU-compatible) - model, preprocess = get_clip() - device = torch.device("cpu") - model = model.cpu() - batch = batch.cpu() - - with torch.no_grad(): - feats = model.encode_image(batch) - feats = feats / feats.norm(dim=-1, keepdim=True) - else: - raise - - # Always return CPU numpy array - return feats.cpu().numpy().astype(np.float32) def embed_audio_segment(transcript: str) -> np.ndarray: diff --git a/backend/app/index_store.py b/backend/app/index_store.py index 976cdcb..6e1923e 100644 --- a/backend/app/index_store.py +++ b/backend/app/index_store.py @@ -27,7 +27,7 @@ def connect_db(): CREATE TABLE IF NOT EXISTS vectors ( id INTEGER PRIMARY KEY AUTOINCREMENT, vector_id INTEGER, - content TEXT, + page_content TEXT, file_name TEXT, file_type TEXT, page_number INTEGER, @@ -70,12 +70,12 @@ def add_embeddings_with_metadata(embeddings: np.ndarray, metadatas: List[dict]) for i, meta in enumerate(metadatas): cur.execute( """ - INSERT INTO vectors (vector_id, content, file_name, file_type, page_number, timestamp, filepath, width, height, bbox) + INSERT INTO vectors (vector_id, page_content, file_name, file_type, page_number, timestamp, filepath, width, height, bbox) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
""", ( start_id + i, - meta.get("content"), + meta.get("page_content"), meta.get("file_name"), meta.get("file_type"), meta.get("page_number"), @@ -107,7 +107,7 @@ def status() -> dict: def rebuild_from_db(dim: int) -> dict: conn = connect_db() cur = conn.cursor() - cur.execute("SELECT rowid, content FROM vectors ORDER BY rowid") + cur.execute("SELECT rowid, page_content FROM vectors ORDER BY rowid") rows = cur.fetchall() conn.close() if not rows: diff --git a/backend/app/ingestion/audio_transcriber.py b/backend/app/ingestion/audio_transcriber.py index 278d9e9..82ac459 100644 --- a/backend/app/ingestion/audio_transcriber.py +++ b/backend/app/ingestion/audio_transcriber.py @@ -1,108 +1,153 @@ -""" -Audio transcription using Whisper. +"""Audio transcription using local ASR (whisper/faster-whisper if available). + +This module tries to transcribe audio offline using locally installed Whisper +or faster-whisper backends. If neither is available, it will return an empty +transcript and the ingestion pipeline will surface an appropriate message. """ from __future__ import annotations import os -import subprocess -import tempfile from typing import List, Tuple from .base import Chunk, _split_text -def transcribe_audio_with_whisper_cpp(path: str) -> Tuple[str, List[Tuple[float, float, str]]]: - """ - Transcribe audio using Whisper model. - Returns (full_transcript, segments). - segments is a list of tuples (start_ms, end_ms, text). +def _format_timestamp(seconds: float) -> str: + """Format seconds as HH:MM:SS or MM:SS string.""" + try: + seconds = float(seconds) + except Exception: + return "" + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = int(seconds % 60) + if hours > 0: + return f"{hours:02d}:{minutes:02d}:{secs:02d}" + return f"{minutes:02d}:{secs:02d}" + + +def transcribe_audio_with_whisper(path: str) -> Tuple[str, List[Tuple[float, float, str]]]: + """Transcribe audio using available local Whisper implementation. + + Returns (full_transcript, segments) where segments is a list of + (start_seconds, end_seconds, text). """ + # Try openai/whisper first try: import whisper - import torch - import numpy as np - - # Force CPU mode for consistency - device = "cpu" - - # Load model with absolute path - model_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "models", "whisper")) - model = whisper.load_model("base", download_root=model_path, device=device) - - # Transcribe audio - result = model.transcribe(path, language=None) - - # Extract transcript and segments + print(f"[Audio] Loading openai-whisper model 'base'...") + model = whisper.load_model("base", device="cpu") + print(f"[Audio] Transcribing {path}...") + result = model.transcribe(path) transcript = result.get("text", "").strip() segments = [] - - # Process segments if available for seg in result.get("segments", []): + start = seg.get("start", 0.0) + end = seg.get("end", 0.0) text = seg.get("text", "").strip() if text: - start = seg.get("start", 0) - end = seg.get("end", 0) - segments.append((start, end, text)) - + segments.append((float(start), float(end), text)) + print(f"[Audio] Transcription complete. 
Length: {len(transcript)} chars.") return transcript, segments + except ImportError: + print("[Audio] openai-whisper not installed.") + except Exception as e: + print(f"[Audio] openai-whisper failed: {e}") + + # Try faster-whisper if installed (it often works well offline) + try: + from faster_whisper import WhisperModel + print(f"[Audio] Loading faster-whisper model...") + model_size = "small" + model_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "models", "whisper")) + model = WhisperModel(model_dir, device="cpu", compute_type="int8_float16") + segments_iter, info = model.transcribe(path, beam_size=5) + transcript_parts = [] + segments = [] + for segment in segments_iter: + start = float(segment.start) + end = float(segment.end) + text = segment.text.strip() + if text: + transcript_parts.append(text) + segments.append((start, end, text)) + return " ".join(transcript_parts), segments + except ImportError: + print("[Audio] faster-whisper not installed.") except Exception as e: - print(f"Error transcribing audio: {str(e)}") - pass + print(f"[Audio] faster-whisper failed: {e}") + + # If no local ASR available, return empty results (ingestion will handle fallback) + print("[Audio] No ASR backend available or all failed.") return "", [] def transcribe_audio(path: str, file_name: str) -> List[Chunk]: - """Transcribe audio file and create chunks with timestamps.""" - transcript, segments = transcribe_audio_with_whisper_cpp(path) + """Transcribe audio file and return a list of Chunk objects with timestamps. + + Each chunk will have attributes: + - content: transcript text + - file_name, file_type='audio', filepath + - start_ts (float seconds), end_ts (float seconds) + - timestamp (formatted start time string) + """ + transcript, segments = transcribe_audio_with_whisper(path) chunks: List[Chunk] = [] - - if not transcript or transcript.strip() == "": - # If transcription failed, return empty list - return chunks - - # Split transcript into chunks - split_chunks = _split_text(transcript) - - # Map chunks to segments for timestamp assignment - segment_idx = 0 - char_pos = 0 - - for i, ch in enumerate(split_chunks): - # Find corresponding segment for this chunk - chunk_start = char_pos - chunk_end = char_pos + len(ch) - - # Find segment that contains this chunk - timestamp_str = None - for seg_start, seg_end, seg_text in segments: - seg_char_start = sum(len(seg[2]) + 1 for seg in segments[:segments.index((seg_start, seg_end, seg_text))]) - seg_char_end = seg_char_start + len(seg_text) - - if chunk_start >= seg_char_start and chunk_start < seg_char_end: - # Format timestamp as "HH:MM:SS" or "MM:SS" - if seg_start is not None and seg_end is not None: - start_sec = seg_start / 1000.0 - end_sec = seg_end / 1000.0 - hours = int(start_sec // 3600) - minutes = int((start_sec % 3600) // 60) - seconds = int(start_sec % 60) - if hours > 0: - timestamp_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}" - else: - timestamp_str = f"{minutes:02d}:{seconds:02d}" - break - - c = Chunk( - content=ch, - file_name=file_name, - file_type="audio", + + if not segments: + # If ASR failed, create a single placeholder chunk to surface the file + placeholder = Chunk( + content="[Transcription unavailable: please install a local ASR model (whisper or faster-whisper)]", + file_name=file_name, + file_type="audio", filepath=path, - timestamp=timestamp_str # Store timestamp in Chunk + timestamp=None ) - # Also store segments for reference - setattr(c, "segments", segments) - chunks.append(c) - - char_pos 
= chunk_end - + setattr(placeholder, "start_ts", None) + setattr(placeholder, "end_ts", None) + chunks.append(placeholder) + return chunks + + # For each segment returned by ASR, split further if needed and assign timestamps + for seg_start, seg_end, seg_text in segments: + seg_text = (seg_text or "").strip() + if not seg_text: + continue + + # Split long segment text into subchunks while preserving approximate timestamps + subchunks = _split_text(seg_text, min_size=100, max_size=400) + if not subchunks: + subchunks = [seg_text] + + total_chars = sum(len(s) for s in subchunks) + if total_chars == 0: + continue + + # Distribute timestamp range proportionally across subchunks + seg_duration = max(0.0, float(seg_end) - float(seg_start)) + char_cursor = 0 + for sub in subchunks: + proportion = len(sub) / total_chars + start_offset = (char_cursor / total_chars) * seg_duration + end_offset = ((char_cursor + len(sub)) / total_chars) * seg_duration + start_ts = float(seg_start) + start_offset + end_ts = float(seg_start) + end_offset + + ts_str = _format_timestamp(start_ts) + + c = Chunk( + content=sub, + file_name=file_name, + file_type="audio", + filepath=path, + timestamp=ts_str + ) + setattr(c, "start_ts", start_ts) + setattr(c, "end_ts", end_ts) + setattr(c, "char_start", None) + setattr(c, "char_end", None) + chunks.append(c) + + char_cursor += len(sub) + return chunks diff --git a/backend/app/ingestion/image_processor.py b/backend/app/ingestion/image_processor.py index 1c38353..b08eab6 100644 --- a/backend/app/ingestion/image_processor.py +++ b/backend/app/ingestion/image_processor.py @@ -5,38 +5,77 @@ from __future__ import annotations import os +import logging from typing import List, Optional from PIL import Image from .base import Chunk from ..image_utils import get_image_size +logger = logging.getLogger(__name__) + def detect_text_in_image(path: str) -> str: - """Extract text from image using OCR (optional pytesseract).""" + """Extract text from image using OCR (optional pytesseract). + + Returns: + Extracted text string, or empty string if OCR is not available or fails. + """ try: import pytesseract + logger.debug(f"Attempting OCR on image: {path}") image = Image.open(path) text = pytesseract.image_to_string(image) - return text.strip() + extracted = text.strip() + if extracted: + logger.debug(f"Successfully extracted {len(extracted)} characters via OCR") + return extracted except ImportError: - # pytesseract not available, return empty + # pytesseract not available - use fallback description + logger.debug("pytesseract not available. Using filename-based description.") return "" - except Exception: + except Exception as e: + # Log but don't fail - continue processing without OCR + logger.warning(f"OCR extraction failed for {path}: {e}") return "" def image_to_embedding(path: str, file_name: str) -> List[Chunk]: - """Process image and create chunk for embedding.""" + """Process image and create chunk for embedding with rich content. + + The chunk content includes: + 1. Image filename (always) + 2. OCR text if available (extracted via Tesseract) + 3. A semantic description hint + + This ensures that both visual (via CLIP embedding) and textual (via OCR) + aspects of the image are indexed and retrievable. 
+ """ try: w, h = get_image_size(path) - except Exception: + except Exception as e: + logger.warning(f"Could not get image dimensions for {path}: {e}") w, h = None, None - # Try OCR if available + # Extract OCR text from image (if pytesseract available) ocr_text = detect_text_in_image(path) + # Build rich content description for the image chunk + content_parts = [f"Image: {file_name}"] + + # Add OCR text as primary content if available + if ocr_text: + content_parts.append(f"OCR Text: {ocr_text}") + else: + # Add dimensional hints for retrieval if no OCR + if w and h: + content_parts.append(f"Dimensions: {w}x{h} pixels") + + # Combine content - this will be embedded by CLIP (image embedding) + # and also used for text search in vector store + full_content = " | ".join(content_parts) + ch = Chunk( - content=f"Image: {file_name}" + (f" | OCR: {ocr_text}" if ocr_text else ""), + content=full_content, file_name=file_name, file_type="image", filepath=path, diff --git a/backend/app/llm/prompts.py b/backend/app/llm/prompts.py index 3e2157e..7d20554 100644 --- a/backend/app/llm/prompts.py +++ b/backend/app/llm/prompts.py @@ -26,7 +26,7 @@ def build_prompt(query: str, sources: List[Dict[str, Any]]) -> str: - """Build a comprehensive prompt with context sources.""" + """Build a comprehensive prompt with context sources, optimized for multimodal content.""" if not sources: return f"USER QUESTION: {query}\n\nAnswer: I don't have any relevant information to answer this question." @@ -35,20 +35,29 @@ def build_prompt(query: str, sources: List[Dict[str, Any]]) -> str: # Build source description source_desc = f"[{i}] " - # Add file information + # Add file information and modality indicator file_name = source.get('file_name', 'Unknown') file_type = source.get('file_type', 'unknown') - source_desc += f"{file_name} ({file_type})" + modality = source.get('modality', file_type) + + source_desc += f"{file_name}" + + # Add modality hint for better LLM context + if file_type == 'image': + source_desc += " (Screenshot/Image)" + elif file_type == 'pdf': + source_desc += " (PDF Document)" + elif file_type == 'audio': + source_desc += " (Audio)" # Add location information if file_type == 'pdf' and source.get('page_number'): source_desc += f" - Page {source['page_number']}" elif file_type == 'audio' and source.get('timestamp'): source_desc += f" - {source['timestamp']}" - elif file_type == 'image': - source_desc += " - Image" # Add content snippet + # For images, we want OCR text to be prominent content = source.get('content', '').strip() if content: # Truncate very long content @@ -56,6 +65,12 @@ def build_prompt(query: str, sources: List[Dict[str, Any]]) -> str: content = content[:500] + "..." source_desc += f": {content}" + # If no content but we have OCR text in metadata, show it + # (for backward compatibility, also check for ocr_text field) + ocr_text = source.get('ocr_text', '') + if ocr_text and not content: + source_desc += f": (OCR) {ocr_text}" + context_lines.append(source_desc) context_sources = "\n".join(context_lines) @@ -88,7 +103,7 @@ def build_simple_prompt(query: str, sources: List[Dict[str, Any]]) -> str: def build_multimodal_prompt(query: str, sources: List[Dict[str, Any]]) -> str: - """Build prompt optimized for multimodal content.""" + """Build prompt optimized for multimodal content with proper image handling.""" if not sources: return f"Question: {query}\nAnswer: I don't have any relevant information." 
@@ -107,31 +122,55 @@ def build_multimodal_prompt(query: str, sources: List[Dict[str, Any]]) -> str: text_sources.append(source) prompt_parts = [] + counter = 1 if text_sources: prompt_parts.append("TEXT SOURCES:") - for i, source in enumerate(text_sources, start=1): + for source in text_sources: content = source.get('content', '').strip() file_name = source.get('file_name', 'Unknown') - prompt_parts.append(f"[{i}] {file_name}: {content}") + if content: + if len(content) > 300: + content = content[:300] + "..." + prompt_parts.append(f"[{counter}] {file_name}: {content}") + counter += 1 if image_sources: - prompt_parts.append("\nIMAGE SOURCES:") - for i, source in enumerate(image_sources, start=len(text_sources) + 1): + prompt_parts.append("\nIMAGE SOURCES (Screenshots/Diagrams):") + for source in image_sources: + # For images, the content field should already contain OCR text (from image_processor.py) content = source.get('content', '').strip() file_name = source.get('file_name', 'Unknown') - prompt_parts.append(f"[{i}] {file_name}: {content}") + + # Fallback to ocr_text field if available and content is just the filename + ocr_text = source.get('ocr_text', '') + if not content or content.startswith('Image:'): + if ocr_text: + content = ocr_text + else: + content = "(Image - describe based on filename and any visible elements)" + + if len(content) > 300: + content = content[:300] + "..." + + # Make image source more distinct for the LLM + prompt_parts.append(f"[{counter}] {file_name} (IMAGE): {content}") + counter += 1 if audio_sources: - prompt_parts.append("\nAUDIO SOURCES:") - for i, source in enumerate(audio_sources, start=len(text_sources) + len(image_sources) + 1): + prompt_parts.append("\nAUDIO SOURCES (Transcripts):") + for source in audio_sources: content = source.get('content', '').strip() file_name = source.get('file_name', 'Unknown') timestamp = source.get('timestamp', '') - if timestamp: - prompt_parts.append(f"[{i}] {file_name} ({timestamp}): {content}") - else: - prompt_parts.append(f"[{i}] {file_name}: {content}") + if content: + if len(content) > 300: + content = content[:300] + "..." + if timestamp: + prompt_parts.append(f"[{counter}] {file_name} ({timestamp}): {content}") + else: + prompt_parts.append(f"[{counter}] {file_name}: {content}") + counter += 1 context = "\n".join(prompt_parts) @@ -141,4 +180,11 @@ def build_multimodal_prompt(query: str, sources: List[Dict[str, Any]]) -> str: Question: {query} -Answer based on the provided sources. Use [1], [2], etc. for citations. If information is missing, say so.""" +Important Instructions: +1. Use ONLY information from the sources [1] through [{counter-1}] +2. For image sources, describe what they show based on the text/OCR content provided +3. Cite sources inline using [1], [2], [3], etc. +4. If information is not in the sources, say "I don't have that information in the provided sources" +5. Be specific and accurate in your answers + +Answer:""" diff --git a/backend/app/main.py b/backend/app/main.py index 35c1289..585fe08 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -116,10 +116,28 @@ async def ingest(file: UploadFile = File(...)): ) if not chunks: - raise HTTPException( - status_code=422, - detail=f"No content could be extracted from {file.filename}. The file may be corrupted or in an unsupported format." - ) + # If the uploaded file is audio, don't fail the whole ingestion when + # transcription cannot be performed (missing ASR/model). 
Instead, + # create a placeholder audio chunk so the file is still indexed and + # can be surfaced to the user with a message about transcription. + audio_exts = ('.mp3', '.wav', '.m4a', '.flac', '.ogg') + if file.filename.lower().endswith(audio_exts): + from .ingestion.base import Chunk + placeholder = Chunk( + content="[Transcription unavailable: install a local ASR model (whisper or faster-whisper) to enable transcripts]", + file_name=file.filename, + file_type="audio", + filepath=dest_path, + timestamp=None + ) + setattr(placeholder, 'start_ts', None) + setattr(placeholder, 'end_ts', None) + chunks = [placeholder] + else: + raise HTTPException( + status_code=422, + detail=f"No content could be extracted from {file.filename}. The file may be corrupted or in an unsupported format." + ) # Generate embeddings and store using new vector store try: @@ -162,14 +180,16 @@ async def ingest(file: UploadFile = File(...)): 'file_name': chunk.file_name, 'file_type': chunk.file_type, 'page_number': chunk.page_number, - 'timestamp': chunk.timestamp, + 'timestamp': chunk.timestamp, + 'start_ts': getattr(chunk, 'start_ts', None), + 'end_ts': getattr(chunk, 'end_ts', None), 'filepath': chunk.filepath, 'width': getattr(chunk, 'width', None), 'height': getattr(chunk, 'height', None), 'bbox': getattr(chunk, 'bbox', None), 'char_start': getattr(chunk, 'char_start', None), 'char_end': getattr(chunk, 'char_end', None), - 'modality': chunk.file_type + 'modality': chunk.file_type } }) except Exception as e: @@ -193,6 +213,8 @@ async def ingest(file: UploadFile = File(...)): 'file_type': chunk.file_type, 'page_number': chunk.page_number, 'timestamp': chunk.timestamp, + 'start_ts': getattr(chunk, 'start_ts', None), + 'end_ts': getattr(chunk, 'end_ts', None), 'filepath': chunk.filepath, 'width': getattr(chunk, 'width', None), 'height': getattr(chunk, 'height', None), @@ -270,16 +292,34 @@ def query(payload: dict): file_type = r.get("modality", r.get("file_type", "text")) page_num = r.get("page_number") timestamp = r.get("timestamp") - + start_ts = r.get("start_ts") + end_ts = r.get("end_ts") + # Build URL efficiently url = None if file_name: base_path = f"/files/{file_name}" if file_type == "pdf" and page_num is not None: url = f"{base_path}#page={page_num}" - elif file_type == "audio" and timestamp: - # URL-safe timestamp encoding - url = f"{base_path}#t={timestamp.replace(':', '')}" + elif file_type == "audio": + # Prefer numeric start_ts if available, otherwise fall back to timestamp string + ts_str = None + if start_ts is not None: + # format seconds to HH:MM:SS + try: + s = int(start_ts) + h = s // 3600 + m = (s % 3600) // 60 + sec = s % 60 + ts_str = f"{h:02d}:{m:02d}:{sec:02d}" if h > 0 else f"{m:02d}:{sec:02d}" + except Exception: + ts_str = None + if not ts_str and timestamp: + ts_str = timestamp + if ts_str: + url = f"{base_path}#timestamp={ts_str}" + else: + url = base_path elif file_type == "image": url = base_path else: diff --git a/backend/app/rag.py b/backend/app/rag.py index 78b769e..713a45a 100644 --- a/backend/app/rag.py +++ b/backend/app/rag.py @@ -42,7 +42,7 @@ def similarity_search(query: str, k: int) -> List[Dict]: cur = conn.cursor() placeholders = ",".join(["?"] * len(ids)) cur.execute( - f"SELECT vector_id, content, file_name, file_type, page_number, timestamp, filepath FROM vectors WHERE vector_id IN ({placeholders})", + f"SELECT vector_id, page_content, file_name, file_type, page_number, timestamp, filepath FROM vectors WHERE vector_id IN ({placeholders})", ids, ) rows = cur.fetchall() 
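One operational note on the content to page_content rename: CREATE TABLE IF NOT EXISTS will not alter a vectors table created before this change, so databases from older installs still carry the legacy column and the updated SELECT statements would fail. A one-off migration along these lines should cover it; the database path below is an assumption, point it at whatever SQLite file connect_db() actually opens:

# Hypothetical migration: rename the legacy "content" column to "page_content".
import sqlite3

DB_PATH = "backend/storage/vectors.db"  # assumed location, adjust to the real store

conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cols = [row[1] for row in cur.execute("PRAGMA table_info(vectors)")]
if "content" in cols and "page_content" not in cols:
    # RENAME COLUMN needs SQLite 3.25+; on older versions recreate the table instead.
    cur.execute("ALTER TABLE vectors RENAME COLUMN content TO page_content")
    conn.commit()
conn.close()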
@@ -56,7 +56,7 @@ def similarity_search(query: str, k: int) -> List[Dict]: results.append( { "vector_id": r[0], - "content": r[1], + "page_content": r[1], "file_name": r[2], "file_type": r[3], "page_number": r[4], @@ -77,7 +77,7 @@ def build_prompt(query: str, sources: List[Dict]) -> str: where += f" page {s['page_number']}" if s.get("file_type") == "audio" and s.get("timestamp"): where += f" {s['timestamp']}" - snippet = (s.get("content") or "").replace("\n", " ") + snippet = (s.get("page_content") or "").replace("\n", " ") lines.append(f"Source {marker} {where}: \"{snippet}\"") lines.append("\nAnswer the user query using only the information from sources [1..k]. Provide citations inline like [1], [2]. If the answer is unknown from sources, say you don't know.") lines.append(f"\nUser query: {query}\nAnswer:") @@ -97,7 +97,7 @@ def answer_query(cfg_path: str, query: str) -> dict: { "id": i, "file_name": s.get("file_name"), - "snippet": s.get("content"), + "snippet": s.get("page_content"), "page_number": s.get("page_number"), "timestamp": s.get("timestamp"), "score": s.get("score"), diff --git a/backend/app/vector_store/faiss_store.py b/backend/app/vector_store/faiss_store.py index d2549b3..4157b78 100644 --- a/backend/app/vector_store/faiss_store.py +++ b/backend/app/vector_store/faiss_store.py @@ -121,11 +121,13 @@ def upsert(self, items: List[Dict[str, Any]]) -> int: vector_id = start_id + i self.metadata[str(vector_id)] = { 'vector_id': vector_id, - 'content': metadata.get('content', ''), + 'page_content': metadata.get('page_content', ''), 'file_name': metadata.get('file_name', ''), 'file_type': metadata.get('file_type', ''), 'page_number': metadata.get('page_number'), 'timestamp': metadata.get('timestamp'), + 'start_ts': metadata.get('start_ts'), + 'end_ts': metadata.get('end_ts'), 'filepath': metadata.get('filepath'), 'width': metadata.get('width'), 'height': metadata.get('height'), @@ -217,9 +219,27 @@ def rebuild_from_metadata(self): # This would require re-computing embeddings from original content # For now, just reinitialize with current dimension self.index = faiss.IndexFlatIP(self.dimension) + + # Get all embeddings and IDs from metadata + embeddings = [] + ids = [] + for vector_id, meta in self.metadata.items(): + # Assuming 'page_content' holds the text for embedding + text_to_embed = meta.get('page_content', '') + if text_to_embed: + # Re-create embedding. This requires access to an embedding function. + # This is a placeholder for where you'd call your embedding function. 
+ # from ..embeddings import embed_text + # embedding = embed_text(text_to_embed) + # embeddings.append(embedding) + # ids.append(int(vector_id)) + pass # Placeholder for re-embedding logic + + # if embeddings: + # self.index.add_with_ids(np.array(embeddings), np.array(ids)) + self._persist_index() - # Global store instance for backward compatibility _store_instance: Optional[FAISSStore] = None diff --git a/backend/requirements.txt b/backend/requirements.txt index 367ea45..a24fbfb 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -8,9 +8,9 @@ sentence-transformers==3.0.1 PyMuPDF==1.24.10 python-docx==1.1.2 Pillow==10.4.0 -torch==2.4.1 +# torch==2.4.1 open-clip-torch==2.26.1 -torchaudio==2.4.1 +# torchaudio==2.4.1 rich==13.8.1 aiohttp==3.10.5 @@ -32,3 +32,7 @@ pytest==7.4.3 pytest-asyncio==0.21.1 + +# Audio Transcription +openai-whisper==20231117 +ffmpeg-python==0.2.0 diff --git a/backend/scripts/test_audio_pipeline.py b/backend/scripts/test_audio_pipeline.py new file mode 100644 index 0000000..dee0b73 --- /dev/null +++ b/backend/scripts/test_audio_pipeline.py @@ -0,0 +1,85 @@ +"""End-to-end test script for the audio ingestion pipeline. + +This script is intended to be runnable locally. It: + - Creates a short synthetic WAV file in storage + - Calls the ingestion/transcription pipeline directly + - Embeds transcript segments using the unified embed_text() + - Indexes embeddings in FAISS + - Performs a simple search to verify vectors are searchable + +The script is tolerant of missing ASR backends: if no local ASR is available, +the pipeline will insert a placeholder chunk and indexing will still succeed. +""" + +import os +import wave +import struct +import tempfile +import numpy as np + +from backend.app.ingestion.audio_transcriber import transcribe_audio +from backend.app.embeddings.generate import embed_text +from backend.app.vector_store.faiss_store import get_store, upsert, search + + +def generate_silent_wav(path: str, duration_sec: float = 1.0, rate: int = 16000): + n_samples = int(duration_sec * rate) + amplitude = 0 + with wave.open(path, 'w') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(rate) + frames = (struct.pack(' { - if (typeof window === "undefined") return false; - const saved = localStorage.getItem("theme-dark"); - return saved ? JSON.parse(saved) : false; - }); + const [isDarkMode, setIsDarkMode] = useState(true); + + // Static chat history for demo + const [chatHistory] = useState([ + { id: 1, title: "What is RAG?", date: "Today" }, + { id: 2, title: "Document analysis help", date: "Yesterday" }, + { id: 3, title: "Audio transcription query", date: "Nov 20" }, + ]); // Monitor online/offline status useEffect(() => { @@ -35,7 +30,7 @@ function App() { ...prev, { role: "system", - content: "βœ… Connection restored. You're back online!", + content: "Connection restored. You're back online!", }, ]); }; @@ -46,7 +41,8 @@ function App() { ...prev, { role: "system", - content: "πŸ“΄ You're offline. Don't worry, you can still query your uploaded documents!", + content: + "You're offline. 
Don't worry, you can still query your uploaded documents!", }, ]); }; @@ -60,19 +56,7 @@ function App() { }; }, []); - useEffect(() => { - document.documentElement.classList.toggle("dark", dark); - localStorage.setItem("theme-dark", JSON.stringify(dark)); - }, [dark]); - // Welcome message on first load - useEffect(() => { - if (messages.length === 0) { - setMessages([ - - ]); - } - }, []); const handleSend = async (text) => { setMessages((prev) => [...prev, { role: "user", content: text }]); @@ -88,7 +72,7 @@ function App() { if (!res.ok) throw new Error("Query failed"); const data = await res.json(); - + setMessages((prev) => [ ...prev, { @@ -110,176 +94,368 @@ function App() { { role: "assistant", content: isOnline - ? "❌ Sorry, I couldn't connect to the server. Please check if the backend is running." - : "πŸ“΄ You're offline. Make sure your documents are already indexed to query them.", + ? "Sorry, I couldn't connect to the server. Please check if the backend is running." + : "You're offline. Make sure your documents are already indexed to query them.", }, ]); } setIsTyping(false); }; - const doSearch = async (query) => { - if (!query || !query.trim()) { - setResults([]); - return; - } - try { - const res = await fetch("http://localhost:8000/search/similarity", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ query, k: 8 }), - }); - const data = await res.json(); - setResults(data.results || []); - } catch (e) { - console.error("Search failed:", e); - } - }; - const handleUploadComplete = (data) => { setIndexedDocs((prev) => prev + 1); setMessages((prev) => [ ...prev, { role: "system", - content: `βœ… Successfully indexed: ${data.file || "document"}\nπŸ“Š Vectors indexed: ${data.vectors_indexed || 0}`, + content: `Successfully indexed: ${ + data.file || "document" + }\nVectors indexed: ${data.vectors_indexed || 0}`, }, ]); }; - return ( -
[App.jsx render block: the JSX markup in this hunk is not recoverable; only diff markers, comments, and text fragments survive. Removed layout (per surviving fragments): the offline banner ("Offline Mode Active - You can query already indexed documents"), duplicate mobile-navigation buttons ("Upload Documents", "Retrieved Context"), a left Library/Context panel (ChatGPT-style sidebar), a chat header ("Assistant", "Multimodal RAG - Online/Offline mode"), a sidebar toggle button, and the source modal. Added layout: a mobile navigation bar; a desktop sidebar with a static "Chat History" list (chat title and date), a "Database Stats" block showing the indexed-document count, and a user area; a main content area with a top navbar ("Local AI Assistant" plus an Online/Offline indicator) and the chat area; and a right-hand context sidebar rendered only when search results exist.]
-); + {/* Right Sidebar (Context) - Desktop only */} + {results.length > 0 && ( + + )} } -export default App; \ No newline at end of file +export default App; diff --git a/frontend/src/ChatUI.jsx b/frontend/src/ChatUI.jsx index 0b6f365..b7fab4b 100644 --- a/frontend/src/ChatUI.jsx +++ b/frontend/src/ChatUI.jsx @@ -1,10 +1,11 @@ import { useEffect, useRef, useState } from "react"; +import { Mic } from "lucide-react"; export default function ChatUI({ messages, onSend, isTyping = false }) { const [input, setInput] = useState(""); const scrollRef = useRef(null); - // Simplified submit handler for text-only input + // Submit handler const submit = (e) => { e.preventDefault(); if (!input.trim()) return; @@ -12,6 +13,7 @@ export default function ChatUI({ messages, onSend, isTyping = false }) { setInput(""); }; + // Auto-scroll to bottom on new messages useEffect(() => { if (scrollRef.current) { scrollRef.current.scrollTop = scrollRef.current.scrollHeight; @@ -20,63 +22,85 @@ export default function ChatUI({ messages, onSend, isTyping = false }) { const Message = ({ role, content }) => { const isUser = role === "user"; + const isSystem = role === "system"; + + if (isSystem) { + return ( +
[ChatUI.jsx render markup: the JSX in the remainder of this diff is not recoverable. Surviving fragments indicate: a dedicated system-message style; restyled user ("U") and assistant (icon) avatars on the message bubbles; a redesigned typing indicator; an empty-state panel ("How can I help you today?", "Chat with your documents"); removal of the old "AI"/"You" avatar rendering; and a restyled bottom input area, where the newly imported Mic icon suggests a microphone button alongside the text input.]
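Finally, since the removed doSearch helper in App.jsx was the only visible caller of POST /search/similarity, a backend-side smoke test is a handy stand-in when verifying the renamed fields end to end. A sketch, with the response field names assumed from rag.py's similarity_search:

# Illustrative smoke test against a locally running backend on port 8000.
# The route and {"query", "k"} payload mirror the removed frontend helper;
# the result field names are assumptions based on rag.py.
import requests

resp = requests.post(
    "http://localhost:8000/search/similarity",
    json={"query": "release date decision", "k": 8},
    timeout=30,
)
resp.raise_for_status()
for hit in resp.json().get("results", []):
    print(hit.get("file_name"), hit.get("page_number"), hit.get("timestamp"))
    print("  ", (hit.get("page_content") or "")[:80])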