diff --git a/context7.json b/context7.json new file mode 100644 index 0000000..626d0e7 --- /dev/null +++ b/context7.json @@ -0,0 +1,55 @@ +{ + "$schema": "https://context7.com/schema.json", + "name": "CodeRAG", + "description": "Lightning-fast semantic code search with AST chunking - RAG-ready for AI assistants", + "repository": "https://github.com/SylphxAI/coderag", + "documentation": "https://coderag.sylphx.com", + "npm": "@sylphx/coderag", + "rules": [ + "Always use PersistentStorage for production - it provides SQLite persistence and instant startup", + "Use async parseAsync() not sync parse() - WASM parsers require async initialization", + "Import from subpaths when possible: '@sylphx/coderag/storage', '@sylphx/coderag/tfidf'", + "Set lowMemoryMode: true for codebases over 10,000 files to use SQL-based search", + "Use hybridSearch() for best results when OPENAI_API_KEY is set, otherwise use indexer.search()", + "File watching requires await indexer.index({ watch: true }) - don't forget await", + "SearchResult.snippet contains formatted code with line numbers, chunkType indicates the AST node type", + "StarCoder2 tokenizer downloads on first use (~4.7MB) - pre-initialize with initializeTokenizer()", + "Synth parsers are optional dependencies - they install automatically when needed" + ], + "folders": [ + { + "path": "docs", + "description": "VitePress documentation site" + }, + { + "path": "packages/core/src", + "description": "Core library source code" + }, + { + "path": "packages/mcp-server/src", + "description": "MCP server implementation" + } + ], + "excludeFolders": [ + "node_modules", + "dist", + ".vitepress/dist", + ".vitepress/cache", + ".turbo", + ".git" + ], + "examples": [ + { + "title": "Basic Usage", + "code": "import { CodebaseIndexer, PersistentStorage } from '@sylphx/coderag'\n\nconst storage = new PersistentStorage({ codebaseRoot: '.' 
})\nconst indexer = new CodebaseIndexer({ codebaseRoot: '.', storage })\n\nawait indexer.index()\nconst results = await indexer.search('authentication')" + }, + { + "title": "Hybrid Search", + "code": "import { CodebaseIndexer, PersistentStorage, createEmbeddingProvider, hybridSearch } from '@sylphx/coderag'\n\nconst embeddingProvider = await createEmbeddingProvider({ provider: 'openai' })\nconst storage = new PersistentStorage({ codebaseRoot: '.' })\nconst indexer = new CodebaseIndexer({ codebaseRoot: '.', storage, embeddingProvider })\n\nawait indexer.index()\nconst results = await hybridSearch('user login flow', indexer, { vectorWeight: 0.7 })" + }, + { + "title": "MCP Server", + "code": "npx @sylphx/coderag-mcp --root=/path/to/project" + } + ] +} diff --git a/docs/.vitepress/config.ts b/docs/.vitepress/config.ts index d4bf8fe..354b84a 100644 --- a/docs/.vitepress/config.ts +++ b/docs/.vitepress/config.ts @@ -1,19 +1,78 @@ import { defineConfig } from 'vitepress' +const title = 'CodeRAG' +const description = + 'Lightning-fast semantic code search with AST chunking - RAG-ready for AI assistants' +const url = 'https://coderag.sylphx.com' +const ogImage = `${url}/og-image.png` + export default defineConfig({ - title: 'CodeRAG', - description: 'Lightning-fast hybrid code search (TF-IDF + Vector) - RAG-ready for AI assistants', + title, + description, base: '/', + cleanUrls: true, ignoreDeadLinks: true, + lastUpdated: true, + + head: [ + // Favicon + ['link', { rel: 'icon', type: 'image/svg+xml', href: '/logo.svg' }], + ['link', { rel: 'icon', type: 'image/png', href: '/favicon.png' }], + + // SEO + ['meta', { name: 'theme-color', content: '#6366f1' }], + ['meta', { name: 'author', content: 'Sylphx' }], + [ + 'meta', + { + name: 'keywords', + content: + 'code search, RAG, retrieval augmented generation, TF-IDF, BM25, vector search, embeddings, AST, semantic search, MCP, AI assistant', + }, + ], + + // Open Graph + ['meta', { property: 'og:type', content: 'website' 
}], + ['meta', { property: 'og:title', content: title }], + ['meta', { property: 'og:description', content: description }], + ['meta', { property: 'og:url', content: url }], + ['meta', { property: 'og:image', content: ogImage }], + ['meta', { property: 'og:image:width', content: '1200' }], + ['meta', { property: 'og:image:height', content: '630' }], + ['meta', { property: 'og:site_name', content: 'CodeRAG' }], + ['meta', { property: 'og:locale', content: 'en_US' }], + + // Twitter Card + ['meta', { name: 'twitter:card', content: 'summary_large_image' }], + ['meta', { name: 'twitter:title', content: title }], + ['meta', { name: 'twitter:description', content: description }], + ['meta', { name: 'twitter:image', content: ogImage }], + ['meta', { name: 'twitter:site', content: '@SylphxAI' }], + + // Canonical + ['link', { rel: 'canonical', href: url }], + ], + + sitemap: { + hostname: url, + }, themeConfig: { logo: '/logo.svg', + siteTitle: 'CodeRAG', nav: [ { text: 'Guide', link: '/guide/getting-started' }, - { text: 'API', link: '/api/core' }, + { text: 'API', link: '/api/overview' }, { text: 'MCP Server', link: '/mcp/overview' }, - { text: 'GitHub', link: 'https://github.com/sylphlab/coderag' }, + { + text: 'Resources', + items: [ + { text: 'GitHub', link: 'https://github.com/SylphxAI/coderag' }, + { text: 'npm', link: 'https://www.npmjs.com/package/@sylphx/coderag' }, + { text: 'Changelog', link: 'https://github.com/SylphxAI/coderag/releases' }, + ], + }, ], sidebar: { @@ -21,7 +80,7 @@ export default defineConfig({ { text: 'Introduction', items: [ - { text: 'Getting Started', link: '/guide/getting-started' }, + { text: 'What is CodeRAG?', link: '/guide/getting-started' }, { text: 'Installation', link: '/guide/installation' }, { text: 'Quick Start', link: '/guide/quick-start' }, ], @@ -29,17 +88,19 @@ export default defineConfig({ { text: 'Core Concepts', items: [ - { text: 'TF-IDF Search', link: '/guide/tfidf' }, + { text: 'How Search Works', link: 
'/guide/how-search-works' }, + { text: 'AST Chunking', link: '/guide/ast-chunking' }, + { text: 'TF-IDF & BM25', link: '/guide/tfidf' }, { text: 'Vector Search', link: '/guide/vector-search' }, { text: 'Hybrid Search', link: '/guide/hybrid-search' }, - { text: 'Code Tokenization', link: '/guide/tokenization' }, ], }, { - text: 'Configuration', + text: 'Advanced', items: [ - { text: 'Embedding Providers', link: '/guide/providers' }, - { text: 'Custom Providers', link: '/guide/custom-providers' }, + { text: 'Persistent Storage', link: '/guide/storage' }, + { text: 'File Watching', link: '/guide/file-watching' }, + { text: 'Language Support', link: '/guide/languages' }, { text: 'Performance Tuning', link: '/guide/performance' }, ], }, @@ -48,10 +109,13 @@ export default defineConfig({ { text: 'API Reference', items: [ - { text: 'Core Package', link: '/api/core' }, - { text: 'Embeddings', link: '/api/embeddings' }, - { text: 'Search', link: '/api/search' }, - { text: 'Storage', link: '/api/storage' }, + { text: 'Overview', link: '/api/overview' }, + { text: 'CodebaseIndexer', link: '/api/indexer' }, + { text: 'PersistentStorage', link: '/api/storage' }, + { text: 'Search Functions', link: '/api/search' }, + { text: 'Embedding Providers', link: '/api/embeddings' }, + { text: 'AST Chunking', link: '/api/chunking' }, + { text: 'Types', link: '/api/types' }, ], }, ], @@ -61,22 +125,45 @@ export default defineConfig({ items: [ { text: 'Overview', link: '/mcp/overview' }, { text: 'Installation', link: '/mcp/installation' }, - { text: 'Tools', link: '/mcp/tools' }, { text: 'Configuration', link: '/mcp/configuration' }, + { text: 'Tools Reference', link: '/mcp/tools' }, + { text: 'IDE Integration', link: '/mcp/ide-integration' }, ], }, ], }, - socialLinks: [{ icon: 'github', link: 'https://github.com/sylphlab/coderag' }], + socialLinks: [ + { icon: 'github', link: 'https://github.com/SylphxAI/coderag' }, + { icon: 'npm', link: 'https://www.npmjs.com/package/@sylphx/coderag' }, 
+ ], footer: { message: 'Released under the MIT License.', - copyright: 'Copyright © 2024 SylphLab', + copyright: 'Copyright © 2024 Sylphx', }, search: { provider: 'local', + options: { + detailedView: true, + }, + }, + + editLink: { + pattern: 'https://github.com/SylphxAI/coderag/edit/main/docs/:path', + text: 'Edit this page on GitHub', + }, + + outline: { + level: [2, 3], + }, + + lastUpdated: { + text: 'Last updated', + formatOptions: { + dateStyle: 'medium', + }, }, }, diff --git a/docs/api/chunking.md b/docs/api/chunking.md new file mode 100644 index 0000000..6df21e3 --- /dev/null +++ b/docs/api/chunking.md @@ -0,0 +1,499 @@ +# AST Chunking + +CodeRAG uses AST-based chunking to split code at semantic boundaries (functions, classes, etc.) using the Synth parser library. + +## chunkCodeByAST() + +Split code into semantic chunks using AST analysis. + +```typescript +async function chunkCodeByAST( + code: string, + filePath: string, + options?: ASTChunkOptions +): Promise<ChunkResult[]> +``` + +### Parameters + +**code** `string` - Source code to chunk + +**filePath** `string` - File path (used for language detection) + +**options** `ASTChunkOptions` (optional) + +```typescript +interface ASTChunkOptions { + maxChunkSize?: number // Max chunk chars (default: 1000) + minChunkSize?: number // Min chunk chars (default: 100) + preserveContext?: boolean // Include imports/types (default: true) + nodeTypes?: string[] // Custom AST node types to chunk + parseEmbedded?: boolean // Parse code in markdown (default: true) +} +``` + +### Returns + +`Promise<ChunkResult[]>` - Array of semantic chunks + +```typescript +interface ChunkResult { + content: string // Chunk content + type: string // AST node type + startLine: number // Start line (1-indexed) + endLine: number // End line (inclusive) + metadata: Record<string, unknown> // Additional metadata +} +``` + +### Example + +```typescript +import { chunkCodeByAST } from '@sylphx/coderag' + +const code = ` +import { z } from 'zod' + +export function validateUser(user: 
unknown) { + const schema = z.object({ + name: z.string(), + email: z.string().email() + }) + return schema.parse(user) +} + +export class UserService { + async createUser(data: unknown) { + const validated = validateUser(data) + return this.db.users.create(validated) + } +} +` + +const chunks = await chunkCodeByAST(code, 'user-service.ts') + +for (const chunk of chunks) { + console.log(`Type: ${chunk.type}`) + console.log(`Lines: ${chunk.startLine}-${chunk.endLine}`) + console.log(chunk.content) + console.log('---') +} + +// Output: +// Type: FunctionDeclaration +// Lines: 3-10 +// import { z } from 'zod' +// +// export function validateUser(user: unknown) { ... } +// --- +// Type: ClassDeclaration +// Lines: 12-17 +// import { z } from 'zod' +// +// export class UserService { ... } +``` + +## chunkCodeByASTSimple() + +Simplified wrapper that returns chunk content only. + +```typescript +async function chunkCodeByASTSimple( + code: string, + filePath: string, + options?: ASTChunkOptions +): Promise +``` + +### Returns + +`Promise` - Array of chunk content strings + +### Example + +```typescript +const chunks = await chunkCodeByASTSimple(code, 'example.ts') +// Returns: ['function a() { }', 'class B { }'] +``` + +## getSupportedLanguages() + +Get list of supported languages for AST chunking. + +```typescript +function getSupportedLanguages(): string[] +``` + +### Returns + +`string[]` - Array of supported language names + +### Example + +```typescript +import { getSupportedLanguages } from '@sylphx/coderag' + +const languages = getSupportedLanguages() +console.log(languages) +// ['javascript', 'typescript', 'python', 'rust', 'go', ...] 
+``` + +## Supported Languages + +CodeRAG supports AST chunking for the following languages: + +**JavaScript/TypeScript** +- Extensions: `.js`, `.mjs`, `.cjs`, `.jsx`, `.ts`, `.tsx`, `.mts`, `.cts` +- Boundaries: Functions, classes, methods, arrow functions +- Context: Imports, type definitions + +**Python** +- Extensions: `.py` +- Boundaries: Functions, classes, methods +- Context: Imports, class definitions + +**Rust** +- Extensions: `.rs` +- Boundaries: Functions, structs, impls, traits +- Context: Use statements, type definitions + +**Go** +- Extensions: `.go` +- Boundaries: Functions, methods, structs +- Context: Imports, type declarations + +**Java** +- Extensions: `.java` +- Boundaries: Classes, methods, constructors +- Context: Imports, class declarations + +**C/C++** +- Extensions: `.c`, `.h`, `.cpp`, `.hpp`, `.cc`, `.hh` +- Boundaries: Functions, structs, classes +- Context: Includes, type definitions + +**Ruby** +- Extensions: `.rb` +- Boundaries: Modules, classes, methods +- Context: Requires, module definitions + +**PHP** +- Extensions: `.php` +- Boundaries: Classes, functions, methods +- Context: Namespace, use statements + +**Markdown** +- Extensions: `.md`, `.mdx` +- Boundaries: Headings, code blocks +- Embedded: Recursively parses code blocks + +**Markup/Data** +- HTML: `.html`, `.htm` +- XML: `.xml` +- JSON: `.json` +- YAML: `.yaml`, `.yml` +- TOML: `.toml` + +## Chunking Behavior + +### Semantic Boundaries + +Code is split at meaningful boundaries based on AST structure: + +```typescript +const code = ` +function greet() { + console.log('hello') +} + +function farewell() { + console.log('goodbye') +} +` + +const chunks = await chunkCodeByAST(code, 'example.js') +// Returns 2 chunks (one per function) +``` + +### Context Preservation + +By default, context (imports, types) is prepended to each chunk: + +```typescript +const code = ` +import { User } from './types' + +export function createUser(data: User) { + return { ...data, id: generateId() } 
+} + +export function deleteUser(id: string) { + return db.users.delete(id) +} +` + +const chunks = await chunkCodeByAST(code, 'user.ts', { + preserveContext: true // Default +}) + +// Each chunk includes: import { User } from './types' +``` + +Disable context preservation: + +```typescript +const chunks = await chunkCodeByAST(code, 'user.ts', { + preserveContext: false +}) +// Chunks contain only the function code +``` + +### Large Chunk Handling + +Chunks exceeding `maxChunkSize` are recursively split: + +```typescript +const code = ` +export function processLargeData(items: Item[]) { + // 2000 lines of code... +} +` + +const chunks = await chunkCodeByAST(code, 'process.ts', { + maxChunkSize: 1000 +}) +// Returns multiple sub-chunks from the function body +``` + +### Small Chunk Merging + +Small non-semantic chunks are merged to reach `minChunkSize`: + +```typescript +const code = ` +const a = 1 +const b = 2 +const c = 3 + +function main() { } +` + +const chunks = await chunkCodeByAST(code, 'example.js', { + minChunkSize: 50 +}) +// Constants merged into single chunk, function as separate chunk +``` + +### Fallback Behavior + +If AST parsing fails, falls back to character chunking: + +```typescript +const invalidCode = ` +function incomplete( +` + +const chunks = await chunkCodeByAST(invalidCode, 'broken.js') +// Returns character-based chunks with metadata.fallback = true +``` + +## Embedded Code Parsing + +Markdown code blocks are recursively parsed: + +```typescript +const markdown = ` +# Example + +\`\`\`typescript +function hello() { + console.log('hi') +} +\`\`\` +` + +const chunks = await chunkCodeByAST(markdown, 'example.md', { + parseEmbedded: true // Default +}) + +// Returns chunk with: +// - type: 'FunctionDeclaration' +// - metadata.embeddedIn: 'CodeBlock' +// - metadata.embeddedLanguage: 'typescript' +``` + +Disable embedded parsing: + +```typescript +const chunks = await chunkCodeByAST(markdown, 'example.md', { + parseEmbedded: false +}) +// 
Returns code block as-is without parsing +``` + +## Custom Node Types + +Override default semantic boundaries: + +```typescript +const code = ` +const config = { + host: 'localhost', + port: 3000 +} + +function start() { } +` + +const chunks = await chunkCodeByAST(code, 'config.js', { + nodeTypes: ['VariableDeclaration', 'FunctionDeclaration'] +}) +// Chunks both config and function separately +``` + +## Chunk Metadata + +Each chunk includes metadata about its origin: + +```typescript +const chunks = await chunkCodeByAST(code, 'example.ts') + +const chunk = chunks[0] +console.log(chunk.metadata) + +// Common metadata: +// - name: Function/class name (if available) +// - async: true/false for async functions +// - export: true/false for exported symbols +// - fallback: true if character chunking was used +// - split: true if chunk was split from larger node +// - merged: true if merged from small chunks +// - embeddedIn: Parent node type for embedded code +// - embeddedLanguage: Language of embedded code +``` + +## Performance + +### Parsing Speed + +Approximate parsing speeds: +- JavaScript/TypeScript: ~5000 lines/sec +- Python: ~3000 lines/sec +- Rust/Go/Java/C++: ~2000 lines/sec (WASM-based) +- Markdown: ~10000 lines/sec + +### Memory Usage + +- In-memory AST: ~1KB per 100 lines of code +- Chunk overhead: ~100 bytes per chunk +- Total: ~50-100MB for large codebases + +## Error Handling + +### Invalid Syntax + +```typescript +try { + const chunks = await chunkCodeByAST(invalidCode, 'broken.js') + // Falls back to character chunking + if (chunks[0].metadata.fallback) { + console.log('AST parsing failed, using fallback') + } +} catch (error) { + console.error('Chunking failed:', error) +} +``` + +### Unknown Language + +```typescript +const chunks = await chunkCodeByAST(code, 'unknown.xyz') +// Logs: [WARN] Unknown language, falling back to character chunking +// Returns character-based chunks +``` + +## Best Practices + +**Choose appropriate chunk sizes:** 
+```typescript +// For embeddings (typical token limit: 8192) +const chunks = await chunkCodeByAST(code, file, { + maxChunkSize: 1000, // ~500 tokens + minChunkSize: 100 +}) + +// For LLM context (larger is better) +const chunks = await chunkCodeByAST(code, file, { + maxChunkSize: 2000, + minChunkSize: 200 +}) +``` + +**Preserve context for better search:** +```typescript +const chunks = await chunkCodeByAST(code, file, { + preserveContext: true // Include imports/types +}) +``` + +**Enable embedded parsing for documentation:** +```typescript +const chunks = await chunkCodeByAST(markdown, 'README.md', { + parseEmbedded: true // Parse code examples +}) +``` + +**Handle fallback gracefully:** +```typescript +const chunks = await chunkCodeByAST(code, file) +const validChunks = chunks.filter(c => !c.metadata.fallback) +``` + +## Integration with Indexer + +CodebaseIndexer automatically uses AST chunking: + +```typescript +import { CodebaseIndexer } from '@sylphx/coderag' + +const indexer = new CodebaseIndexer({ + codebaseRoot: './src' +}) + +await indexer.index() +// Automatically chunks all files using chunkCodeByAST() +``` + +Search results include chunk metadata: + +```typescript +const results = await indexer.search('authentication') + +for (const result of results) { + console.log(`${result.chunkType} at ${result.path}:${result.startLine}`) + // FunctionDeclaration at src/auth.ts:42 +} +``` + +## Language Configuration + +View language-specific settings: + +```typescript +import { getLanguageConfig } from '@sylphx/coderag' + +const config = getLanguageConfig('typescript') +console.log(config) + +// { +// parser: '@sylphx/synth-js', +// boundaries: ['FunctionDeclaration', 'ClassDeclaration', ...], +// contextTypes: ['ImportDeclaration', 'TypeAlias', ...], +// embedded: [{ nodeType: 'CodeBlock', ... 
}] +// } +``` + +## Related + +- [CodebaseIndexer](./indexer.md) +- [Types](./types.md) diff --git a/docs/api/embeddings.md b/docs/api/embeddings.md new file mode 100644 index 0000000..d4c5f9f --- /dev/null +++ b/docs/api/embeddings.md @@ -0,0 +1,437 @@ +# Embedding Providers + +CodeRAG supports multiple embedding providers for semantic vector search. Providers use the Vercel AI SDK with OpenAI-compatible APIs. + +## createEmbeddingProvider() + +Factory function to create embedding providers. + +```typescript +function createEmbeddingProvider(config: EmbeddingConfig): EmbeddingProvider +``` + +### Parameters + +**config** `EmbeddingConfig` + +```typescript +interface EmbeddingConfig { + provider: 'openai' | 'openai-compatible' | 'mock' + model: string // Model name + dimensions: number // Vector dimensions + apiKey?: string // API key (or use OPENAI_API_KEY env var) + baseURL?: string // Custom endpoint for OpenAI-compatible APIs + batchSize?: number // Embedding batch size (default: 10) +} +``` + +### Returns + +`EmbeddingProvider` instance + +```typescript +interface EmbeddingProvider { + name: string + model: string + dimensions: number + generateEmbedding(text: string): Promise + generateEmbeddings(texts: string[]): Promise +} +``` + +### Example + +```typescript +import { createEmbeddingProvider } from '@sylphx/coderag' + +const provider = createEmbeddingProvider({ + provider: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + apiKey: process.env.OPENAI_API_KEY +}) + +// Generate single embedding +const embedding = await provider.generateEmbedding('function hello() {}') +console.log(embedding.length) // 1536 + +// Generate batch +const embeddings = await provider.generateEmbeddings([ + 'function a() {}', + 'class B {}' +]) +console.log(embeddings.length) // 2 +``` + +## OpenAI Provider + +Uses OpenAI's embedding models. 
+ +### Supported Models + +**text-embedding-3-small** (recommended) +- Dimensions: 1536 +- Performance: Fast +- Cost: Low +- Quality: Good for code + +**text-embedding-3-large** +- Dimensions: 3072 +- Performance: Slower +- Cost: Higher +- Quality: Better semantic understanding + +**text-embedding-ada-002** (legacy) +- Dimensions: 1536 +- Performance: Fast +- Cost: Low +- Quality: Good + +### Example + +```typescript +const provider = createEmbeddingProvider({ + provider: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + apiKey: process.env.OPENAI_API_KEY +}) +``` + +### Environment Variables + +```bash +OPENAI_API_KEY=sk-... +EMBEDDING_MODEL=text-embedding-3-small # Optional +EMBEDDING_DIMENSIONS=1536 # Optional +``` + +## OpenAI-Compatible Provider + +Use OpenAI-compatible endpoints (OpenRouter, Together AI, etc.). + +### Example + +```typescript +// OpenRouter +const provider = createEmbeddingProvider({ + provider: 'openai-compatible', + model: 'openai/text-embedding-3-small', + dimensions: 1536, + apiKey: process.env.OPENROUTER_API_KEY, + baseURL: 'https://openrouter.ai/api/v1' +}) + +// Together AI +const provider = createEmbeddingProvider({ + provider: 'openai-compatible', + model: 'togethercomputer/m2-bert-80M-8k-retrieval', + dimensions: 768, + apiKey: process.env.TOGETHER_API_KEY, + baseURL: 'https://api.together.xyz/v1' +}) +``` + +### Environment Variables + +```bash +OPENAI_API_KEY=your-key +OPENAI_BASE_URL=https://custom-endpoint/v1 +EMBEDDING_MODEL=custom-model-name +EMBEDDING_DIMENSIONS=768 +``` + +## Mock Provider + +Deterministic hash-based embeddings for testing (no API calls). 
+ +### Example + +```typescript +const provider = createEmbeddingProvider({ + provider: 'mock', + model: 'mock', + dimensions: 1536 +}) + +// Or use createMockProvider() directly +import { createMockProvider } from '@sylphx/coderag' + +const provider = createMockProvider(1536) +``` + +### Use Cases + +- Testing without API costs +- Offline development +- CI/CD pipelines +- Fallback when API unavailable + +## Default Provider + +Get provider from environment variables. + +```typescript +import { getDefaultEmbeddingProvider } from '@sylphx/coderag' + +const provider = await getDefaultEmbeddingProvider() +// Detects from OPENAI_API_KEY and OPENAI_BASE_URL +``` + +### Detection Logic + +1. If `OPENAI_BASE_URL` set: `openai-compatible` +2. If `OPENAI_API_KEY` set: `openai` +3. Otherwise: `mock` + +## Custom Providers + +Register custom embedding providers. + +```typescript +import { registerProvider } from '@sylphx/coderag' + +registerProvider('huggingface', (config) => ({ + name: 'huggingface', + model: config.model, + dimensions: config.dimensions, + generateEmbedding: async (text) => { + // Your implementation + const response = await fetch('https://api-inference.huggingface.co/...', { + method: 'POST', + headers: { Authorization: `Bearer ${config.apiKey}` }, + body: JSON.stringify({ inputs: text }) + }) + const data = await response.json() + return data.embeddings + }, + generateEmbeddings: async (texts) => { + // Batch implementation + return Promise.all(texts.map(text => this.generateEmbedding(text))) + } +})) + +// Use custom provider +const provider = createEmbeddingProvider({ + provider: 'huggingface', + model: 'sentence-transformers/all-MiniLM-L6-v2', + dimensions: 384, + apiKey: process.env.HF_API_KEY +}) +``` + +## Provider Composition + +Combine providers for fallback behavior. 
+ +```typescript +import { composeProviders, createOpenAIProvider, createMockProvider } from '@sylphx/coderag' + +const primary = createOpenAIProvider({ + provider: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + apiKey: process.env.OPENAI_API_KEY +}) + +const fallback = createMockProvider(1536) + +const composed = composeProviders(primary, fallback) +// Uses primary, falls back to mock if primary fails +``` + +## Utility Functions + +### generateMockEmbedding() + +Generate deterministic mock embedding. + +```typescript +import { generateMockEmbedding } from '@sylphx/coderag' + +const embedding = generateMockEmbedding('hello world', 1536) +// Always returns same vector for same input +``` + +### cosineSimilarity() + +Calculate similarity between two vectors. + +```typescript +import { cosineSimilarity } from '@sylphx/coderag' + +const similarity = cosineSimilarity(embedding1, embedding2) +// Returns: -1 to 1 (1 = identical, 0 = orthogonal, -1 = opposite) +``` + +### normalizeVector() + +Normalize vector to unit length. + +```typescript +import { normalizeVector } from '@sylphx/coderag' + +const normalized = normalizeVector([3, 4]) +// Returns: [0.6, 0.8] (magnitude = 1) +``` + +### chunkText() + +Split text into chunks for embedding. 
+ +```typescript +import { chunkText } from '@sylphx/coderag' + +const chunks = chunkText(longText, { + maxChunkSize: 1000, + overlap: 100 +}) +``` + +## Usage with CodebaseIndexer + +```typescript +import { CodebaseIndexer, createEmbeddingProvider, PersistentStorage } from '@sylphx/coderag' + +const embeddingProvider = createEmbeddingProvider({ + provider: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + apiKey: process.env.OPENAI_API_KEY +}) + +const indexer = new CodebaseIndexer({ + codebaseRoot: './src', + storage: new PersistentStorage(), + embeddingProvider, + vectorBatchSize: 20 // Generate 20 embeddings at once +}) + +await indexer.index() +// Generates embeddings for each code chunk +``` + +## Performance + +### Batch Size + +Control API request size: + +```typescript +const provider = createEmbeddingProvider({ + provider: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + batchSize: 10 // Default +}) + +// Larger batches = fewer API calls but higher latency +const indexer = new CodebaseIndexer({ + embeddingProvider: provider, + vectorBatchSize: 50 // 50 chunks per batch +}) +``` + +### Speed Comparison + +Model speeds (approximate): +- `text-embedding-3-small`: ~1000 tokens/sec +- `text-embedding-3-large`: ~500 tokens/sec +- `text-embedding-ada-002`: ~1000 tokens/sec + +Indexing 1000 code chunks (~500 tokens each): +- Small model: ~250 seconds (50 batches of 20) +- Large model: ~500 seconds +- Mock provider: ~1 second (no API calls) + +## Cost Estimation + +OpenAI pricing (as of 2024): +- `text-embedding-3-small`: $0.02 / 1M tokens +- `text-embedding-3-large`: $0.13 / 1M tokens +- `text-embedding-ada-002`: $0.10 / 1M tokens + +Example: 10,000 code chunks, 500 tokens each = 5M tokens +- Small: $0.10 +- Large: $0.65 +- Ada-002: $0.50 + +## Error Handling + +### Automatic Fallback + +Provider automatically falls back to mock on error: + +```typescript +const provider = createEmbeddingProvider({ + provider: 'openai', + model: 
'text-embedding-3-small', + dimensions: 1536, + apiKey: 'invalid-key' +}) + +const embedding = await provider.generateEmbedding('test') +// Logs: [WARN] OpenAI embedding failed, falling back to mock +// Returns: Mock embedding (deterministic hash) +``` + +### Manual Error Handling + +```typescript +try { + const embedding = await provider.generateEmbedding(text) +} catch (error) { + console.error('Embedding generation failed:', error) + // Handle error +} +``` + +## Best Practices + +**Use environment variables:** +```typescript +// Good +const provider = createEmbeddingProvider({ + provider: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + apiKey: process.env.OPENAI_API_KEY // From env +}) + +// Avoid +const provider = createEmbeddingProvider({ + apiKey: 'sk-hardcoded-key' // Security risk +}) +``` + +**Batch when possible:** +```typescript +// Good: Single API call +const embeddings = await provider.generateEmbeddings(texts) + +// Avoid: Multiple API calls +const embeddings = await Promise.all( + texts.map(text => provider.generateEmbedding(text)) +) +``` + +**Choose appropriate dimensions:** +```typescript +// For code search: 1536 is sufficient +const provider = createEmbeddingProvider({ + model: 'text-embedding-3-small', + dimensions: 1536 +}) + +// For semantic understanding: 3072 is better +const provider = createEmbeddingProvider({ + model: 'text-embedding-3-large', + dimensions: 3072 +}) +``` + +## Related + +- [CodebaseIndexer](./indexer.md) +- [Search Functions](./search.md) +- [Types](./types.md) diff --git a/docs/api/indexer.md b/docs/api/indexer.md new file mode 100644 index 0000000..9243703 --- /dev/null +++ b/docs/api/indexer.md @@ -0,0 +1,437 @@ +# CodebaseIndexer + +The primary class for indexing and searching codebases. Provides TF-IDF keyword search, optional vector search, and hybrid search capabilities. + +## Constructor + +```typescript +new CodebaseIndexer(options?: IndexerOptions) +``` + +Creates a new indexer instance. 
+ +### Parameters + +**options** `IndexerOptions` (optional) + +Configuration options for the indexer: + +```typescript +interface IndexerOptions { + codebaseRoot?: string // Root directory (default: process.cwd()) + maxFileSize?: number // Max file size in bytes (default: 1MB) + storage?: Storage // Storage implementation (default: MemoryStorage) + onProgress?: (current: number, total: number, file: string) => void + watch?: boolean // Enable file watching (default: false) + onFileChange?: (event: FileChangeEvent) => void + embeddingProvider?: EmbeddingProvider // Optional for vector search + vectorBatchSize?: number // Embedding batch size (default: 10) + indexingBatchSize?: number // Files per batch (default: 50) + lowMemoryMode?: boolean // Use SQL search (default: true with PersistentStorage) +} +``` + +### Returns + +A new `CodebaseIndexer` instance. + +### Example + +```typescript +import { CodebaseIndexer, PersistentStorage } from '@sylphx/coderag' + +const indexer = new CodebaseIndexer({ + codebaseRoot: '/path/to/project', + storage: new PersistentStorage(), + maxFileSize: 2 * 1024 * 1024, // 2MB + watch: true, + onProgress: (current, total, file) => { + console.log(`Indexing ${current}/${total}: ${file}`) + } +}) +``` + +## Methods + +### index() + +Index or update the codebase index. + +```typescript +async index(options?: IndexerOptions): Promise<void> +``` + +Scans the codebase, chunks files at semantic boundaries, and builds TF-IDF and optional vector indexes. Automatically detects changes and performs incremental updates when using persistent storage. + +#### Parameters + +**options** `IndexerOptions` (optional) - Override constructor options for this indexing run. + +#### Behavior + +1. **First Run**: Full index of all files +2. **Subsequent Runs**: Incremental updates (detects added, changed, deleted files) +3. **Chunk-Level Indexing**: Uses AST to split code at function/class boundaries +4. 
**Progress Tracking**: Calls `onProgress` callback during indexing + +#### Example + +```typescript +// Initial index +await indexer.index() + +// Re-index with progress tracking +await indexer.index({ + onProgress: (current, total, file) => { + const percent = Math.round((current / total) * 100) + console.log(`${percent}% - ${file}`) + } +}) +``` + +### search() + +Search the indexed codebase using TF-IDF or BM25 scoring. + +```typescript +async search( + query: string, + options?: SearchOptions +): Promise<SearchResult[]> +``` + +#### Parameters + +**query** `string` - Search query (tokenized and ranked) + +**options** `SearchOptions` (optional) + +```typescript +interface SearchOptions { + limit?: number // Max results (default: 10) + includeContent?: boolean // Include snippets (default: true) + fileExtensions?: string[] // Filter by extensions (e.g., ['.ts', '.js']) + pathFilter?: string // Include paths containing string + excludePaths?: string[] // Exclude paths containing strings + contextLines?: number // Snippet context (default: 3) + maxSnippetChars?: number // Max snippet length (default: 2000) + maxSnippetBlocks?: number // Max code blocks (default: 4) +} +``` + +#### Returns + +`Promise<SearchResult[]>` - Ranked search results + +```typescript +interface SearchResult { + path: string // File path + score: number // Relevance score + matchedTerms: string[] // Matched query terms + language?: string // Detected language + size: number // Content size + snippet?: string // Code snippet with line numbers + chunkType?: string // AST node type (e.g., 'FunctionDeclaration') + startLine?: number // Chunk start line + endLine?: number // Chunk end line +} +``` + +#### Example + +```typescript +// Basic search +const results = await indexer.search('authentication') + +// Advanced search with filters +const results = await indexer.search('user login', { + limit: 20, + fileExtensions: ['.ts', '.tsx'], + pathFilter: 'src/auth', + excludePaths: ['node_modules', 'dist'] +}) + +// Process results 
+for (const result of results) { + console.log(`${result.path}:${result.startLine}`) + console.log(`Score: ${result.score.toFixed(2)}`) + console.log(`Matched: ${result.matchedTerms.join(', ')}`) + console.log(result.snippet) +} +``` + +### startWatch() + +Start watching for file changes. + +```typescript +async startWatch(): Promise +``` + +Uses `@parcel/watcher` for native file system events (FSEvents on macOS, inotify on Linux). File changes trigger incremental index updates. + +#### Example + +```typescript +const indexer = new CodebaseIndexer({ + codebaseRoot: './src', + onFileChange: (event) => { + console.log(`${event.type}: ${event.path} at ${event.timestamp}`) + } +}) + +await indexer.index() +await indexer.startWatch() +// Index updates automatically on file changes +``` + +### stopWatch() + +Stop watching for file changes. + +```typescript +async stopWatch(): Promise +``` + +Cleans up file watcher and pending updates. + +#### Example + +```typescript +await indexer.stopWatch() +``` + +### getStatus() + +Get current indexing status. + +```typescript +getStatus(): IndexingStatus +``` + +#### Returns + +```typescript +interface IndexingStatus { + isIndexing: boolean // Currently indexing + progress: number // 0-100 + totalFiles: number // Total files found + processedFiles: number // Files processed + totalChunks: number // Total chunks created + indexedChunks: number // Chunks indexed + currentFile?: string // Current file being processed +} +``` + +#### Example + +```typescript +const status = indexer.getStatus() +console.log(`Progress: ${status.progress}%`) +console.log(`Chunks: ${status.indexedChunks}/${status.totalChunks}`) +``` + +### getIndexedCount() + +Get total number of indexed files. + +```typescript +async getIndexedCount(): Promise +``` + +#### Returns + +`Promise` - Number of files in the index. 
+ +#### Example + +```typescript +const count = await indexer.getIndexedCount() +console.log(`Indexed ${count} files`) +``` + +### getFileContent() + +Retrieve raw content of an indexed file. + +```typescript +async getFileContent(filePath: string): Promise +``` + +#### Parameters + +**filePath** `string` - Relative file path. + +#### Returns + +`Promise` - File content or null if not found. + +#### Example + +```typescript +const content = await indexer.getFileContent('src/index.ts') +if (content) { + console.log(content) +} +``` + +### getVectorStorage() + +Get the vector storage instance (if embeddings enabled). + +```typescript +getVectorStorage(): VectorStorage | undefined +``` + +#### Returns + +`VectorStorage | undefined` - Vector storage or undefined if not configured. + +### getEmbeddingProvider() + +Get the embedding provider (if configured). + +```typescript +getEmbeddingProvider(): EmbeddingProvider | undefined +``` + +#### Returns + +`EmbeddingProvider | undefined` - Embedding provider or undefined if not configured. + +## Events + +### onFileChange + +Callback invoked when a file changes (requires `watch: true`). + +```typescript +interface FileChangeEvent { + type: 'add' | 'change' | 'unlink' + path: string + timestamp: number +} +``` + +#### Example + +```typescript +const indexer = new CodebaseIndexer({ + watch: true, + onFileChange: (event) => { + if (event.type === 'add') { + console.log(`New file: ${event.path}`) + } else if (event.type === 'change') { + console.log(`Modified: ${event.path}`) + } else if (event.type === 'unlink') { + console.log(`Deleted: ${event.path}`) + } + } +}) +``` + +### onProgress + +Callback invoked during indexing to report progress. 
+ +```typescript +(current: number, total: number, file: string) => void +``` + +#### Example + +```typescript +await indexer.index({ + onProgress: (current, total, file) => { + process.stdout.write(`\rIndexing ${current}/${total}: ${file.padEnd(50)}`) + } +}) +``` + +## Advanced Usage + +### With Vector Embeddings + +```typescript +import { + CodebaseIndexer, + createEmbeddingProvider, + PersistentStorage +} from '@sylphx/coderag' + +const embeddingProvider = createEmbeddingProvider({ + provider: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + apiKey: process.env.OPENAI_API_KEY +}) + +const indexer = new CodebaseIndexer({ + codebaseRoot: './src', + storage: new PersistentStorage(), + embeddingProvider, + vectorBatchSize: 20 // Generate embeddings for 20 chunks at once +}) + +await indexer.index() +``` + +### Low Memory Mode + +Uses SQL-based search instead of in-memory indexes: + +```typescript +const indexer = new CodebaseIndexer({ + storage: new PersistentStorage(), + lowMemoryMode: true, // Default when using PersistentStorage + indexingBatchSize: 25 // Process fewer files at once +}) +``` + +### Custom Storage + +```typescript +import { MemoryStorage } from '@sylphx/coderag' + +const storage = new MemoryStorage() +const indexer = new CodebaseIndexer({ storage }) + +await indexer.index() + +// Access stored files +const files = await storage.getAllFiles() +``` + +## Performance + +### Indexing Speed + +- In-memory: ~1000 files/second +- Persistent: ~500 files/second (chunk-level indexing) +- With embeddings: Depends on API rate limits (typically 10-50 chunks/second) + +### Memory Usage + +- Low memory mode: ~50-100MB for large codebases +- In-memory mode: ~200-500MB depending on codebase size +- Vector storage: Additional ~1MB per 1000 chunks (1536 dimensions) + +### Search Speed + +- TF-IDF (SQL): ~5-10ms per query +- TF-IDF (in-memory): ~1-2ms per query +- Vector search: ~10-50ms per query (LanceDB) +- Hybrid search: ~20-60ms per query + 
+## Limitations + +- Maximum file size: 1MB (configurable via `maxFileSize`) +- Binary files: Automatically skipped +- Chunk size: 100-1000 characters (configurable) +- Supported languages: See [language-config.ts](https://github.com/SylphxAI/coderag/blob/main/packages/core/src/language-config.ts) + +## Related + +- [Storage API](./storage.md) +- [Search Functions](./search.md) +- [Embeddings](./embeddings.md) diff --git a/docs/api/overview.md b/docs/api/overview.md new file mode 100644 index 0000000..be17da3 --- /dev/null +++ b/docs/api/overview.md @@ -0,0 +1,200 @@ +# API Overview + +CodeRAG provides a comprehensive API for semantic code search with vector embeddings. This reference documents all exported types, classes, and functions. + +## Installation + +```bash +npm install @sylphx/coderag +``` + +## Package Structure + +CodeRAG uses ESM-only exports with TypeScript support. The package provides both a main entry point and specialized subpath exports: + +### Main Entry Point + +```typescript +import { CodebaseIndexer } from '@sylphx/coderag' +``` + +Exports all core functionality including: +- CodebaseIndexer (primary class) +- Search functions (hybrid, semantic, keyword) +- Embedding providers +- Storage implementations +- AST chunking utilities +- Type definitions + +### Subpath Exports + +Specialized exports for advanced use cases: + +```typescript +// Indexer only +import { CodebaseIndexer } from '@sylphx/coderag/indexer' + +// TF-IDF utilities +import { buildSearchIndex, searchDocuments } from '@sylphx/coderag/tfidf' + +// Storage implementations +import { PersistentStorage, MemoryStorage } from '@sylphx/coderag/storage' + +// Utilities +import { scanFiles, detectLanguage } from '@sylphx/coderag/utils' +``` + +## Import Patterns + +### Basic Usage + +```typescript +import { CodebaseIndexer } from '@sylphx/coderag' + +const indexer = new CodebaseIndexer({ + codebaseRoot: '/path/to/codebase', + storage: new PersistentStorage() +}) + +await indexer.index() 
+const results = await indexer.search('authentication') +``` + +### With Embeddings + +```typescript +import { + CodebaseIndexer, + createEmbeddingProvider, + PersistentStorage +} from '@sylphx/coderag' + +const embeddingProvider = createEmbeddingProvider({ + provider: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + apiKey: process.env.OPENAI_API_KEY +}) + +const indexer = new CodebaseIndexer({ + codebaseRoot: '/path/to/codebase', + storage: new PersistentStorage(), + embeddingProvider +}) + +await indexer.index() +``` + +### Hybrid Search + +```typescript +import { CodebaseIndexer, hybridSearch } from '@sylphx/coderag' + +const indexer = new CodebaseIndexer({ /* ... */ }) +await indexer.index() + +const results = await hybridSearch('authentication flow', indexer, { + vectorWeight: 0.7, // 70% semantic, 30% keyword + limit: 10 +}) +``` + +## TypeScript Support + +CodeRAG includes comprehensive TypeScript definitions: + +```typescript +import type { + IndexerOptions, + SearchResult, + EmbeddingProvider, + CodebaseFile, + ChunkResult +} from '@sylphx/coderag' + +// Type-safe configuration +const options: IndexerOptions = { + codebaseRoot: './src', + maxFileSize: 1048576, + watch: true +} + +// Type-safe results +const results: SearchResult[] = await indexer.search('query') +``` + +## Environment Variables + +CodeRAG respects the following environment variables: + +```bash +# OpenAI API configuration +OPENAI_API_KEY=your-api-key +OPENAI_BASE_URL=https://api.openai.com/v1 # Optional: custom endpoint + +# Embedding model configuration +EMBEDDING_MODEL=text-embedding-3-small +EMBEDDING_DIMENSIONS=1536 # Optional: override default +``` + +## Core Concepts + +### Chunk-Based Indexing + +CodeRAG uses AST-based chunking to split code at semantic boundaries (functions, classes, etc.). This provides better search granularity than file-level indexing. 
+ +```typescript +import { chunkCodeByAST } from '@sylphx/coderag' + +const chunks = await chunkCodeByAST( + code, + 'example.ts', + { maxChunkSize: 1000, preserveContext: true } +) +// Returns: ChunkResult[] with type, startLine, endLine, content +``` + +### Persistent Storage + +CodeRAG uses SQLite via LibSQL for persistent storage: + +```typescript +import { PersistentStorage } from '@sylphx/coderag' + +const storage = new PersistentStorage({ + codebaseRoot: '/path/to/codebase', + // Automatically creates ~/.coderag/projects// +}) +``` + +### Incremental Updates + +File changes are detected and indexed incrementally: + +```typescript +const indexer = new CodebaseIndexer({ + codebaseRoot: './src', + watch: true, + onFileChange: (event) => { + console.log(`File ${event.type}: ${event.path}`) + } +}) + +await indexer.index() +// Subsequent calls detect changes and update incrementally +``` + +## API Reference + +- [CodebaseIndexer](./indexer.md) - Main indexing and search class +- [Storage](./storage.md) - Persistent and in-memory storage +- [Search Functions](./search.md) - Hybrid, semantic, and keyword search +- [Embeddings](./embeddings.md) - Embedding provider configuration +- [AST Chunking](./chunking.md) - Code chunking utilities +- [Types](./types.md) - TypeScript type definitions + +## Next Steps + +- Read the [CodebaseIndexer API](./indexer.md) for the main class documentation +- Explore [Search Functions](./search.md) for advanced search capabilities +- Learn about [Embedding Providers](./embeddings.md) for semantic search diff --git a/docs/api/search.md b/docs/api/search.md new file mode 100644 index 0000000..ec4cbf7 --- /dev/null +++ b/docs/api/search.md @@ -0,0 +1,437 @@ +# Search Functions + +CodeRAG provides three search modes: hybrid (combines vector and TF-IDF), semantic (vector only), and keyword (TF-IDF only). + +## hybridSearch() + +Combines vector embeddings and TF-IDF keyword search with weighted scoring. 
+ +```typescript +async function hybridSearch( + query: string, + indexer: CodebaseIndexer, + options?: HybridSearchOptions +): Promise +``` + +### Parameters + +**query** `string` - Search query + +**indexer** `CodebaseIndexer` - Configured indexer with embeddings + +**options** `HybridSearchOptions` (optional) + +```typescript +interface HybridSearchOptions { + limit?: number // Max results (default: 10) + minScore?: number // Min relevance score (default: 0.01) + vectorWeight?: number // 0-1, vector vs TF-IDF (default: 0.7) + includeContent?: boolean // Include snippets (default: false) + fileExtensions?: string[] // Filter by extension + pathFilter?: string // Include paths containing string + excludePaths?: string[] // Exclude paths +} +``` + +### Returns + +`Promise` - Ranked results with metadata + +```typescript +interface HybridSearchResult { + path: string + score: number + method: 'vector' | 'tfidf' | 'hybrid' + matchedTerms?: string[] + similarity?: number + content?: string + chunkType?: string + startLine?: number + endLine?: number + language?: string +} +``` + +### Behavior + +**vectorWeight Modes:** +- `>= 0.99`: Pure vector search (semantic only) +- `<= 0.01`: Pure TF-IDF search (keyword only) +- `0.02-0.98`: Hybrid mode (combines both) + +**Scoring:** +- Vector score: Cosine similarity (0-1) +- TF-IDF score: BM25 relevance (normalized) +- Combined: `vectorWeight * vector + (1 - vectorWeight) * tfidf` + +### Example + +```typescript +import { CodebaseIndexer, hybridSearch, createEmbeddingProvider } from '@sylphx/coderag' + +// Setup +const embeddingProvider = createEmbeddingProvider({ + provider: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + apiKey: process.env.OPENAI_API_KEY +}) + +const indexer = new CodebaseIndexer({ + codebaseRoot: './src', + embeddingProvider +}) + +await indexer.index() + +// Hybrid search (70% semantic, 30% keyword) +const results = await hybridSearch( + 'how to authenticate users', + indexer, + { + 
vectorWeight: 0.7, + limit: 10, + includeContent: true + } +) + +for (const result of results) { + console.log(`${result.path}:${result.startLine} (${result.method})`) + console.log(`Score: ${result.score.toFixed(3)}`) + if (result.matchedTerms) { + console.log(`Keywords: ${result.matchedTerms.join(', ')}`) + } + if (result.similarity) { + console.log(`Semantic similarity: ${result.similarity.toFixed(3)}`) + } + if (result.content) { + console.log(result.content) + } + console.log('---') +} +``` + +### Advanced Usage + +**Adjust semantic vs keyword balance:** + +```typescript +// More semantic (better for conceptual queries) +const semanticResults = await hybridSearch(query, indexer, { + vectorWeight: 0.9 // 90% semantic, 10% keyword +}) + +// More keyword (better for specific terms) +const keywordResults = await hybridSearch(query, indexer, { + vectorWeight: 0.3 // 30% semantic, 70% keyword +}) + +// Balanced +const balanced = await hybridSearch(query, indexer, { + vectorWeight: 0.5 // 50/50 split +}) +``` + +**Filter results:** + +```typescript +const results = await hybridSearch( + 'authentication', + indexer, + { + fileExtensions: ['.ts', '.tsx'], + pathFilter: 'src/auth', + excludePaths: ['test', 'mock'], + minScore: 0.1 + } +) +``` + +## semanticSearch() + +Pure vector search using embeddings. Convenience wrapper for `hybridSearch()` with `vectorWeight: 1.0`. + +```typescript +async function semanticSearch( + query: string, + indexer: CodebaseIndexer, + options?: Omit<HybridSearchOptions, 'vectorWeight'> +): Promise<HybridSearchResult[]> +``` + +### Parameters + +Same as `hybridSearch()` except `vectorWeight` is fixed at 1.0.
+ +### Returns + +`Promise<HybridSearchResult[]>` - Results with `method: 'vector'` + +### Example + +```typescript +import { semanticSearch } from '@sylphx/coderag' + +// Semantic search only +const results = await semanticSearch( + 'find code that handles user authentication', + indexer, + { limit: 5 } +) + +// Works well for: +// - Conceptual queries ("how to...", "find code that...") +// - Natural language questions +// - Cross-language concepts +// - Similar functionality search +``` + +## keywordSearch() + +Pure TF-IDF/BM25 keyword search. Convenience wrapper for `hybridSearch()` with `vectorWeight: 0.0`. + +```typescript +async function keywordSearch( + query: string, + indexer: CodebaseIndexer, + options?: Omit<HybridSearchOptions, 'vectorWeight'> +): Promise<HybridSearchResult[]> +``` + +### Parameters + +Same as `hybridSearch()` except `vectorWeight` is fixed at 0.0. + +### Returns + +`Promise<HybridSearchResult[]>` - Results with `method: 'tfidf'` + +### Example + +```typescript +import { keywordSearch } from '@sylphx/coderag' + +// Keyword search only +const results = await keywordSearch( + 'createUser validateEmail', + indexer, + { limit: 10 } +) + +// Works well for: +// - Specific function/variable names +// - Exact terminology +// - API identifiers +// - Symbol search +``` + +## Search Strategies + +### When to Use Each Mode + +**Semantic Search** (`semanticSearch()` or high `vectorWeight`) +- Natural language questions +- Conceptual similarity +- "Find code that does X" +- Cross-language patterns +- Requires embedding provider + +**Keyword Search** (`keywordSearch()` or low `vectorWeight`) +- Specific identifiers +- Exact function/class names +- Fast lookups +- No API calls needed +- Works offline + +**Hybrid Search** (balanced `vectorWeight`) +- Best of both worlds +- Handles varied query types +- More robust results +- Recommended for general use + +### Query Examples + +**Good for Semantic:** +```typescript +await semanticSearch('how to validate user input', indexer) +await semanticSearch('authentication flow', indexer) +await
semanticSearch('error handling patterns', indexer) +``` + +**Good for Keyword:** +```typescript +await keywordSearch('createUser validateEmail', indexer) +await keywordSearch('React useState useEffect', indexer) +await keywordSearch('class AuthService', indexer) +``` + +**Good for Hybrid:** +```typescript +await hybridSearch('JWT token authentication', indexer, { vectorWeight: 0.7 }) +await hybridSearch('database connection pooling', indexer, { vectorWeight: 0.6 }) +await hybridSearch('API error handling middleware', indexer, { vectorWeight: 0.5 }) +``` + +## Score Interpretation + +### Vector Similarity + +Cosine similarity between query and chunk embeddings: +- `0.9-1.0`: Highly relevant +- `0.7-0.9`: Relevant +- `0.5-0.7`: Somewhat relevant +- `< 0.5`: Weakly relevant + +### TF-IDF/BM25 Score + +BM25 relevance scoring (varies by corpus): +- Higher = more relevant +- Normalized per query +- Affected by term frequency and document frequency + +### Combined Score + +Weighted combination in hybrid mode: +- Normalized to 0-1 range +- `vectorWeight` controls the balance +- Results sorted by combined score descending + +## Result Metadata + +### Chunk Information + +Results include AST chunk metadata: + +```typescript +const result = results[0] + +console.log(`Type: ${result.chunkType}`) // 'FunctionDeclaration', 'ClassDeclaration', etc. 
+console.log(`Lines: ${result.startLine}-${result.endLine}`) +console.log(`Language: ${result.language}`) +``` + +### Match Information + +```typescript +// Keyword matches +if (result.matchedTerms) { + console.log(`Matched terms: ${result.matchedTerms.join(', ')}`) +} + +// Vector similarity +if (result.similarity) { + console.log(`Similarity: ${(result.similarity * 100).toFixed(1)}%`) +} + +// Search method +console.log(`Method: ${result.method}`) // 'vector', 'tfidf', or 'hybrid' +``` + +## Filtering + +### By File Extension + +```typescript +const tsResults = await hybridSearch(query, indexer, { + fileExtensions: ['.ts', '.tsx'] +}) + +const pyResults = await hybridSearch(query, indexer, { + fileExtensions: ['.py'] +}) +``` + +### By Path + +```typescript +// Include specific paths +const authResults = await hybridSearch(query, indexer, { + pathFilter: 'src/auth' +}) + +// Exclude specific paths +const prodResults = await hybridSearch(query, indexer, { + excludePaths: ['test', 'mock', 'fixture'] +}) + +// Combine filters +const filtered = await hybridSearch(query, indexer, { + fileExtensions: ['.ts'], + pathFilter: 'src', + excludePaths: ['test', 'dist'] +}) +``` + +### By Score + +```typescript +const highQuality = await hybridSearch(query, indexer, { + minScore: 0.5, // Only high-confidence results + limit: 5 +}) +``` + +## Performance + +### Search Speed + +- Semantic: ~10-50ms (LanceDB vector search) +- Keyword: ~5-10ms (SQL BM25) +- Hybrid: ~20-60ms (both methods + merging) + +### Optimization Tips + +**Limit results:** +```typescript +// Faster: fewer results to process +const results = await hybridSearch(query, indexer, { limit: 5 }) +``` + +**Skip content:** +```typescript +// Faster: no snippet generation +const results = await hybridSearch(query, indexer, { includeContent: false }) +``` + +**Filter early:** +```typescript +// Faster: filter at query time, not after +const results = await hybridSearch(query, indexer, { + fileExtensions: ['.ts'], + 
pathFilter: 'src' +}) +``` + +## Error Handling + +### Fallback Behavior + +If vector search fails, automatically falls back to keyword search: + +```typescript +try { + const results = await hybridSearch(query, indexer, { vectorWeight: 0.7 }) + // Falls back to TF-IDF if embedding generation fails +} catch (error) { + console.error('Search failed:', error) +} +``` + +### Missing Embeddings + +```typescript +const indexer = new CodebaseIndexer({ + // No embeddingProvider configured +}) + +// hybridSearch() will use TF-IDF only +const results = await hybridSearch(query, indexer) +// Logs: [INFO] Using TF-IDF search only +``` + +## Related + +- [CodebaseIndexer](./indexer.md) +- [Embeddings](./embeddings.md) +- [Types](./types.md) diff --git a/docs/api/storage.md b/docs/api/storage.md new file mode 100644 index 0000000..fd92228 --- /dev/null +++ b/docs/api/storage.md @@ -0,0 +1,647 @@ +# Storage + +CodeRAG provides two storage implementations: `PersistentStorage` for SQLite-backed persistence and `MemoryStorage` for in-memory usage. + +## PersistentStorage + +SQLite-based storage using LibSQL and Drizzle ORM. Supports chunk-level indexing with BM25 scoring. + +### Constructor + +```typescript +new PersistentStorage(config?: DbConfig) +``` + +#### Parameters + +**config** `DbConfig` (optional) + +```typescript +interface DbConfig { + codebaseRoot?: string // Project root (default: process.cwd()) + dbPath?: string // Custom database path (optional) +} +``` + +If `dbPath` is not provided, uses `~/.coderag/projects//` based on the codebase root path. 
+ +#### Example + +```typescript +import { PersistentStorage } from '@sylphx/coderag' + +// Automatic path (recommended) +const storage = new PersistentStorage({ + codebaseRoot: '/path/to/project' +}) +// Creates: ~/.coderag/projects/a3d5f8b2/index.db + +// Custom path +const storage = new PersistentStorage({ + dbPath: '/custom/path/index.db' +}) +``` + +## Storage Interface + +Both `PersistentStorage` and `MemoryStorage` implement the `Storage` interface: + +```typescript +interface Storage { + storeFile(file: CodebaseFile): Promise + storeFiles?(files: CodebaseFile[]): Promise + getFile(path: string): Promise + getAllFiles(): Promise + deleteFile(path: string): Promise + clear(): Promise + count(): Promise + getChunkCount?(): Promise + exists(path: string): Promise +} +``` + +## File Operations + +### storeFile() + +Store a single file in the index. + +```typescript +async storeFile(file: CodebaseFile): Promise +``` + +#### Parameters + +```typescript +interface CodebaseFile { + path: string + content: string + size: number + mtime: number | Date + language?: string + hash: string +} +``` + +#### Example + +```typescript +await storage.storeFile({ + path: 'src/index.ts', + content: 'export function hello() { }', + size: 30, + mtime: Date.now(), + language: 'typescript', + hash: 'abc123' +}) +``` + +### storeFiles() + +Store multiple files in a batch (more efficient than individual stores). + +```typescript +async storeFiles(files: CodebaseFile[]): Promise +``` + +#### Example + +```typescript +const files: CodebaseFile[] = [ + { path: 'src/a.ts', content: '...', /* ... */ }, + { path: 'src/b.ts', content: '...', /* ... */ } +] + +await storage.storeFiles(files) +``` + +### getFile() + +Retrieve a file by path. + +```typescript +async getFile(path: string): Promise +``` + +#### Example + +```typescript +const file = await storage.getFile('src/index.ts') +if (file) { + console.log(file.content) +} +``` + +### getAllFiles() + +Get all indexed files. 
+ +```typescript +async getAllFiles(): Promise +``` + +#### Example + +```typescript +const files = await storage.getAllFiles() +console.log(`Total files: ${files.length}`) +``` + +### deleteFile() + +Delete a file from the index. + +```typescript +async deleteFile(path: string): Promise +``` + +#### Example + +```typescript +await storage.deleteFile('src/removed.ts') +``` + +### deleteFiles() + +Delete multiple files in a batch (PersistentStorage only). + +```typescript +async deleteFiles(paths: string[]): Promise +``` + +#### Example + +```typescript +await storage.deleteFiles(['src/a.ts', 'src/b.ts']) +``` + +### clear() + +Clear all files from the index. + +```typescript +async clear(): Promise +``` + +#### Example + +```typescript +await storage.clear() +``` + +### count() + +Get total number of indexed files. + +```typescript +async count(): Promise +``` + +#### Example + +```typescript +const fileCount = await storage.count() +console.log(`Indexed ${fileCount} files`) +``` + +### exists() + +Check if a file exists in the index. + +```typescript +async exists(path: string): Promise +``` + +#### Example + +```typescript +if (await storage.exists('src/index.ts')) { + console.log('File is indexed') +} +``` + +## Chunk Operations + +These methods are specific to `PersistentStorage` and support chunk-level indexing. + +### storeChunks() + +Store chunks for a file (replaces existing chunks). + +```typescript +async storeChunks(filePath: string, chunks: ChunkData[]): Promise +``` + +#### Parameters + +```typescript +interface ChunkData { + content: string + type: string // AST node type (e.g., 'FunctionDeclaration') + startLine: number + endLine: number + metadata?: Record +} +``` + +#### Returns + +`Promise` - Array of chunk IDs. 
+ +#### Example + +```typescript +const chunkIds = await storage.storeChunks('src/index.ts', [ + { + content: 'export function hello() { }', + type: 'FunctionDeclaration', + startLine: 1, + endLine: 3 + } +]) +``` + +### storeManyChunks() + +Store chunks for multiple files in a batch. + +```typescript +async storeManyChunks( + fileChunks: Array<{ filePath: string; chunks: ChunkData[] }> +): Promise<Map<string, number[]>> +``` + +#### Returns + +`Promise<Map<string, number[]>>` - Map of file paths to chunk ID arrays. + +#### Example + +```typescript +const chunkIdMap = await storage.storeManyChunks([ + { + filePath: 'src/a.ts', + chunks: [{ content: '...', type: 'FunctionDeclaration', startLine: 1, endLine: 5 }] + }, + { + filePath: 'src/b.ts', + chunks: [{ content: '...', type: 'ClassDeclaration', startLine: 1, endLine: 10 }] + } +]) + +const aChunkIds = chunkIdMap.get('src/a.ts') +``` + +### getChunksForFile() + +Get all chunks for a file. + +```typescript +async getChunksForFile(filePath: string): Promise<StoredChunk[]> +``` + +#### Returns + +```typescript +interface StoredChunk extends ChunkData { + id: number + fileId: number + filePath: string +} +``` + +#### Example + +```typescript +const chunks = await storage.getChunksForFile('src/index.ts') +for (const chunk of chunks) { + console.log(`${chunk.type} (${chunk.startLine}-${chunk.endLine}): ${chunk.content}`) +} +``` + +### getChunkCount() + +Get total number of indexed chunks. + +```typescript +async getChunkCount(): Promise<number> +``` + +#### Example + +```typescript +const chunkCount = await storage.getChunkCount() +console.log(`Indexed ${chunkCount} chunks`) +``` + +## Vector Operations + +These methods manage TF-IDF vectors for chunk-level search. + +### storeChunkVectors() + +Store TF-IDF vectors for a chunk.
+ +```typescript +async storeChunkVectors( + chunkId: number, + terms: Map<string, { tf: number; tfidf: number; rawFreq: number }>, + tokenCount?: number +): Promise<void> +``` + +#### Example + +```typescript +const terms = new Map([ + ['function', { tf: 0.5, tfidf: 1.2, rawFreq: 2 }], + ['export', { tf: 0.25, tfidf: 0.8, rawFreq: 1 }] +]) + +await storage.storeChunkVectors(123, terms, 4) +``` + +### storeManyChunkVectors() + +Store vectors for multiple chunks in a batch. + +```typescript +async storeManyChunkVectors( + chunkVectors: Array<{ + chunkId: number + terms: Map<string, { tf: number; tfidf: number; rawFreq: number }> + tokenCount?: number + }> +): Promise<void> +``` + +### getChunkVectors() + +Get TF-IDF vectors for a chunk. + +```typescript +async getChunkVectors( + chunkId: number +): Promise<Map<string, { tf: number; tfidf: number; rawFreq: number }> | null> +``` + +### getAllChunkVectors() + +Get all chunk vectors in a single query. + +```typescript +async getAllChunkVectors(): Promise< + Map<number, Map<string, { tf: number; tfidf: number; rawFreq: number }>> +> +``` + +Returns a map of chunk IDs to their term vectors. + +## IDF Operations + +Manage inverse document frequency scores. + +### storeIdfScores() + +Store IDF scores for all terms. + +```typescript +async storeIdfScores( + idf: Map<string, number>, + docFreq: Map<string, number> +): Promise<void> +``` + +### getIdfScores() + +Get all IDF scores. + +```typescript +async getIdfScores(): Promise<Map<string, number>> +``` + +### getIdfScoresForTerms() + +Get IDF scores for specific terms only. + +```typescript +async getIdfScoresForTerms(terms: string[]): Promise<Map<string, number>> +``` + +### rebuildIdfScoresFromVectors() + +Recalculate IDF scores from stored vectors. + +```typescript +async rebuildIdfScoresFromVectors(): Promise<void> +``` + +Uses smoothed IDF formula: `log((N+1)/(df+1)) + 1` + +## Search Operations + +### searchByTerms() + +Search chunks by query terms using SQL.
+ +```typescript +async searchByTerms( + queryTerms: string[], + options?: { limit?: number } +): Promise +``` + +#### Returns + +```typescript +interface SearchCandidate { + chunkId: number + filePath: string + content: string + type: string + startLine: number + endLine: number + matchedTerms: Map + magnitude: number + tokenCount: number +} +``` + +#### Example + +```typescript +const candidates = await storage.searchByTerms( + ['function', 'export'], + { limit: 20 } +) + +for (const candidate of candidates) { + console.log(`${candidate.filePath}:${candidate.startLine}`) + console.log(`Matched: ${Array.from(candidate.matchedTerms.keys()).join(', ')}`) +} +``` + +## Metadata Operations + +### setMetadata() + +Store metadata key-value pair. + +```typescript +async setMetadata(key: string, value: string): Promise +``` + +### getMetadata() + +Retrieve metadata value. + +```typescript +async getMetadata(key: string): Promise +``` + +### getAverageDocLength() + +Get average chunk token count (for BM25). + +```typescript +async getAverageDocLength(): Promise +``` + +### updateAverageDocLength() + +Recalculate and store average chunk length. + +```typescript +async updateAverageDocLength(): Promise +``` + +## Maintenance Operations + +### recalculateTfidfScores() + +Update all TF-IDF scores using current IDF values. + +```typescript +async recalculateTfidfScores(): Promise +``` + +### updateChunkMagnitudes() + +Recalculate pre-computed magnitudes for cosine similarity. + +```typescript +async updateChunkMagnitudes(): Promise +``` + +Magnitude formula: `sqrt(sum(tfidf^2))` for each chunk. + +### getAllFileMetadata() + +Get file metadata without content (for incremental updates). + +```typescript +async getAllFileMetadata(): Promise> +``` + +### getTermsForFiles() + +Get all terms used in chunks of specified files. + +```typescript +async getTermsForFiles(paths: string[]): Promise> +``` + +Useful for tracking affected terms during incremental updates. 
+ +## MemoryStorage + +In-memory storage implementation. Useful for testing or temporary indexes. + +### Constructor + +```typescript +new MemoryStorage() +``` + +### Example + +```typescript +import { MemoryStorage } from '@sylphx/coderag' + +const storage = new MemoryStorage() + +await storage.storeFile({ + path: 'test.ts', + content: 'console.log("test")', + size: 20, + mtime: Date.now(), + hash: 'abc' +}) + +const file = await storage.getFile('test.ts') +``` + +## Database Schema + +PersistentStorage uses the following tables: + +**files** +- id, path (unique), content, hash, size, mtime, language, indexed_at + +**chunks** +- id, file_id, content, type, start_line, end_line, metadata, magnitude, token_count + +**document_vectors** +- chunk_id, term, tf, tfidf, raw_freq + +**idf_scores** +- term (unique), idf, document_frequency + +**index_metadata** +- key (unique), value, updated_at + +## Performance Tips + +### Batch Operations + +Use batch methods for better performance: + +```typescript +// Good: Single transaction +await storage.storeFiles(files) +await storage.storeManyChunks(fileChunks) + +// Bad: Multiple transactions +for (const file of files) { + await storage.storeFile(file) +} +``` + +### Incremental Updates + +Use metadata comparison to avoid unnecessary work: + +```typescript +const metadata = await storage.getAllFileMetadata() +const existing = metadata.get(filePath) + +if (existing?.hash === newHash) { + // Skip, content unchanged +} +``` + +### Low Memory Mode + +For large codebases, use SQL-based search: + +```typescript +const indexer = new CodebaseIndexer({ + storage: new PersistentStorage(), + lowMemoryMode: true // Uses searchByTerms() instead of in-memory index +}) +``` + +## Related + +- [CodebaseIndexer](./indexer.md) +- [Types](./types.md) diff --git a/docs/api/types.md b/docs/api/types.md new file mode 100644 index 0000000..be58af5 --- /dev/null +++ b/docs/api/types.md @@ -0,0 +1,595 @@ +# TypeScript Types + +Comprehensive type 
definitions for CodeRAG API. + +## Indexer Types + +### IndexerOptions + +Configuration for `CodebaseIndexer` constructor. + +```typescript +interface IndexerOptions { + codebaseRoot?: string // Root directory (default: process.cwd()) + maxFileSize?: number // Max file size in bytes (default: 1MB) + storage?: Storage // Storage implementation (default: MemoryStorage) + onProgress?: (current: number, total: number, file: string) => void + watch?: boolean // Enable file watching (default: false) + onFileChange?: (event: FileChangeEvent) => void + embeddingProvider?: EmbeddingProvider // Optional for vector search + vectorBatchSize?: number // Embedding batch size (default: 10) + indexingBatchSize?: number // Files per batch (default: 50) + lowMemoryMode?: boolean // Use SQL search (default: true with PersistentStorage) +} +``` + +### IndexingStatus + +Current indexing progress. + +```typescript +interface IndexingStatus { + isIndexing: boolean // Currently indexing + progress: number // 0-100 + totalFiles: number // Total files found + processedFiles: number // Files processed + totalChunks: number // Total chunks created + indexedChunks: number // Chunks indexed + currentFile?: string // Current file being processed +} +``` + +### FileChangeEvent + +File system change event. + +```typescript +interface FileChangeEvent { + type: 'add' | 'change' | 'unlink' + path: string + timestamp: number +} +``` + +### FileDiff + +Filesystem diff result for incremental updates. + +```typescript +interface FileDiff { + added: FileMetadata[] // New files + changed: FileMetadata[] // Modified files + deleted: string[] // Deleted file paths + unchanged: number // Unchanged file count +} +``` + +### SearchResult + +Search result from `CodebaseIndexer.search()`. 
+
+```typescript
+interface SearchResult {
+  path: string              // File path
+  score: number             // Relevance score
+  matchedTerms: string[]    // Matched query terms
+  language?: string         // Detected language
+  size: number              // Content size
+  snippet?: string          // Code snippet with line numbers
+  chunkType?: string        // AST node type (e.g., 'FunctionDeclaration')
+  startLine?: number        // Chunk start line
+  endLine?: number          // Chunk end line
+}
+```
+
+## Storage Types
+
+### Storage
+
+Storage interface implemented by `PersistentStorage` and `MemoryStorage`.
+
+```typescript
+interface Storage {
+  storeFile(file: CodebaseFile): Promise<void>
+  storeFiles?(files: CodebaseFile[]): Promise<void>
+  getFile(path: string): Promise<CodebaseFile | null>
+  getAllFiles(): Promise<CodebaseFile[]>
+  deleteFile(path: string): Promise<void>
+  clear(): Promise<void>
+  count(): Promise<number>
+  getChunkCount?(): Promise<number>
+  exists(path: string): Promise<boolean>
+}
+```
+
+### CodebaseFile
+
+Stored file representation.
+
+```typescript
+interface CodebaseFile {
+  path: string              // Relative file path
+  content: string           // File content
+  size: number              // File size in bytes
+  mtime: number | Date      // Modification time
+  language?: string         // Detected language
+  hash: string              // Content hash
+}
+```
+
+### DbConfig
+
+Database configuration for `PersistentStorage`.
+
+```typescript
+interface DbConfig {
+  codebaseRoot?: string     // Project root (default: process.cwd())
+  dbPath?: string           // Custom database path (optional)
+}
+```
+
+### ChunkData
+
+Chunk data for storage.
+
+```typescript
+interface ChunkData {
+  content: string
+  type: string              // AST node type
+  startLine: number
+  endLine: number
+  metadata?: Record<string, unknown>
+}
+```
+
+### StoredChunk
+
+Chunk with database ID.
+
+```typescript
+interface StoredChunk extends ChunkData {
+  id: number                // Chunk ID
+  fileId: number            // Parent file ID
+  filePath: string          // File path
+}
+```
+
+## Search Types
+
+### HybridSearchOptions
+
+Options for hybrid search.
+
+```typescript
+interface HybridSearchOptions {
+  limit?: number            // Max results (default: 10)
+  minScore?: number         // Min relevance score (default: 0.01)
+  vectorWeight?: number     // 0-1, vector vs TF-IDF (default: 0.7)
+  includeContent?: boolean  // Include snippets (default: false)
+  fileExtensions?: string[] // Filter by extension
+  pathFilter?: string       // Include paths containing string
+  excludePaths?: string[]   // Exclude paths
+}
+```
+
+### HybridSearchResult
+
+Result from hybrid search.
+
+```typescript
+interface HybridSearchResult {
+  path: string
+  score: number
+  method: 'vector' | 'tfidf' | 'hybrid'
+  matchedTerms?: string[]
+  similarity?: number
+  content?: string
+  chunkType?: string
+  startLine?: number
+  endLine?: number
+  language?: string
+}
+```
+
+### SearchIndex
+
+TF-IDF search index structure.
+
+```typescript
+interface SearchIndex {
+  documents: DocumentVector[]
+  idf: Map<string, number>
+  totalDocuments: number
+  metadata: {
+    generatedAt: string
+    version: string
+  }
+}
+```
+
+### DocumentVector
+
+TF-IDF document vector.
+
+```typescript
+interface DocumentVector {
+  uri: string
+  magnitude: number
+  rawTerms: Map<string, number>
+  tfidf: Map<string, number>
+}
+```
+
+## Embedding Types
+
+### EmbeddingConfig
+
+Embedding provider configuration.
+
+```typescript
+interface EmbeddingConfig {
+  provider: 'openai' | 'openai-compatible' | 'mock'
+  model: string             // Model name
+  dimensions: number        // Vector dimensions
+  apiKey?: string           // API key
+  baseURL?: string          // Custom endpoint for OpenAI-compatible APIs
+  batchSize?: number        // Embedding batch size (default: 10)
+}
+```
+
+### EmbeddingProvider
+
+Embedding provider interface.
+
+```typescript
+interface EmbeddingProvider {
+  name: string
+  model: string
+  dimensions: number
+  generateEmbedding(text: string): Promise<number[]>
+  generateEmbeddings(texts: string[]): Promise<number[][]>
+}
+```
+
+## Vector Storage Types
+
+### VectorStorageOptions
+
+Configuration for vector storage.
+
+```typescript
+interface VectorStorageOptions {
+  dimensions: number        // Vector dimensions
+  dbPath: string            // LanceDB path
+}
+```
+
+### VectorDocument
+
+Document with embedding.
+
+```typescript
+interface VectorDocument {
+  id: string
+  embedding: number[]
+  metadata: Record<string, unknown>
+}
+```
+
+### VectorSearchResult
+
+Vector search result.
+
+```typescript
+interface VectorSearchResult {
+  doc: VectorDocument
+  similarity: number
+}
+```
+
+### VectorStorageStats
+
+Vector storage statistics.
+
+```typescript
+interface VectorStorageStats {
+  documentCount: number
+  dimensions: number
+}
+```
+
+## Chunking Types
+
+### ASTChunkOptions
+
+Options for AST-based chunking.
+
+```typescript
+interface ASTChunkOptions {
+  maxChunkSize?: number     // Max chunk chars (default: 1000)
+  minChunkSize?: number     // Min chunk chars (default: 100)
+  preserveContext?: boolean // Include imports/types (default: true)
+  nodeTypes?: string[]      // Custom AST node types to chunk
+  parseEmbedded?: boolean   // Parse code in markdown (default: true)
+}
+```
+
+### ChunkResult
+
+Chunk with metadata.
+
+```typescript
+interface ChunkResult {
+  content: string           // Chunk content
+  type: string              // AST node type
+  startLine: number         // Start line (1-indexed)
+  endLine: number           // End line (inclusive)
+  metadata: Record<string, unknown> // Additional metadata
+}
+```
+
+## Language Configuration Types
+
+### LanguageConfig
+
+Language-specific configuration.
+
+```typescript
+interface LanguageConfig {
+  parser: string            // Parser module name
+  boundaries: string[]      // AST node types for semantic boundaries
+  contextTypes?: string[]   // Node types to preserve as context
+  parserOptions?: Record<string, unknown>
+  embedded?: EmbeddedLanguageConfig[]
+}
+```
+
+### EmbeddedLanguageConfig
+
+Configuration for embedded code parsing.
+
+```typescript
+interface EmbeddedLanguageConfig {
+  nodeType: string          // Parent node type
+  langAttr?: string         // Attribute containing language name
+  defaultLanguage?: string  // Default language if not specified
+  recursive?: boolean       // Enable recursive parsing
+}
+```
+
+## TF-IDF Types
+
+### IncrementalUpdate
+
+Update operation for incremental TF-IDF.
+
+```typescript
+interface IncrementalUpdate {
+  type: 'add' | 'update' | 'delete'
+  uri: string
+  newContent?: string
+  oldDocument?: DocumentVector
+}
+```
+
+### IncrementalStats
+
+Statistics from incremental update.
+
+```typescript
+interface IncrementalStats {
+  affectedDocuments: number
+  affectedTerms: number
+  updateTime: number
+}
+```
+
+## Utility Types
+
+### ScanOptions
+
+Options for file scanning.
+
+```typescript
+interface ScanOptions {
+  ignoreFilter?: Ignore
+  codebaseRoot?: string
+  maxFileSize?: number
+}
+```
+
+### ScanResult
+
+File scan result.
+
+```typescript
+interface ScanResult {
+  path: string
+  size: number
+}
+```
+
+### FileMetadata
+
+File metadata without content.
+
+```typescript
+interface FileMetadata {
+  path: string
+  absolutePath: string
+  size: number
+  mtime: number
+  language?: string
+}
+```
+
+### ProjectMetadata
+
+Project metadata for database location.
+
+```typescript
+interface ProjectMetadata {
+  codebaseRoot: string
+  hash: string
+  dataDir: string
+}
+```
+
+## Cache Types
+
+### CacheEntry
+
+LRU cache entry.
+
+```typescript
+interface CacheEntry<T> {
+  value: T
+  timestamp: number
+}
+```
+
+### CacheStats
+
+Cache statistics.
+
+```typescript
+interface CacheStats {
+  hits: number
+  misses: number
+  size: number
+  maxSize: number
+}
+```
+
+## Type Guards
+
+Useful type guards for working with CodeRAG types.
+ +```typescript +function isPersistentStorage(storage: Storage): storage is PersistentStorage { + return 'storeChunks' in storage +} + +function hasChunkMetadata(result: SearchResult): result is SearchResult & { + chunkType: string + startLine: number + endLine: number +} { + return result.chunkType !== undefined && + result.startLine !== undefined && + result.endLine !== undefined +} + +function isHybridResult(result: HybridSearchResult): boolean { + return result.method === 'hybrid' +} +``` + +## Generic Types + +```typescript +// Storage key-value types +type StorageKey = string +type StorageValue = string | number | boolean | null + +// Vector types +type Vector = number[] +type VectorId = string + +// Score types +type Score = number // 0-1 for cosine similarity, unbounded for TF-IDF/BM25 +type Similarity = number // -1 to 1 for cosine similarity + +// Path types +type FilePath = string +type AbsolutePath = string +type RelativePath = string +``` + +## Constants + +```typescript +// Default values +const DEFAULT_MAX_FILE_SIZE = 1048576 // 1MB +const DEFAULT_MAX_CHUNK_SIZE = 1000 +const DEFAULT_MIN_CHUNK_SIZE = 100 +const DEFAULT_VECTOR_BATCH_SIZE = 10 +const DEFAULT_INDEXING_BATCH_SIZE = 50 +const DEFAULT_SEARCH_LIMIT = 10 +const DEFAULT_VECTOR_WEIGHT = 0.7 + +// BM25 parameters +const BM25_K1 = 1.2 +const BM25_B = 0.75 +``` + +## Usage Examples + +### Type-Safe Indexer Configuration + +```typescript +import type { IndexerOptions, EmbeddingConfig } from '@sylphx/coderag' + +const embeddingConfig: EmbeddingConfig = { + provider: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + apiKey: process.env.OPENAI_API_KEY +} + +const indexerOptions: IndexerOptions = { + codebaseRoot: './src', + maxFileSize: 2 * 1024 * 1024, + watch: true, + onProgress: (current, total, file) => { + console.log(`${current}/${total}: ${file}`) + } +} +``` + +### Type-Safe Search + +```typescript +import type { SearchResult, HybridSearchOptions } from '@sylphx/coderag' + 
+const options: HybridSearchOptions = { + limit: 20, + vectorWeight: 0.7, + fileExtensions: ['.ts', '.tsx'] +} + +const results: SearchResult[] = await indexer.search('auth', options) + +for (const result of results) { + if (hasChunkMetadata(result)) { + console.log(`${result.chunkType} at ${result.startLine}`) + } +} +``` + +### Type-Safe Storage + +```typescript +import type { CodebaseFile, StoredChunk } from '@sylphx/coderag' + +const file: CodebaseFile = { + path: 'src/index.ts', + content: code, + size: code.length, + mtime: Date.now(), + language: 'typescript', + hash: computeHash(code) +} + +await storage.storeFile(file) + +const chunks: StoredChunk[] = await storage.getChunksForFile(file.path) +``` + +## Related + +- [CodebaseIndexer](./indexer.md) +- [Storage](./storage.md) +- [Search Functions](./search.md) +- [Embeddings](./embeddings.md) +- [AST Chunking](./chunking.md) diff --git a/docs/bun.lock b/docs/bun.lock new file mode 100644 index 0000000..dd47189 --- /dev/null +++ b/docs/bun.lock @@ -0,0 +1,354 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "coderag-docs", + "devDependencies": { + "vitepress": "^1.6.4", + "vue": "^3.5.24", + }, + }, + }, + "packages": { + "@algolia/abtesting": ["@algolia/abtesting@1.12.0", "", { "dependencies": { "@algolia/client-common": "5.46.0", "@algolia/requester-browser-xhr": "5.46.0", "@algolia/requester-fetch": "5.46.0", "@algolia/requester-node-http": "5.46.0" } }, "sha512-EfW0bfxjPs+C7ANkJDw2TATntfBKsFiy7APh+KO0pQ8A6HYa5I0NjFuCGCXWfzzzLXNZta3QUl3n5Kmm6aJo9Q=="], + + "@algolia/autocomplete-core": ["@algolia/autocomplete-core@1.17.7", "", { "dependencies": { "@algolia/autocomplete-plugin-algolia-insights": "1.17.7", "@algolia/autocomplete-shared": "1.17.7" } }, "sha512-BjiPOW6ks90UKl7TwMv7oNQMnzU+t/wk9mgIDi6b1tXpUek7MW0lbNOUHpvam9pe3lVCf4xPFT+lK7s+e+fs7Q=="], + + "@algolia/autocomplete-plugin-algolia-insights": ["@algolia/autocomplete-plugin-algolia-insights@1.17.7", "", { 
"dependencies": { "@algolia/autocomplete-shared": "1.17.7" }, "peerDependencies": { "search-insights": ">= 1 < 3" } }, "sha512-Jca5Ude6yUOuyzjnz57og7Et3aXjbwCSDf/8onLHSQgw1qW3ALl9mrMWaXb5FmPVkV3EtkD2F/+NkT6VHyPu9A=="], + + "@algolia/autocomplete-preset-algolia": ["@algolia/autocomplete-preset-algolia@1.17.7", "", { "dependencies": { "@algolia/autocomplete-shared": "1.17.7" }, "peerDependencies": { "@algolia/client-search": ">= 4.9.1 < 6", "algoliasearch": ">= 4.9.1 < 6" } }, "sha512-ggOQ950+nwbWROq2MOCIL71RE0DdQZsceqrg32UqnhDz8FlO9rL8ONHNsI2R1MH0tkgVIDKI/D0sMiUchsFdWA=="], + + "@algolia/autocomplete-shared": ["@algolia/autocomplete-shared@1.17.7", "", { "peerDependencies": { "@algolia/client-search": ">= 4.9.1 < 6", "algoliasearch": ">= 4.9.1 < 6" } }, "sha512-o/1Vurr42U/qskRSuhBH+VKxMvkkUVTLU6WZQr+L5lGZZLYWyhdzWjW0iGXY7EkwRTjBqvN2EsR81yCTGV/kmg=="], + + "@algolia/client-abtesting": ["@algolia/client-abtesting@5.46.0", "", { "dependencies": { "@algolia/client-common": "5.46.0", "@algolia/requester-browser-xhr": "5.46.0", "@algolia/requester-fetch": "5.46.0", "@algolia/requester-node-http": "5.46.0" } }, "sha512-eG5xV8rujK4ZIHXrRshvv9O13NmU/k42Rnd3w43iKH5RaQ2zWuZO6Q7XjaoJjAFVCsJWqRbXzbYyPGrbF3wGNg=="], + + "@algolia/client-analytics": ["@algolia/client-analytics@5.46.0", "", { "dependencies": { "@algolia/client-common": "5.46.0", "@algolia/requester-browser-xhr": "5.46.0", "@algolia/requester-fetch": "5.46.0", "@algolia/requester-node-http": "5.46.0" } }, "sha512-AYh2uL8IUW9eZrbbT+wZElyb7QkkeV3US2NEKY7doqMlyPWE8lErNfkVN1NvZdVcY4/SVic5GDbeDz2ft8YIiQ=="], + + "@algolia/client-common": ["@algolia/client-common@5.46.0", "", {}, "sha512-0emZTaYOeI9WzJi0TcNd2k3SxiN6DZfdWc2x2gHt855Jl9jPUOzfVTL6gTvCCrOlT4McvpDGg5nGO+9doEjjig=="], + + "@algolia/client-insights": ["@algolia/client-insights@5.46.0", "", { "dependencies": { "@algolia/client-common": "5.46.0", "@algolia/requester-browser-xhr": "5.46.0", "@algolia/requester-fetch": "5.46.0", "@algolia/requester-node-http": 
"5.46.0" } }, "sha512-wrBJ8fE+M0TDG1As4DDmwPn2TXajrvmvAN72Qwpuv8e2JOKNohF7+JxBoF70ZLlvP1A1EiH8DBu+JpfhBbNphQ=="], + + "@algolia/client-personalization": ["@algolia/client-personalization@5.46.0", "", { "dependencies": { "@algolia/client-common": "5.46.0", "@algolia/requester-browser-xhr": "5.46.0", "@algolia/requester-fetch": "5.46.0", "@algolia/requester-node-http": "5.46.0" } }, "sha512-LnkeX4p0ENt0DoftDJJDzQQJig/sFQmD1eQifl/iSjhUOGUIKC/7VTeXRcKtQB78naS8njUAwpzFvxy1CDDXDQ=="], + + "@algolia/client-query-suggestions": ["@algolia/client-query-suggestions@5.46.0", "", { "dependencies": { "@algolia/client-common": "5.46.0", "@algolia/requester-browser-xhr": "5.46.0", "@algolia/requester-fetch": "5.46.0", "@algolia/requester-node-http": "5.46.0" } }, "sha512-aF9tc4ex/smypXw+W3lBPB1jjKoaGHpZezTqofvDOI/oK1dR2sdTpFpK2Ru+7IRzYgwtRqHF3znmTlyoNs9dpA=="], + + "@algolia/client-search": ["@algolia/client-search@5.46.0", "", { "dependencies": { "@algolia/client-common": "5.46.0", "@algolia/requester-browser-xhr": "5.46.0", "@algolia/requester-fetch": "5.46.0", "@algolia/requester-node-http": "5.46.0" } }, "sha512-22SHEEVNjZfFWkFks3P6HilkR3rS7a6GjnCIqR22Zz4HNxdfT0FG+RE7efTcFVfLUkTTMQQybvaUcwMrHXYa7Q=="], + + "@algolia/ingestion": ["@algolia/ingestion@1.46.0", "", { "dependencies": { "@algolia/client-common": "5.46.0", "@algolia/requester-browser-xhr": "5.46.0", "@algolia/requester-fetch": "5.46.0", "@algolia/requester-node-http": "5.46.0" } }, "sha512-2LT0/Z+/sFwEpZLH6V17WSZ81JX2uPjgvv5eNlxgU7rPyup4NXXfuMbtCJ+6uc4RO/LQpEJd3Li59ke3wtyAsA=="], + + "@algolia/monitoring": ["@algolia/monitoring@1.46.0", "", { "dependencies": { "@algolia/client-common": "5.46.0", "@algolia/requester-browser-xhr": "5.46.0", "@algolia/requester-fetch": "5.46.0", "@algolia/requester-node-http": "5.46.0" } }, "sha512-uivZ9wSWZ8mz2ZU0dgDvQwvVZV8XBv6lYBXf8UtkQF3u7WeTqBPeU8ZoeTyLpf0jAXCYOvc1mAVmK0xPLuEwOQ=="], + + "@algolia/recommend": ["@algolia/recommend@5.46.0", "", { "dependencies": { 
"@algolia/client-common": "5.46.0", "@algolia/requester-browser-xhr": "5.46.0", "@algolia/requester-fetch": "5.46.0", "@algolia/requester-node-http": "5.46.0" } }, "sha512-O2BB8DuySuddgOAbhyH4jsGbL+KyDGpzJRtkDZkv091OMomqIA78emhhMhX9d/nIRrzS1wNLWB/ix7Hb2eV5rg=="], + + "@algolia/requester-browser-xhr": ["@algolia/requester-browser-xhr@5.46.0", "", { "dependencies": { "@algolia/client-common": "5.46.0" } }, "sha512-eW6xyHCyYrJD0Kjk9Mz33gQ40LfWiEA51JJTVfJy3yeoRSw/NXhAL81Pljpa0qslTs6+LO/5DYPZddct6HvISQ=="], + + "@algolia/requester-fetch": ["@algolia/requester-fetch@5.46.0", "", { "dependencies": { "@algolia/client-common": "5.46.0" } }, "sha512-Vn2+TukMGHy4PIxmdvP667tN/MhS7MPT8EEvEhS6JyFLPx3weLcxSa1F9gVvrfHWCUJhLWoMVJVB2PT8YfRGcw=="], + + "@algolia/requester-node-http": ["@algolia/requester-node-http@5.46.0", "", { "dependencies": { "@algolia/client-common": "5.46.0" } }, "sha512-xaqXyna5yBZ+r1SJ9my/DM6vfTqJg9FJgVydRJ0lnO+D5NhqGW/qaRG/iBGKr/d4fho34el6WakV7BqJvrl/HQ=="], + + "@babel/helper-string-parser": ["@babel/helper-string-parser@7.27.1", "", {}, "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA=="], + + "@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.28.5", "", {}, "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q=="], + + "@babel/parser": ["@babel/parser@7.28.5", "", { "dependencies": { "@babel/types": "^7.28.5" }, "bin": "./bin/babel-parser.js" }, "sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ=="], + + "@babel/types": ["@babel/types@7.28.5", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.28.5" } }, "sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA=="], + + "@docsearch/css": ["@docsearch/css@3.8.2", "", {}, 
"sha512-y05ayQFyUmCXze79+56v/4HpycYF3uFqB78pLPrSV5ZKAlDuIAAJNhaRi8tTdRNXh05yxX/TyNnzD6LwSM89vQ=="], + + "@docsearch/js": ["@docsearch/js@3.8.2", "", { "dependencies": { "@docsearch/react": "3.8.2", "preact": "^10.0.0" } }, "sha512-Q5wY66qHn0SwA7Taa0aDbHiJvaFJLOJyHmooQ7y8hlwwQLQ/5WwCcoX0g7ii04Qi2DJlHsd0XXzJ8Ypw9+9YmQ=="], + + "@docsearch/react": ["@docsearch/react@3.8.2", "", { "dependencies": { "@algolia/autocomplete-core": "1.17.7", "@algolia/autocomplete-preset-algolia": "1.17.7", "@docsearch/css": "3.8.2", "algoliasearch": "^5.14.2" }, "peerDependencies": { "@types/react": ">= 16.8.0 < 19.0.0", "react": ">= 16.8.0 < 19.0.0", "react-dom": ">= 16.8.0 < 19.0.0", "search-insights": ">= 1 < 3" }, "optionalPeers": ["@types/react", "react", "react-dom", "search-insights"] }, "sha512-xCRrJQlTt8N9GU0DG4ptwHRkfnSnD/YpdeaXe02iKfqs97TkZJv60yE+1eq/tjPcVnTW8dP5qLP7itifFVV5eg=="], + + "@esbuild/aix-ppc64": ["@esbuild/aix-ppc64@0.21.5", "", { "os": "aix", "cpu": "ppc64" }, "sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ=="], + + "@esbuild/android-arm": ["@esbuild/android-arm@0.21.5", "", { "os": "android", "cpu": "arm" }, "sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg=="], + + "@esbuild/android-arm64": ["@esbuild/android-arm64@0.21.5", "", { "os": "android", "cpu": "arm64" }, "sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A=="], + + "@esbuild/android-x64": ["@esbuild/android-x64@0.21.5", "", { "os": "android", "cpu": "x64" }, "sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA=="], + + "@esbuild/darwin-arm64": ["@esbuild/darwin-arm64@0.21.5", "", { "os": "darwin", "cpu": "arm64" }, "sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ=="], + + "@esbuild/darwin-x64": ["@esbuild/darwin-x64@0.21.5", "", { "os": "darwin", "cpu": "x64" }, 
"sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw=="], + + "@esbuild/freebsd-arm64": ["@esbuild/freebsd-arm64@0.21.5", "", { "os": "freebsd", "cpu": "arm64" }, "sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g=="], + + "@esbuild/freebsd-x64": ["@esbuild/freebsd-x64@0.21.5", "", { "os": "freebsd", "cpu": "x64" }, "sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ=="], + + "@esbuild/linux-arm": ["@esbuild/linux-arm@0.21.5", "", { "os": "linux", "cpu": "arm" }, "sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA=="], + + "@esbuild/linux-arm64": ["@esbuild/linux-arm64@0.21.5", "", { "os": "linux", "cpu": "arm64" }, "sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q=="], + + "@esbuild/linux-ia32": ["@esbuild/linux-ia32@0.21.5", "", { "os": "linux", "cpu": "ia32" }, "sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg=="], + + "@esbuild/linux-loong64": ["@esbuild/linux-loong64@0.21.5", "", { "os": "linux", "cpu": "none" }, "sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg=="], + + "@esbuild/linux-mips64el": ["@esbuild/linux-mips64el@0.21.5", "", { "os": "linux", "cpu": "none" }, "sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg=="], + + "@esbuild/linux-ppc64": ["@esbuild/linux-ppc64@0.21.5", "", { "os": "linux", "cpu": "ppc64" }, "sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w=="], + + "@esbuild/linux-riscv64": ["@esbuild/linux-riscv64@0.21.5", "", { "os": "linux", "cpu": "none" }, "sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA=="], + + "@esbuild/linux-s390x": ["@esbuild/linux-s390x@0.21.5", "", { "os": "linux", "cpu": "s390x" }, 
"sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A=="], + + "@esbuild/linux-x64": ["@esbuild/linux-x64@0.21.5", "", { "os": "linux", "cpu": "x64" }, "sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ=="], + + "@esbuild/netbsd-x64": ["@esbuild/netbsd-x64@0.21.5", "", { "os": "none", "cpu": "x64" }, "sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg=="], + + "@esbuild/openbsd-x64": ["@esbuild/openbsd-x64@0.21.5", "", { "os": "openbsd", "cpu": "x64" }, "sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow=="], + + "@esbuild/sunos-x64": ["@esbuild/sunos-x64@0.21.5", "", { "os": "sunos", "cpu": "x64" }, "sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg=="], + + "@esbuild/win32-arm64": ["@esbuild/win32-arm64@0.21.5", "", { "os": "win32", "cpu": "arm64" }, "sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A=="], + + "@esbuild/win32-ia32": ["@esbuild/win32-ia32@0.21.5", "", { "os": "win32", "cpu": "ia32" }, "sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA=="], + + "@esbuild/win32-x64": ["@esbuild/win32-x64@0.21.5", "", { "os": "win32", "cpu": "x64" }, "sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw=="], + + "@iconify-json/simple-icons": ["@iconify-json/simple-icons@1.2.62", "", { "dependencies": { "@iconify/types": "*" } }, "sha512-GpWQ294d4lraB3D2eBSSMROh1x9uKgpmyereLlGzVQjGZ7lbeFzby2ywXxyp4vEODmTDyf1/4WcOYs/yH4rJ5Q=="], + + "@iconify/types": ["@iconify/types@2.0.0", "", {}, "sha512-+wluvCrRhXrhyOmRDJ3q8mux9JkKy5SJ/v8ol2tu4FVjyYvtEzkc/3pK15ET6RKg4b4w4BmTk1+gsCUhf21Ykg=="], + + "@jridgewell/sourcemap-codec": ["@jridgewell/sourcemap-codec@1.5.5", "", {}, 
"sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og=="], + + "@rollup/rollup-android-arm-eabi": ["@rollup/rollup-android-arm-eabi@4.53.3", "", { "os": "android", "cpu": "arm" }, "sha512-mRSi+4cBjrRLoaal2PnqH82Wqyb+d3HsPUN/W+WslCXsZsyHa9ZeQQX/pQsZaVIWDkPcpV6jJ+3KLbTbgnwv8w=="], + + "@rollup/rollup-android-arm64": ["@rollup/rollup-android-arm64@4.53.3", "", { "os": "android", "cpu": "arm64" }, "sha512-CbDGaMpdE9sh7sCmTrTUyllhrg65t6SwhjlMJsLr+J8YjFuPmCEjbBSx4Z/e4SmDyH3aB5hGaJUP2ltV/vcs4w=="], + + "@rollup/rollup-darwin-arm64": ["@rollup/rollup-darwin-arm64@4.53.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-Nr7SlQeqIBpOV6BHHGZgYBuSdanCXuw09hon14MGOLGmXAFYjx1wNvquVPmpZnl0tLjg25dEdr4IQ6GgyToCUA=="], + + "@rollup/rollup-darwin-x64": ["@rollup/rollup-darwin-x64@4.53.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-DZ8N4CSNfl965CmPktJ8oBnfYr3F8dTTNBQkRlffnUarJ2ohudQD17sZBa097J8xhQ26AwhHJ5mvUyQW8ddTsQ=="], + + "@rollup/rollup-freebsd-arm64": ["@rollup/rollup-freebsd-arm64@4.53.3", "", { "os": "freebsd", "cpu": "arm64" }, "sha512-yMTrCrK92aGyi7GuDNtGn2sNW+Gdb4vErx4t3Gv/Tr+1zRb8ax4z8GWVRfr3Jw8zJWvpGHNpss3vVlbF58DZ4w=="], + + "@rollup/rollup-freebsd-x64": ["@rollup/rollup-freebsd-x64@4.53.3", "", { "os": "freebsd", "cpu": "x64" }, "sha512-lMfF8X7QhdQzseM6XaX0vbno2m3hlyZFhwcndRMw8fbAGUGL3WFMBdK0hbUBIUYcEcMhVLr1SIamDeuLBnXS+Q=="], + + "@rollup/rollup-linux-arm-gnueabihf": ["@rollup/rollup-linux-arm-gnueabihf@4.53.3", "", { "os": "linux", "cpu": "arm" }, "sha512-k9oD15soC/Ln6d2Wv/JOFPzZXIAIFLp6B+i14KhxAfnq76ajt0EhYc5YPeX6W1xJkAdItcVT+JhKl1QZh44/qw=="], + + "@rollup/rollup-linux-arm-musleabihf": ["@rollup/rollup-linux-arm-musleabihf@4.53.3", "", { "os": "linux", "cpu": "arm" }, "sha512-vTNlKq+N6CK/8UktsrFuc+/7NlEYVxgaEgRXVUVK258Z5ymho29skzW1sutgYjqNnquGwVUObAaxae8rZ6YMhg=="], + + "@rollup/rollup-linux-arm64-gnu": ["@rollup/rollup-linux-arm64-gnu@4.53.3", "", { "os": "linux", "cpu": "arm64" }, 
"sha512-RGrFLWgMhSxRs/EWJMIFM1O5Mzuz3Xy3/mnxJp/5cVhZ2XoCAxJnmNsEyeMJtpK+wu0FJFWz+QF4mjCA7AUQ3w=="], + + "@rollup/rollup-linux-arm64-musl": ["@rollup/rollup-linux-arm64-musl@4.53.3", "", { "os": "linux", "cpu": "arm64" }, "sha512-kASyvfBEWYPEwe0Qv4nfu6pNkITLTb32p4yTgzFCocHnJLAHs+9LjUu9ONIhvfT/5lv4YS5muBHyuV84epBo/A=="], + + "@rollup/rollup-linux-loong64-gnu": ["@rollup/rollup-linux-loong64-gnu@4.53.3", "", { "os": "linux", "cpu": "none" }, "sha512-JiuKcp2teLJwQ7vkJ95EwESWkNRFJD7TQgYmCnrPtlu50b4XvT5MOmurWNrCj3IFdyjBQ5p9vnrX4JM6I8OE7g=="], + + "@rollup/rollup-linux-ppc64-gnu": ["@rollup/rollup-linux-ppc64-gnu@4.53.3", "", { "os": "linux", "cpu": "ppc64" }, "sha512-EoGSa8nd6d3T7zLuqdojxC20oBfNT8nexBbB/rkxgKj5T5vhpAQKKnD+h3UkoMuTyXkP5jTjK/ccNRmQrPNDuw=="], + + "@rollup/rollup-linux-riscv64-gnu": ["@rollup/rollup-linux-riscv64-gnu@4.53.3", "", { "os": "linux", "cpu": "none" }, "sha512-4s+Wped2IHXHPnAEbIB0YWBv7SDohqxobiiPA1FIWZpX+w9o2i4LezzH/NkFUl8LRci/8udci6cLq+jJQlh+0g=="], + + "@rollup/rollup-linux-riscv64-musl": ["@rollup/rollup-linux-riscv64-musl@4.53.3", "", { "os": "linux", "cpu": "none" }, "sha512-68k2g7+0vs2u9CxDt5ktXTngsxOQkSEV/xBbwlqYcUrAVh6P9EgMZvFsnHy4SEiUl46Xf0IObWVbMvPrr2gw8A=="], + + "@rollup/rollup-linux-s390x-gnu": ["@rollup/rollup-linux-s390x-gnu@4.53.3", "", { "os": "linux", "cpu": "s390x" }, "sha512-VYsFMpULAz87ZW6BVYw3I6sWesGpsP9OPcyKe8ofdg9LHxSbRMd7zrVrr5xi/3kMZtpWL/wC+UIJWJYVX5uTKg=="], + + "@rollup/rollup-linux-x64-gnu": ["@rollup/rollup-linux-x64-gnu@4.53.3", "", { "os": "linux", "cpu": "x64" }, "sha512-3EhFi1FU6YL8HTUJZ51imGJWEX//ajQPfqWLI3BQq4TlvHy4X0MOr5q3D2Zof/ka0d5FNdPwZXm3Yyib/UEd+w=="], + + "@rollup/rollup-linux-x64-musl": ["@rollup/rollup-linux-x64-musl@4.53.3", "", { "os": "linux", "cpu": "x64" }, "sha512-eoROhjcc6HbZCJr+tvVT8X4fW3/5g/WkGvvmwz/88sDtSJzO7r/blvoBDgISDiCjDRZmHpwud7h+6Q9JxFwq1Q=="], + + "@rollup/rollup-openharmony-arm64": ["@rollup/rollup-openharmony-arm64@4.53.3", "", { "os": "none", "cpu": "arm64" }, 
"sha512-OueLAWgrNSPGAdUdIjSWXw+u/02BRTcnfw9PN41D2vq/JSEPnJnVuBgw18VkN8wcd4fjUs+jFHVM4t9+kBSNLw=="], + + "@rollup/rollup-win32-arm64-msvc": ["@rollup/rollup-win32-arm64-msvc@4.53.3", "", { "os": "win32", "cpu": "arm64" }, "sha512-GOFuKpsxR/whszbF/bzydebLiXIHSgsEUp6M0JI8dWvi+fFa1TD6YQa4aSZHtpmh2/uAlj/Dy+nmby3TJ3pkTw=="], + + "@rollup/rollup-win32-ia32-msvc": ["@rollup/rollup-win32-ia32-msvc@4.53.3", "", { "os": "win32", "cpu": "ia32" }, "sha512-iah+THLcBJdpfZ1TstDFbKNznlzoxa8fmnFYK4V67HvmuNYkVdAywJSoteUszvBQ9/HqN2+9AZghbajMsFT+oA=="], + + "@rollup/rollup-win32-x64-gnu": ["@rollup/rollup-win32-x64-gnu@4.53.3", "", { "os": "win32", "cpu": "x64" }, "sha512-J9QDiOIZlZLdcot5NXEepDkstocktoVjkaKUtqzgzpt2yWjGlbYiKyp05rWwk4nypbYUNoFAztEgixoLaSETkg=="], + + "@rollup/rollup-win32-x64-msvc": ["@rollup/rollup-win32-x64-msvc@4.53.3", "", { "os": "win32", "cpu": "x64" }, "sha512-UhTd8u31dXadv0MopwGgNOBpUVROFKWVQgAg5N1ESyCz8AuBcMqm4AuTjrwgQKGDfoFuz02EuMRHQIw/frmYKQ=="], + + "@shikijs/core": ["@shikijs/core@2.5.0", "", { "dependencies": { "@shikijs/engine-javascript": "2.5.0", "@shikijs/engine-oniguruma": "2.5.0", "@shikijs/types": "2.5.0", "@shikijs/vscode-textmate": "^10.0.2", "@types/hast": "^3.0.4", "hast-util-to-html": "^9.0.4" } }, "sha512-uu/8RExTKtavlpH7XqnVYBrfBkUc20ngXiX9NSrBhOVZYv/7XQRKUyhtkeflY5QsxC0GbJThCerruZfsUaSldg=="], + + "@shikijs/engine-javascript": ["@shikijs/engine-javascript@2.5.0", "", { "dependencies": { "@shikijs/types": "2.5.0", "@shikijs/vscode-textmate": "^10.0.2", "oniguruma-to-es": "^3.1.0" } }, "sha512-VjnOpnQf8WuCEZtNUdjjwGUbtAVKuZkVQ/5cHy/tojVVRIRtlWMYVjyWhxOmIq05AlSOv72z7hRNRGVBgQOl0w=="], + + "@shikijs/engine-oniguruma": ["@shikijs/engine-oniguruma@2.5.0", "", { "dependencies": { "@shikijs/types": "2.5.0", "@shikijs/vscode-textmate": "^10.0.2" } }, "sha512-pGd1wRATzbo/uatrCIILlAdFVKdxImWJGQ5rFiB5VZi2ve5xj3Ax9jny8QvkaV93btQEwR/rSz5ERFpC5mKNIw=="], + + "@shikijs/langs": ["@shikijs/langs@2.5.0", "", { "dependencies": { "@shikijs/types": "2.5.0" } }, 
"sha512-Qfrrt5OsNH5R+5tJ/3uYBBZv3SuGmnRPejV9IlIbFH3HTGLDlkqgHymAlzklVmKBjAaVmkPkyikAV/sQ1wSL+w=="], + + "@shikijs/themes": ["@shikijs/themes@2.5.0", "", { "dependencies": { "@shikijs/types": "2.5.0" } }, "sha512-wGrk+R8tJnO0VMzmUExHR+QdSaPUl/NKs+a4cQQRWyoc3YFbUzuLEi/KWK1hj+8BfHRKm2jNhhJck1dfstJpiw=="], + + "@shikijs/transformers": ["@shikijs/transformers@2.5.0", "", { "dependencies": { "@shikijs/core": "2.5.0", "@shikijs/types": "2.5.0" } }, "sha512-SI494W5X60CaUwgi8u4q4m4s3YAFSxln3tzNjOSYqq54wlVgz0/NbbXEb3mdLbqMBztcmS7bVTaEd2w0qMmfeg=="], + + "@shikijs/types": ["@shikijs/types@2.5.0", "", { "dependencies": { "@shikijs/vscode-textmate": "^10.0.2", "@types/hast": "^3.0.4" } }, "sha512-ygl5yhxki9ZLNuNpPitBWvcy9fsSKKaRuO4BAlMyagszQidxcpLAr0qiW/q43DtSIDxO6hEbtYLiFZNXO/hdGw=="], + + "@shikijs/vscode-textmate": ["@shikijs/vscode-textmate@10.0.2", "", {}, "sha512-83yeghZ2xxin3Nj8z1NMd/NCuca+gsYXswywDy5bHvwlWL8tpTQmzGeUuHd9FC3E/SBEMvzJRwWEOz5gGes9Qg=="], + + "@types/estree": ["@types/estree@1.0.8", "", {}, "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w=="], + + "@types/hast": ["@types/hast@3.0.4", "", { "dependencies": { "@types/unist": "*" } }, "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ=="], + + "@types/linkify-it": ["@types/linkify-it@5.0.0", "", {}, "sha512-sVDA58zAw4eWAffKOaQH5/5j3XeayukzDk+ewSsnv3p4yJEZHCCzMDiZM8e0OUrRvmpGZ85jf4yDHkHsgBNr9Q=="], + + "@types/markdown-it": ["@types/markdown-it@14.1.2", "", { "dependencies": { "@types/linkify-it": "^5", "@types/mdurl": "^2" } }, "sha512-promo4eFwuiW+TfGxhi+0x3czqTYJkG8qB17ZUJiVF10Xm7NLVRSLUsfRTU/6h1e24VvRnXCx+hG7li58lkzog=="], + + "@types/mdast": ["@types/mdast@4.0.4", "", { "dependencies": { "@types/unist": "*" } }, "sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA=="], + + "@types/mdurl": ["@types/mdurl@2.0.0", "", {}, 
"sha512-RGdgjQUZba5p6QEFAVx2OGb8rQDL/cPRG7GiedRzMcJ1tYnUANBncjbSB1NRGwbvjcPeikRABz2nshyPk1bhWg=="], + + "@types/unist": ["@types/unist@3.0.3", "", {}, "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q=="], + + "@types/web-bluetooth": ["@types/web-bluetooth@0.0.21", "", {}, "sha512-oIQLCGWtcFZy2JW77j9k8nHzAOpqMHLQejDA48XXMWH6tjCQHz5RCFz1bzsmROyL6PUm+LLnUiI4BCn221inxA=="], + + "@ungap/structured-clone": ["@ungap/structured-clone@1.3.0", "", {}, "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g=="], + + "@vitejs/plugin-vue": ["@vitejs/plugin-vue@5.2.4", "", { "peerDependencies": { "vite": "^5.0.0 || ^6.0.0", "vue": "^3.2.25" } }, "sha512-7Yx/SXSOcQq5HiiV3orevHUFn+pmMB4cgbEkDYgnkUWb0WfeQ/wa2yFv6D5ICiCQOVpjA7vYDXrC7AGO8yjDHA=="], + + "@vue/compiler-core": ["@vue/compiler-core@3.5.25", "", { "dependencies": { "@babel/parser": "^7.28.5", "@vue/shared": "3.5.25", "entities": "^4.5.0", "estree-walker": "^2.0.2", "source-map-js": "^1.2.1" } }, "sha512-vay5/oQJdsNHmliWoZfHPoVZZRmnSWhug0BYT34njkYTPqClh3DNWLkZNJBVSjsNMrg0CCrBfoKkjZQPM/QVUw=="], + + "@vue/compiler-dom": ["@vue/compiler-dom@3.5.25", "", { "dependencies": { "@vue/compiler-core": "3.5.25", "@vue/shared": "3.5.25" } }, "sha512-4We0OAcMZsKgYoGlMjzYvaoErltdFI2/25wqanuTu+S4gismOTRTBPi4IASOjxWdzIwrYSjnqONfKvuqkXzE2Q=="], + + "@vue/compiler-sfc": ["@vue/compiler-sfc@3.5.25", "", { "dependencies": { "@babel/parser": "^7.28.5", "@vue/compiler-core": "3.5.25", "@vue/compiler-dom": "3.5.25", "@vue/compiler-ssr": "3.5.25", "@vue/shared": "3.5.25", "estree-walker": "^2.0.2", "magic-string": "^0.30.21", "postcss": "^8.5.6", "source-map-js": "^1.2.1" } }, "sha512-PUgKp2rn8fFsI++lF2sO7gwO2d9Yj57Utr5yEsDf3GNaQcowCLKL7sf+LvVFvtJDXUp/03+dC6f2+LCv5aK1ag=="], + + "@vue/compiler-ssr": ["@vue/compiler-ssr@3.5.25", "", { "dependencies": { "@vue/compiler-dom": "3.5.25", "@vue/shared": "3.5.25" } }, 
"sha512-ritPSKLBcParnsKYi+GNtbdbrIE1mtuFEJ4U1sWeuOMlIziK5GtOL85t5RhsNy4uWIXPgk+OUdpnXiTdzn8o3A=="], + + "@vue/devtools-api": ["@vue/devtools-api@7.7.9", "", { "dependencies": { "@vue/devtools-kit": "^7.7.9" } }, "sha512-kIE8wvwlcZ6TJTbNeU2HQNtaxLx3a84aotTITUuL/4bzfPxzajGBOoqjMhwZJ8L9qFYDU/lAYMEEm11dnZOD6g=="], + + "@vue/devtools-kit": ["@vue/devtools-kit@7.7.9", "", { "dependencies": { "@vue/devtools-shared": "^7.7.9", "birpc": "^2.3.0", "hookable": "^5.5.3", "mitt": "^3.0.1", "perfect-debounce": "^1.0.0", "speakingurl": "^14.0.1", "superjson": "^2.2.2" } }, "sha512-PyQ6odHSgiDVd4hnTP+aDk2X4gl2HmLDfiyEnn3/oV+ckFDuswRs4IbBT7vacMuGdwY/XemxBoh302ctbsptuA=="], + + "@vue/devtools-shared": ["@vue/devtools-shared@7.7.9", "", { "dependencies": { "rfdc": "^1.4.1" } }, "sha512-iWAb0v2WYf0QWmxCGy0seZNDPdO3Sp5+u78ORnyeonS6MT4PC7VPrryX2BpMJrwlDeaZ6BD4vP4XKjK0SZqaeA=="], + + "@vue/reactivity": ["@vue/reactivity@3.5.25", "", { "dependencies": { "@vue/shared": "3.5.25" } }, "sha512-5xfAypCQepv4Jog1U4zn8cZIcbKKFka3AgWHEFQeK65OW+Ys4XybP6z2kKgws4YB43KGpqp5D/K3go2UPPunLA=="], + + "@vue/runtime-core": ["@vue/runtime-core@3.5.25", "", { "dependencies": { "@vue/reactivity": "3.5.25", "@vue/shared": "3.5.25" } }, "sha512-Z751v203YWwYzy460bzsYQISDfPjHTl+6Zzwo/a3CsAf+0ccEjQ8c+0CdX1WsumRTHeywvyUFtW6KvNukT/smA=="], + + "@vue/runtime-dom": ["@vue/runtime-dom@3.5.25", "", { "dependencies": { "@vue/reactivity": "3.5.25", "@vue/runtime-core": "3.5.25", "@vue/shared": "3.5.25", "csstype": "^3.1.3" } }, "sha512-a4WrkYFbb19i9pjkz38zJBg8wa/rboNERq3+hRRb0dHiJh13c+6kAbgqCPfMaJ2gg4weWD3APZswASOfmKwamA=="], + + "@vue/server-renderer": ["@vue/server-renderer@3.5.25", "", { "dependencies": { "@vue/compiler-ssr": "3.5.25", "@vue/shared": "3.5.25" }, "peerDependencies": { "vue": "3.5.25" } }, "sha512-UJaXR54vMG61i8XNIzTSf2Q7MOqZHpp8+x3XLGtE3+fL+nQd+k7O5+X3D/uWrnQXOdMw5VPih+Uremcw+u1woQ=="], + + "@vue/shared": ["@vue/shared@3.5.25", "", {}, 
"sha512-AbOPdQQnAnzs58H2FrrDxYj/TJfmeS2jdfEEhgiKINy+bnOANmVizIEgq1r+C5zsbs6l1CCQxtcj71rwNQ4jWg=="], + + "@vueuse/core": ["@vueuse/core@12.8.2", "", { "dependencies": { "@types/web-bluetooth": "^0.0.21", "@vueuse/metadata": "12.8.2", "@vueuse/shared": "12.8.2", "vue": "^3.5.13" } }, "sha512-HbvCmZdzAu3VGi/pWYm5Ut+Kd9mn1ZHnn4L5G8kOQTPs/IwIAmJoBrmYk2ckLArgMXZj0AW3n5CAejLUO+PhdQ=="], + + "@vueuse/integrations": ["@vueuse/integrations@12.8.2", "", { "dependencies": { "@vueuse/core": "12.8.2", "@vueuse/shared": "12.8.2", "vue": "^3.5.13" }, "peerDependencies": { "async-validator": "^4", "axios": "^1", "change-case": "^5", "drauu": "^0.4", "focus-trap": "^7", "fuse.js": "^7", "idb-keyval": "^6", "jwt-decode": "^4", "nprogress": "^0.2", "qrcode": "^1.5", "sortablejs": "^1", "universal-cookie": "^7" }, "optionalPeers": ["async-validator", "axios", "change-case", "drauu", "focus-trap", "fuse.js", "idb-keyval", "jwt-decode", "nprogress", "qrcode", "sortablejs", "universal-cookie"] }, "sha512-fbGYivgK5uBTRt7p5F3zy6VrETlV9RtZjBqd1/HxGdjdckBgBM4ugP8LHpjolqTj14TXTxSK1ZfgPbHYyGuH7g=="], + + "@vueuse/metadata": ["@vueuse/metadata@12.8.2", "", {}, "sha512-rAyLGEuoBJ/Il5AmFHiziCPdQzRt88VxR+Y/A/QhJ1EWtWqPBBAxTAFaSkviwEuOEZNtW8pvkPgoCZQ+HxqW1A=="], + + "@vueuse/shared": ["@vueuse/shared@12.8.2", "", { "dependencies": { "vue": "^3.5.13" } }, "sha512-dznP38YzxZoNloI0qpEfpkms8knDtaoQ6Y/sfS0L7Yki4zh40LFHEhur0odJC6xTHG5dxWVPiUWBXn+wCG2s5w=="], + + "algoliasearch": ["algoliasearch@5.46.0", "", { "dependencies": { "@algolia/abtesting": "1.12.0", "@algolia/client-abtesting": "5.46.0", "@algolia/client-analytics": "5.46.0", "@algolia/client-common": "5.46.0", "@algolia/client-insights": "5.46.0", "@algolia/client-personalization": "5.46.0", "@algolia/client-query-suggestions": "5.46.0", "@algolia/client-search": "5.46.0", "@algolia/ingestion": "1.46.0", "@algolia/monitoring": "1.46.0", "@algolia/recommend": "5.46.0", "@algolia/requester-browser-xhr": "5.46.0", "@algolia/requester-fetch": 
"5.46.0", "@algolia/requester-node-http": "5.46.0" } }, "sha512-7ML6fa2K93FIfifG3GMWhDEwT5qQzPTmoHKCTvhzGEwdbQ4n0yYUWZlLYT75WllTGJCJtNUI0C1ybN4BCegqvg=="], + + "birpc": ["birpc@2.9.0", "", {}, "sha512-KrayHS5pBi69Xi9JmvoqrIgYGDkD6mcSe/i6YKi3w5kekCLzrX4+nawcXqrj2tIp50Kw/mT/s3p+GVK0A0sKxw=="], + + "ccount": ["ccount@2.0.1", "", {}, "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg=="], + + "character-entities-html4": ["character-entities-html4@2.1.0", "", {}, "sha512-1v7fgQRj6hnSwFpq1Eu0ynr/CDEw0rXo2B61qXrLNdHZmPKgb7fqS1a2JwF0rISo9q77jDI8VMEHoApn8qDoZA=="], + + "character-entities-legacy": ["character-entities-legacy@3.0.0", "", {}, "sha512-RpPp0asT/6ufRm//AJVwpViZbGM/MkjQFxJccQRHmISF/22NBtsHqAWmL+/pmkPWoIUJdWyeVleTl1wydHATVQ=="], + + "comma-separated-tokens": ["comma-separated-tokens@2.0.3", "", {}, "sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg=="], + + "copy-anything": ["copy-anything@4.0.5", "", { "dependencies": { "is-what": "^5.2.0" } }, "sha512-7Vv6asjS4gMOuILabD3l739tsaxFQmC+a7pLZm02zyvs8p977bL3zEgq3yDk5rn9B0PbYgIv++jmHcuUab4RhA=="], + + "csstype": ["csstype@3.2.3", "", {}, "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ=="], + + "dequal": ["dequal@2.0.3", "", {}, "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA=="], + + "devlop": ["devlop@1.1.0", "", { "dependencies": { "dequal": "^2.0.0" } }, "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA=="], + + "emoji-regex-xs": ["emoji-regex-xs@1.0.0", "", {}, "sha512-LRlerrMYoIDrT6jgpeZ2YYl/L8EulRTt5hQcYjy5AInh7HWXKimpqx68aknBFpGL2+/IcogTcaydJEgaTmOpDg=="], + + "entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], + + "esbuild": ["esbuild@0.21.5", "", { "optionalDependencies": { "@esbuild/aix-ppc64": 
"0.21.5", "@esbuild/android-arm": "0.21.5", "@esbuild/android-arm64": "0.21.5", "@esbuild/android-x64": "0.21.5", "@esbuild/darwin-arm64": "0.21.5", "@esbuild/darwin-x64": "0.21.5", "@esbuild/freebsd-arm64": "0.21.5", "@esbuild/freebsd-x64": "0.21.5", "@esbuild/linux-arm": "0.21.5", "@esbuild/linux-arm64": "0.21.5", "@esbuild/linux-ia32": "0.21.5", "@esbuild/linux-loong64": "0.21.5", "@esbuild/linux-mips64el": "0.21.5", "@esbuild/linux-ppc64": "0.21.5", "@esbuild/linux-riscv64": "0.21.5", "@esbuild/linux-s390x": "0.21.5", "@esbuild/linux-x64": "0.21.5", "@esbuild/netbsd-x64": "0.21.5", "@esbuild/openbsd-x64": "0.21.5", "@esbuild/sunos-x64": "0.21.5", "@esbuild/win32-arm64": "0.21.5", "@esbuild/win32-ia32": "0.21.5", "@esbuild/win32-x64": "0.21.5" }, "bin": { "esbuild": "bin/esbuild" } }, "sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw=="], + + "estree-walker": ["estree-walker@2.0.2", "", {}, "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w=="], + + "focus-trap": ["focus-trap@7.6.6", "", { "dependencies": { "tabbable": "^6.3.0" } }, "sha512-v/Z8bvMCajtx4mEXmOo7QEsIzlIOqRXTIwgUfsFOF9gEsespdbD0AkPIka1bSXZ8Y8oZ+2IVDQZePkTfEHZl7Q=="], + + "fsevents": ["fsevents@2.3.3", "", { "os": "darwin" }, "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw=="], + + "hast-util-to-html": ["hast-util-to-html@9.0.5", "", { "dependencies": { "@types/hast": "^3.0.0", "@types/unist": "^3.0.0", "ccount": "^2.0.0", "comma-separated-tokens": "^2.0.0", "hast-util-whitespace": "^3.0.0", "html-void-elements": "^3.0.0", "mdast-util-to-hast": "^13.0.0", "property-information": "^7.0.0", "space-separated-tokens": "^2.0.0", "stringify-entities": "^4.0.0", "zwitch": "^2.0.4" } }, "sha512-OguPdidb+fbHQSU4Q4ZiLKnzWo8Wwsf5bZfbvu7//a9oTYoqD/fWpe96NuHkoS9h0ccGOTe0C4NGXdtS0iObOw=="], + + "hast-util-whitespace": ["hast-util-whitespace@3.0.0", "", { "dependencies": { 
"@types/hast": "^3.0.0" } }, "sha512-88JUN06ipLwsnv+dVn+OIYOvAuvBMy/Qoi6O7mQHxdPXpjy+Cd6xRkWwux7DKO+4sYILtLBRIKgsdpS2gQc7qw=="], + + "hookable": ["hookable@5.5.3", "", {}, "sha512-Yc+BQe8SvoXH1643Qez1zqLRmbA5rCL+sSmk6TVos0LWVfNIB7PGncdlId77WzLGSIB5KaWgTaNTs2lNVEI6VQ=="], + + "html-void-elements": ["html-void-elements@3.0.0", "", {}, "sha512-bEqo66MRXsUGxWHV5IP0PUiAWwoEjba4VCzg0LjFJBpchPaTfyfCKTG6bc5F8ucKec3q5y6qOdGyYTSBEvhCrg=="], + + "is-what": ["is-what@5.5.0", "", {}, "sha512-oG7cgbmg5kLYae2N5IVd3jm2s+vldjxJzK1pcu9LfpGuQ93MQSzo0okvRna+7y5ifrD+20FE8FvjusyGaz14fw=="], + + "magic-string": ["magic-string@0.30.21", "", { "dependencies": { "@jridgewell/sourcemap-codec": "^1.5.5" } }, "sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ=="], + + "mark.js": ["mark.js@8.11.1", "", {}, "sha512-1I+1qpDt4idfgLQG+BNWmrqku+7/2bi5nLf4YwF8y8zXvmfiTBY3PV3ZibfrjBueCByROpuBjLLFCajqkgYoLQ=="], + + "mdast-util-to-hast": ["mdast-util-to-hast@13.2.1", "", { "dependencies": { "@types/hast": "^3.0.0", "@types/mdast": "^4.0.0", "@ungap/structured-clone": "^1.0.0", "devlop": "^1.0.0", "micromark-util-sanitize-uri": "^2.0.0", "trim-lines": "^3.0.0", "unist-util-position": "^5.0.0", "unist-util-visit": "^5.0.0", "vfile": "^6.0.0" } }, "sha512-cctsq2wp5vTsLIcaymblUriiTcZd0CwWtCbLvrOzYCDZoWyMNV8sZ7krj09FSnsiJi3WVsHLM4k6Dq/yaPyCXA=="], + + "micromark-util-character": ["micromark-util-character@2.1.1", "", { "dependencies": { "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-wv8tdUTJ3thSFFFJKtpYKOYiGP2+v96Hvk4Tu8KpCAsTMs6yi+nVmGh1syvSCsaxz45J6Jbw+9DD6g97+NV67Q=="], + + "micromark-util-encode": ["micromark-util-encode@2.0.1", "", {}, "sha512-c3cVx2y4KqUnwopcO9b/SCdo2O67LwJJ/UyqGfbigahfegL9myoEFoDYZgkT7f36T0bLrM9hZTAaAyH+PCAXjw=="], + + "micromark-util-sanitize-uri": ["micromark-util-sanitize-uri@2.0.1", "", { "dependencies": { "micromark-util-character": "^2.0.0", "micromark-util-encode": "^2.0.0", "micromark-util-symbol": 
"^2.0.0" } }, "sha512-9N9IomZ/YuGGZZmQec1MbgxtlgougxTodVwDzzEouPKo3qFWvymFHWcnDi2vzV1ff6kas9ucW+o3yzJK9YB1AQ=="], + + "micromark-util-symbol": ["micromark-util-symbol@2.0.1", "", {}, "sha512-vs5t8Apaud9N28kgCrRUdEed4UJ+wWNvicHLPxCa9ENlYuAY31M0ETy5y1vA33YoNPDFTghEbnh6efaE8h4x0Q=="], + + "micromark-util-types": ["micromark-util-types@2.0.2", "", {}, "sha512-Yw0ECSpJoViF1qTU4DC6NwtC4aWGt1EkzaQB8KPPyCRR8z9TWeV0HbEFGTO+ZY1wB22zmxnJqhPyTpOVCpeHTA=="], + + "minisearch": ["minisearch@7.2.0", "", {}, "sha512-dqT2XBYUOZOiC5t2HRnwADjhNS2cecp9u+TJRiJ1Qp/f5qjkeT5APcGPjHw+bz89Ms8Jp+cG4AlE+QZ/QnDglg=="], + + "mitt": ["mitt@3.0.1", "", {}, "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw=="], + + "nanoid": ["nanoid@3.3.11", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w=="], + + "oniguruma-to-es": ["oniguruma-to-es@3.1.1", "", { "dependencies": { "emoji-regex-xs": "^1.0.0", "regex": "^6.0.1", "regex-recursion": "^6.0.2" } }, "sha512-bUH8SDvPkH3ho3dvwJwfonjlQ4R80vjyvrU8YpxuROddv55vAEJrTuCuCVUhhsHbtlD9tGGbaNApGQckXhS8iQ=="], + + "perfect-debounce": ["perfect-debounce@1.0.0", "", {}, "sha512-xCy9V055GLEqoFaHoC1SoLIaLmWctgCUaBaWxDZ7/Zx4CTyX7cJQLJOok/orfjZAh9kEYpjJa4d0KcJmCbctZA=="], + + "picocolors": ["picocolors@1.1.1", "", {}, "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA=="], + + "postcss": ["postcss@8.5.6", "", { "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", "source-map-js": "^1.2.1" } }, "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg=="], + + "preact": ["preact@10.28.0", "", {}, "sha512-rytDAoiXr3+t6OIP3WGlDd0ouCUG1iCWzkcY3++Nreuoi17y6T5i/zRhe6uYfoVcxq6YU+sBtJouuRDsq8vvqA=="], + + "property-information": ["property-information@7.1.0", "", {}, 
"sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ=="], + + "regex": ["regex@6.1.0", "", { "dependencies": { "regex-utilities": "^2.3.0" } }, "sha512-6VwtthbV4o/7+OaAF9I5L5V3llLEsoPyq9P1JVXkedTP33c7MfCG0/5NOPcSJn0TzXcG9YUrR0gQSWioew3LDg=="], + + "regex-recursion": ["regex-recursion@6.0.2", "", { "dependencies": { "regex-utilities": "^2.3.0" } }, "sha512-0YCaSCq2VRIebiaUviZNs0cBz1kg5kVS2UKUfNIx8YVs1cN3AV7NTctO5FOKBA+UT2BPJIWZauYHPqJODG50cg=="], + + "regex-utilities": ["regex-utilities@2.3.0", "", {}, "sha512-8VhliFJAWRaUiVvREIiW2NXXTmHs4vMNnSzuJVhscgmGav3g9VDxLrQndI3dZZVVdp0ZO/5v0xmX516/7M9cng=="], + + "rfdc": ["rfdc@1.4.1", "", {}, "sha512-q1b3N5QkRUWUl7iyylaaj3kOpIT0N2i9MqIEQXP73GVsN9cw3fdx8X63cEmWhJGi2PPCF23Ijp7ktmd39rawIA=="], + + "rollup": ["rollup@4.53.3", "", { "dependencies": { "@types/estree": "1.0.8" }, "optionalDependencies": { "@rollup/rollup-android-arm-eabi": "4.53.3", "@rollup/rollup-android-arm64": "4.53.3", "@rollup/rollup-darwin-arm64": "4.53.3", "@rollup/rollup-darwin-x64": "4.53.3", "@rollup/rollup-freebsd-arm64": "4.53.3", "@rollup/rollup-freebsd-x64": "4.53.3", "@rollup/rollup-linux-arm-gnueabihf": "4.53.3", "@rollup/rollup-linux-arm-musleabihf": "4.53.3", "@rollup/rollup-linux-arm64-gnu": "4.53.3", "@rollup/rollup-linux-arm64-musl": "4.53.3", "@rollup/rollup-linux-loong64-gnu": "4.53.3", "@rollup/rollup-linux-ppc64-gnu": "4.53.3", "@rollup/rollup-linux-riscv64-gnu": "4.53.3", "@rollup/rollup-linux-riscv64-musl": "4.53.3", "@rollup/rollup-linux-s390x-gnu": "4.53.3", "@rollup/rollup-linux-x64-gnu": "4.53.3", "@rollup/rollup-linux-x64-musl": "4.53.3", "@rollup/rollup-openharmony-arm64": "4.53.3", "@rollup/rollup-win32-arm64-msvc": "4.53.3", "@rollup/rollup-win32-ia32-msvc": "4.53.3", "@rollup/rollup-win32-x64-gnu": "4.53.3", "@rollup/rollup-win32-x64-msvc": "4.53.3", "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, 
"sha512-w8GmOxZfBmKknvdXU1sdM9NHcoQejwF/4mNgj2JuEEdRaHwwF12K7e9eXn1nLZ07ad+du76mkVsyeb2rKGllsA=="], + + "search-insights": ["search-insights@2.17.3", "", {}, "sha512-RQPdCYTa8A68uM2jwxoY842xDhvx3E5LFL1LxvxCNMev4o5mLuokczhzjAgGwUZBAmOKZknArSxLKmXtIi2AxQ=="], + + "shiki": ["shiki@2.5.0", "", { "dependencies": { "@shikijs/core": "2.5.0", "@shikijs/engine-javascript": "2.5.0", "@shikijs/engine-oniguruma": "2.5.0", "@shikijs/langs": "2.5.0", "@shikijs/themes": "2.5.0", "@shikijs/types": "2.5.0", "@shikijs/vscode-textmate": "^10.0.2", "@types/hast": "^3.0.4" } }, "sha512-mI//trrsaiCIPsja5CNfsyNOqgAZUb6VpJA+340toL42UpzQlXpwRV9nch69X6gaUxrr9kaOOa6e3y3uAkGFxQ=="], + + "source-map-js": ["source-map-js@1.2.1", "", {}, "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA=="], + + "space-separated-tokens": ["space-separated-tokens@2.0.2", "", {}, "sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q=="], + + "speakingurl": ["speakingurl@14.0.1", "", {}, "sha512-1POYv7uv2gXoyGFpBCmpDVSNV74IfsWlDW216UPjbWufNf+bSU6GdbDsxdcxtfwb4xlI3yxzOTKClUosxARYrQ=="], + + "stringify-entities": ["stringify-entities@4.0.4", "", { "dependencies": { "character-entities-html4": "^2.0.0", "character-entities-legacy": "^3.0.0" } }, "sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg=="], + + "superjson": ["superjson@2.2.6", "", { "dependencies": { "copy-anything": "^4" } }, "sha512-H+ue8Zo4vJmV2nRjpx86P35lzwDT3nItnIsocgumgr0hHMQ+ZGq5vrERg9kJBo5AWGmxZDhzDo+WVIJqkB0cGA=="], + + "tabbable": ["tabbable@6.3.0", "", {}, "sha512-EIHvdY5bPLuWForiR/AN2Bxngzpuwn1is4asboytXtpTgsArc+WmSJKVLlhdh71u7jFcryDqB2A8lQvj78MkyQ=="], + + "trim-lines": ["trim-lines@3.0.1", "", {}, "sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg=="], + + "unist-util-is": ["unist-util-is@6.0.1", "", { "dependencies": { "@types/unist": "^3.0.0" } }, 
"sha512-LsiILbtBETkDz8I9p1dQ0uyRUWuaQzd/cuEeS1hoRSyW5E5XGmTzlwY1OrNzzakGowI9Dr/I8HVaw4hTtnxy8g=="], + + "unist-util-position": ["unist-util-position@5.0.0", "", { "dependencies": { "@types/unist": "^3.0.0" } }, "sha512-fucsC7HjXvkB5R3kTCO7kUjRdrS0BJt3M/FPxmHMBOm8JQi2BsHAHFsy27E0EolP8rp0NzXsJ+jNPyDWvOJZPA=="], + + "unist-util-stringify-position": ["unist-util-stringify-position@4.0.0", "", { "dependencies": { "@types/unist": "^3.0.0" } }, "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ=="], + + "unist-util-visit": ["unist-util-visit@5.0.0", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-is": "^6.0.0", "unist-util-visit-parents": "^6.0.0" } }, "sha512-MR04uvD+07cwl/yhVuVWAtw+3GOR/knlL55Nd/wAdblk27GCVt3lqpTivy/tkJcZoNPzTwS1Y+KMojlLDhoTzg=="], + + "unist-util-visit-parents": ["unist-util-visit-parents@6.0.2", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-is": "^6.0.0" } }, "sha512-goh1s1TBrqSqukSc8wrjwWhL0hiJxgA8m4kFxGlQ+8FYQ3C/m11FcTs4YYem7V664AhHVvgoQLk890Ssdsr2IQ=="], + + "vfile": ["vfile@6.0.3", "", { "dependencies": { "@types/unist": "^3.0.0", "vfile-message": "^4.0.0" } }, "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q=="], + + "vfile-message": ["vfile-message@4.0.3", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw=="], + + "vite": ["vite@5.4.21", "", { "dependencies": { "esbuild": "^0.21.3", "postcss": "^8.4.43", "rollup": "^4.20.0" }, "optionalDependencies": { "fsevents": "~2.3.3" }, "peerDependencies": { "@types/node": "^18.0.0 || >=20.0.0", "less": "*", "lightningcss": "^1.21.0", "sass": "*", "sass-embedded": "*", "stylus": "*", "sugarss": "*", "terser": "^5.4.0" }, "optionalPeers": ["@types/node", "less", "lightningcss", "sass", "sass-embedded", "stylus", "sugarss", "terser"], "bin": { "vite": 
"bin/vite.js" } }, "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw=="], + + "vitepress": ["vitepress@1.6.4", "", { "dependencies": { "@docsearch/css": "3.8.2", "@docsearch/js": "3.8.2", "@iconify-json/simple-icons": "^1.2.21", "@shikijs/core": "^2.1.0", "@shikijs/transformers": "^2.1.0", "@shikijs/types": "^2.1.0", "@types/markdown-it": "^14.1.2", "@vitejs/plugin-vue": "^5.2.1", "@vue/devtools-api": "^7.7.0", "@vue/shared": "^3.5.13", "@vueuse/core": "^12.4.0", "@vueuse/integrations": "^12.4.0", "focus-trap": "^7.6.4", "mark.js": "8.11.1", "minisearch": "^7.1.1", "shiki": "^2.1.0", "vite": "^5.4.14", "vue": "^3.5.13" }, "peerDependencies": { "markdown-it-mathjax3": "^4", "postcss": "^8" }, "optionalPeers": ["markdown-it-mathjax3", "postcss"], "bin": { "vitepress": "bin/vitepress.js" } }, "sha512-+2ym1/+0VVrbhNyRoFFesVvBvHAVMZMK0rw60E3X/5349M1GuVdKeazuksqopEdvkKwKGs21Q729jX81/bkBJg=="], + + "vue": ["vue@3.5.25", "", { "dependencies": { "@vue/compiler-dom": "3.5.25", "@vue/compiler-sfc": "3.5.25", "@vue/runtime-dom": "3.5.25", "@vue/server-renderer": "3.5.25", "@vue/shared": "3.5.25" }, "peerDependencies": { "typescript": "*" }, "optionalPeers": ["typescript"] }, "sha512-YLVdgv2K13WJ6n+kD5owehKtEXwdwXuj2TTyJMsO7pSeKw2bfRNZGjhB7YzrpbMYj5b5QsUebHpOqR3R3ziy/g=="], + + "zwitch": ["zwitch@2.0.4", "", {}, "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A=="], + } +} diff --git a/docs/guide/ast-chunking.md b/docs/guide/ast-chunking.md new file mode 100644 index 0000000..a6900bd --- /dev/null +++ b/docs/guide/ast-chunking.md @@ -0,0 +1,273 @@ +# AST-Based Chunking + +CodeRAG uses Abstract Syntax Tree (AST) parsing to split code into semantic chunks rather than arbitrary character or line-based splits. This produces more meaningful search units. + +## How Tree-Sitter Parsers Work + +CodeRAG uses Synth parsers (built on tree-sitter) to parse code into AST nodes. 
Tree-sitter is a parser generator that creates fast, incremental parsers for programming languages. + +**Key concepts:** + +- **Nodes**: AST nodes represent code constructs (functions, classes, expressions, etc.) +- **Spans**: Each node has a span with start/end positions (line, column, offset) +- **Types**: Node types identify the construct (e.g., `FunctionDeclaration`, `ClassDeclaration`) +- **Tree structure**: Nodes form a hierarchical tree with parent/child relationships + +**Example AST:** + +```typescript +// Code: +function hello(name: string) { + return `Hello, ${name}` +} + +// Simplified AST: +{ + type: "FunctionDeclaration", + span: { start: { line: 0, column: 0 }, end: { line: 2, column: 1 } }, + children: [ + { type: "Identifier", value: "hello" }, + { type: "Parameters", children: [...] }, + { type: "BlockStatement", children: [...] } + ] +} +``` + +**Synth parser interface:** + +```typescript +interface SynthParser { + parseAsync: (source: string, options?: Record) => Promise +} + +interface Tree { + meta: { language: string; source: string } + root: NodeId + nodes: BaseNode[] +} +``` + +All Synth parsers use async parsing because they're WASM-based (v0.3.x). + +## Semantic Boundaries + +Semantic boundaries define where code should be split. CodeRAG chunks code at boundaries defined in the language configuration. 
+ +**Common boundaries by language:** + +**TypeScript/JavaScript:** +- `FunctionDeclaration` +- `ClassDeclaration` +- `InterfaceDeclaration` +- `TypeAliasDeclaration` +- `ExportNamedDeclaration` +- `ExportDefaultDeclaration` + +**Python:** +- `FunctionDef` +- `AsyncFunctionDef` +- `ClassDef` +- `Module` + +**Go:** +- `FuncDecl` +- `MethodDecl` +- `TypeSpec` +- `GenDecl` + +**Rust:** +- `function_item` +- `impl_item` +- `struct_item` +- `enum_item` +- `trait_item` + +**Why semantic boundaries?** + +Semantic chunking ensures each chunk is a complete, meaningful unit: + +```typescript +// Good: Function-level chunking +// Chunk 1: +export function calculateBM25(tf: number, idf: number, docLen: number, avgDocLen: number): number { + const k1 = 1.2 + const b = 0.75 + const numerator = tf * (k1 + 1) + const denominator = tf + k1 * (1 - b + b * docLen / avgDocLen) + return idf * (numerator / denominator) +} + +// Bad: Character-based chunking at 100 chars +// Chunk 1: "export function calculateBM25(tf: number, idf: number, docLen: number, avgDocLen: number): number {" +// Chunk 2: "const k1 = 1.2\n const b = 0.75\n const numerator = tf * (k1 + 1)\n const denominator = tf + k" +// Result: Incomplete, nonsensical chunks +``` + +## Language-Specific Configurations + +Each language has a configuration defining its parser, boundaries, and context types. 
+ +**Configuration structure:** + +```typescript +interface LanguageConfig { + parser: string // NPM package name + extensions: readonly string[] // File extensions + boundaries: readonly string[] // AST node types for chunking + contextTypes?: readonly string[] // Context to preserve (imports, types) + embedded?: EmbeddedLanguageConfig[] // Embedded languages + parserOptions?: Record +} +``` + +**Example: TypeScript configuration:** + +```typescript +typescript: { + parser: '@sylphx/synth-js', + extensions: ['.ts', '.mts', '.cts'], + boundaries: [ + 'FunctionDeclaration', + 'ClassDeclaration', + 'InterfaceDeclaration', + 'TypeAliasDeclaration', + 'EnumDeclaration', + 'MethodDefinition', + 'ExportNamedDeclaration', + 'ExportDefaultDeclaration', + ], + contextTypes: ['ImportDeclaration', 'TypeAliasDeclaration', 'InterfaceDeclaration'], + parserOptions: { sourceType: 'module' }, +} +``` + +**Context preservation:** + +Context types (imports, type definitions) can be prepended to each chunk for better understanding: + +```typescript +// With preserveContext: true + +// Chunk 1 (with context): +import { User } from './types' + +export function getUser(id: string): User { + return database.findById(id) +} + +// Without context, the chunk would start at "export function..." +``` + +## Chunk Metadata + +Each chunk includes metadata for precise navigation and filtering. 
+ +**ChunkResult interface:** + +```typescript +interface ChunkResult { + readonly content: string // Chunk source code + readonly type: string // AST node type + readonly startLine: number // 1-indexed + readonly endLine: number // 1-indexed + readonly metadata: Record +} +``` + +**Example chunk:** + +```typescript +{ + content: "export function parseQuery(query: string): string[] {\n return query.toLowerCase().split(/\\s+/)\n}", + type: "FunctionDeclaration", + startLine: 5, + endLine: 7, + metadata: { + name: "parseQuery", + exported: true + } +} +``` + +**Metadata uses:** + +- **Search results**: Display which function/class matched +- **Navigation**: Jump to exact line in editor +- **Filtering**: Search only specific node types (e.g., only functions) +- **Ranking**: Boost certain types (e.g., exported functions) + +## Chunking Process + +Step-by-step chunking algorithm: + +1. **Detect language**: Determine language from file extension +2. **Load parser**: Get Synth parser for the language +3. **Parse AST**: Parse source code into syntax tree +4. **Extract chunks**: Traverse tree, extract nodes at semantic boundaries +5. **Merge small chunks**: Combine small chunks below minChunkSize +6. **Split large chunks**: Recursively split chunks exceeding maxChunkSize +7. 
**Add metadata**: Attach type, line numbers, and other metadata + +**API usage:** + +```typescript +import { chunkCodeByAST } from '@sylphx/coderag' + +const chunks = await chunkCodeByAST( + sourceCode, + 'example.ts', + { + maxChunkSize: 1000, // Max chars per chunk + minChunkSize: 100, // Min chars per chunk + preserveContext: true, // Include imports/types + nodeTypes: undefined, // Chunk all boundaries + parseEmbedded: true // Parse code blocks in markdown + } +) + +for (const chunk of chunks) { + console.log(`${chunk.type} (lines ${chunk.startLine}-${chunk.endLine})`) + console.log(chunk.content) +} +``` + +**Fallback behavior:** + +If AST parsing fails (unknown language, syntax error), CodeRAG falls back to character-based chunking: + +```typescript +// Fallback chunk +{ + content: "... raw text ...", + type: "text", + startLine: 0, + endLine: 0, + metadata: { fallback: true, reason: "no-semantic-boundaries" } +} +``` + +## Supported Languages + +CodeRAG supports 15+ languages through Synth parsers: + +**Tier 1 (Full AST support):** +- JavaScript/TypeScript/JSX/TSX +- Python +- Go +- Java +- C/C++ +- Rust + +**Tier 2 (Markup/Config):** +- Markdown +- HTML +- XML +- JSON +- YAML +- TOML +- INI + +**Tier 3 (Specialized):** +- Protobuf + +See [languages.md](/Users/kyle/coderag/docs/guide/languages.md) for full details. diff --git a/docs/guide/file-watching.md b/docs/guide/file-watching.md new file mode 100644 index 0000000..b530e16 --- /dev/null +++ b/docs/guide/file-watching.md @@ -0,0 +1,450 @@ +# File Watching + +CodeRAG provides real-time file watching using @parcel/watcher for automatic index updates when files change. + +## @parcel/watcher Usage + +@parcel/watcher is a native file watcher that uses platform-specific APIs for efficient change detection. 
+ +**Why @parcel/watcher?** + +- Native performance (FSEvents on macOS, inotify on Linux, ReadDirectoryChangesW on Windows) +- Low CPU overhead (kernel-level notifications, not polling) +- Recursive directory watching +- Built-in ignore patterns +- Reliable event delivery + +**Alternatives comparison:** + +| Library | Performance | Reliability | Platform Support | +|---------|-------------|-------------|------------------| +| @parcel/watcher | Excellent (native) | High | macOS, Linux, Windows | +| chokidar | Good (fallback to polling) | Medium | All platforms | +| fs.watch | Poor (OS-dependent) | Low | Node.js built-in | + +**Installation:** + +```bash +npm install @parcel/watcher +``` + +@parcel/watcher is a native module with prebuilt binaries for major platforms. + +## Debouncing (500ms) + +File watchers emit events immediately, but multiple events often occur for a single logical change (e.g., save triggers multiple write events). + +**Debouncing behavior:** + +CodeRAG waits 500ms after the last event before processing changes. This consolidates rapid-fire events into a single update. + +**Example without debouncing:** + +``` +User saves file + β”œβ”€ 0ms: change event + β”œβ”€ 5ms: change event + β”œβ”€ 10ms: change event + └─ 15ms: change event +Result: 4 index updates (wasteful) +``` + +**Example with debouncing (500ms):** + +``` +User saves file + β”œβ”€ 0ms: change event β†’ start timer + β”œβ”€ 5ms: change event β†’ reset timer + β”œβ”€ 10ms: change event β†’ reset timer + └─ 15ms: change event β†’ reset timer + ... 500ms pass ... 
+ └─ 515ms: process update once +Result: 1 index update (efficient) +``` + +**Implementation:** + +```typescript +private pendingUpdates = new Map() + +private handleFileChange(type: 'add' | 'change' | 'unlink', absolutePath: string): void { + const relativePath = path.relative(this.codebaseRoot, absolutePath) + + // Clear existing timeout + const existing = this.pendingUpdates.get(relativePath) + if (existing) { + clearTimeout(existing) + } + + // Set new timeout (500ms) + const timeout = setTimeout(async () => { + this.pendingUpdates.delete(relativePath) + await this.processFileChange(type, relativePath, absolutePath) + }, 500) + + this.pendingUpdates.set(relativePath, timeout) +} +``` + +**Debounce duration:** + +500ms is chosen to balance responsiveness and efficiency: + +- Too short (100ms): Multiple updates for single edit +- Too long (2000ms): Feels unresponsive +- 500ms: Good balance for typical edit workflows + +## Incremental Updates + +When files change, CodeRAG updates only affected chunks rather than rebuilding the entire index. + +**Update algorithm:** + +1. **Detect change**: File watcher emits event +2. **Debounce**: Wait 500ms for additional events +3. **Hash comparison**: Check if content actually changed +4. **Delete old chunks**: Remove chunks for changed file +5. **Re-chunk**: Parse file into new chunks +6. **Update vectors**: Compute TF-IDF vectors for new chunks +7. **Rebuild IDF**: Recalculate global IDF scores (affected by all changes) +8. **Recalculate TF-IDF**: Update TF-IDF scores using new IDF +9. 
**Update metadata**: Recalculate chunk magnitudes and average doc length + +**Change detection flow:** + +```typescript +async processFileChange(type: 'add' | 'change' | 'unlink', path: string) { + if (type === 'unlink') { + // File deleted + await storage.deleteFile(path) + await rebuildIndex() + return + } + + // File added or changed + const content = await fs.readFile(absolutePath, 'utf-8') + const newHash = simpleHash(content) + + // OPTIMIZATION: Compare hash to detect real changes + const existingFile = await storage.getFile(path) + if (existingFile && existingFile.hash === newHash) { + console.log('File unchanged (same hash), skipping') + return // No actual change + } + + // Content changed, update index + const file: CodebaseFile = { + path, + content, + hash: newHash, + size: stats.size, + mtime: stats.mtime, + language: detectLanguage(path) + } + + await storage.storeFile(file) + await rebuildIndex() +} +``` + +**Hash-based optimization:** + +File watcher events trigger on `mtime` changes, but content may be unchanged (e.g., file touched without edits). Hash comparison prevents unnecessary reindexing. + +```typescript +// Simple hash function (fast, not cryptographic) +function simpleHash(content: string): string { + let hash = 0 + for (let i = 0; i < content.length; i++) { + hash = ((hash << 5) - hash + content.charCodeAt(i)) | 0 + } + return hash.toString(36) +} +``` + +## Event Handling + +@parcel/watcher emits events for file system changes. CodeRAG maps these to index operations. 
+
+**Event types:**
+
+| Watcher Event | CodeRAG Type | Action |
+|---------------|--------------|--------|
+| `create` | `add` | Index new file |
+| `update` | `change` | Re-index changed file |
+| `delete` | `unlink` | Remove file from index |
+
+**Subscription:**
+
+```typescript
+import * as watcher from '@parcel/watcher'
+
+async startWatch(): Promise<void> {
+  this.watcher = await watcher.subscribe(
+    this.codebaseRoot,
+    (err, events) => {
+      if (err) {
+        console.error('[WARN] File watcher error:', err.message)
+        return
+      }
+
+      for (const event of events) {
+        const absolutePath = event.path
+        const relativePath = path.relative(this.codebaseRoot, absolutePath)
+
+        // Skip ignored files
+        if (this.shouldIgnore(relativePath)) {
+          continue
+        }
+
+        // Map event type
+        const eventType =
+          event.type === 'create' ? 'add' :
+          event.type === 'delete' ? 'unlink' :
+          'change'
+
+        this.handleFileChange(eventType, absolutePath)
+      }
+    },
+    {
+      // Auto-detect best backend (FSEvents, inotify, etc.)
+      backend: undefined,
+
+      // Ignore common directories
+      ignore: [
+        '**/node_modules/**',
+        '**/.git/**',
+        '**/dist/**',
+        '**/build/**',
+        '**/.next/**',
+        '**/.turbo/**',
+        '**/.cache/**',
+        '**/coverage/**',
+        '**/*.log',
+      ]
+    }
+  )
+
+  this.isWatching = true
+  console.error('[SUCCESS] File watcher started (native FSEvents)')
+}
+```
+
+**Event batching:**
+
+@parcel/watcher batches events internally and delivers them in groups, reducing overhead. 
+ +```typescript +// Single callback receives all events in a batch +(err, events) => { + // events = [ + // { type: 'create', path: '/path/to/file1.ts' }, + // { type: 'update', path: '/path/to/file2.ts' }, + // { type: 'delete', path: '/path/to/file3.ts' } + // ] +} +``` + +## Watch Lifecycle + +**Starting watch mode:** + +```typescript +import { CodebaseIndexer } from '@sylphx/coderag' + +const indexer = new CodebaseIndexer({ + codebaseRoot: '/path/to/project', + watch: true, // Auto-start watching after indexing + onFileChange: (event) => { + console.log(`File ${event.type}: ${event.path}`) + } +}) + +await indexer.index() +// Watcher automatically started after indexing completes + +// Or start manually: +await indexer.startWatch() +``` + +**Stopping watch mode:** + +```typescript +await indexer.stopWatch() +// Cleans up: +// - Unsubscribes from watcher +// - Clears pending update timers +// - Releases file handles +``` + +**Watch state:** + +```typescript +const isWatching = indexer.isWatchEnabled() +// Returns: true if watching, false otherwise +``` + +## Ignore Patterns + +CodeRAG respects .gitignore and custom ignore patterns. + +**Gitignore support:** + +```typescript +import { loadGitignore } from '@sylphx/coderag/utils' + +const ignoreFilter = loadGitignore('/path/to/project') + +// Check if file should be ignored +if (ignoreFilter.ignores('node_modules/package.json')) { + console.log('File ignored') +} +``` + +Uses `ignore` library (same as git). + +**Built-in ignore patterns:** + +@parcel/watcher automatically ignores common directories: + +```typescript +ignore: [ + '**/node_modules/**', // Dependencies + '**/.git/**', // Git metadata + '**/dist/**', // Build output + '**/build/**', // Build output + '**/.next/**', // Next.js cache + '**/.turbo/**', // Turborepo cache + '**/.cache/**', // General cache + '**/coverage/**', // Test coverage + '**/*.log', // Log files +] +``` + +These are in addition to .gitignore patterns. 
+ +**Custom ignore filter:** + +```typescript +private shouldIgnore(relativePath: string): boolean { + // Skip empty paths + if (!relativePath) return true + + // Check gitignore + if (this.ignoreFilter?.ignores(relativePath)) { + return true + } + + // Custom rules + if (relativePath.startsWith('.')) { + return true // Ignore hidden files + } + + return false +} +``` + +## Performance Characteristics + +**Event latency:** + +| Platform | Native API | Latency | CPU Usage | +|----------|-----------|---------|-----------| +| macOS | FSEvents | 50-100ms | <1% | +| Linux | inotify | 10-50ms | <1% | +| Windows | ReadDirectoryChanges | 100-200ms | <2% | + +**Overhead:** + +- No polling (0% CPU when idle) +- Kernel-level notifications (efficient) +- Minimal memory footprint (~1-2MB) + +**Scalability:** + +@parcel/watcher handles large directory trees efficiently: + +| Files | Memory | Event Rate | +|-------|--------|------------| +| 1k files | 1MB | 1000+ events/sec | +| 10k files | 2MB | 1000+ events/sec | +| 100k files | 5MB | 500+ events/sec | + +## Example Usage + +**Basic watching:** + +```typescript +import { CodebaseIndexer } from '@sylphx/coderag' + +const indexer = new CodebaseIndexer({ + codebaseRoot: './src', + watch: true, + onFileChange: (event) => { + console.log(`[${event.type.toUpperCase()}] ${event.path} at ${new Date(event.timestamp).toISOString()}`) + } +}) + +// Index and start watching +await indexer.index() + +// Keep process alive +process.on('SIGINT', async () => { + await indexer.stopWatch() + process.exit(0) +}) +``` + +**Manual watch control:** + +```typescript +const indexer = new CodebaseIndexer({ + codebaseRoot: './src', + watch: false // Don't auto-start +}) + +await indexer.index() + +// Start watching later +console.log('Starting file watcher...') +await indexer.startWatch() + +// Perform searches... 
+const results = await indexer.search('async function') + +// Stop when done +await indexer.stopWatch() +``` + +**File change callbacks:** + +```typescript +const indexer = new CodebaseIndexer({ + watch: true, + onFileChange: async (event) => { + if (event.type === 'add') { + console.log(`New file: ${event.path}`) + } else if (event.type === 'change') { + console.log(`Modified: ${event.path}`) + // Notify external service + await notifyWebhook({ type: 'file_changed', path: event.path }) + } else if (event.type === 'unlink') { + console.log(`Deleted: ${event.path}`) + } + } +}) +``` + +**Conditional watching:** + +```typescript +// Only watch in development +const isDevelopment = process.env.NODE_ENV === 'development' + +const indexer = new CodebaseIndexer({ + watch: isDevelopment, + onFileChange: isDevelopment ? (event) => { + console.log(`[DEV] File changed: ${event.path}`) + } : undefined +}) +``` diff --git a/docs/guide/getting-started.md b/docs/guide/getting-started.md index ffe77e2..928c4b8 100644 --- a/docs/guide/getting-started.md +++ b/docs/guide/getting-started.md @@ -1,73 +1,107 @@ -# Getting Started +# What is CodeRAG? -## What is CodeRAG? - -CodeRAG is a lightning-fast hybrid code search library that combines TF-IDF keyword search with vector embeddings for semantic understanding. Built for RAG (Retrieval-Augmented Generation), it's perfect for AI assistants, documentation search, and IDE integration. +CodeRAG is a lightning-fast semantic code search library designed for RAG (Retrieval-Augmented Generation) applications. It combines traditional keyword search (TF-IDF/BM25) with optional vector embeddings to provide accurate, context-aware code search results. 
## Key Features -### πŸ” Hybrid Search Engine +### AST-Based Chunking + +Unlike traditional search that returns entire files, CodeRAG uses Abstract Syntax Tree (AST) parsing to split code at semantic boundaries: + +- **Functions**: Find specific function implementations +- **Classes**: Locate class definitions and methods +- **Imports**: Track module dependencies +- **Comments**: Search documentation blocks -Combines two complementary search strategies: +This means search results are more precise and consume fewer tokens when used with LLMs. -- **TF-IDF (Keyword)**: Fast, precise matching for exact terms -- **Vector (Semantic)**: Understanding meaning and context -- **Hybrid**: Weighted combination for best results +### Hybrid Search -### ⚑ High Performance +CodeRAG supports three search modes: -- **2.7x faster** initial indexing -- **166x faster** incremental updates -- **100x faster** cached queries +1. **Keyword Search (TF-IDF/BM25)**: Fast, precise matching using StarCoder2 tokenization +2. **Semantic Search (Vector)**: Meaning-based search using embeddings (requires OpenAI API) +3. 
**Hybrid Search**: Weighted combination of both for best results -### 🎯 Code-Aware Tokenization +### Performance -Uses StarCoder2 tokenization to properly handle: -- camelCase identifiers -- snake_case naming -- Code-specific patterns +| Metric | Value | +|--------|-------| +| Indexing Speed | 1000-2000 files/sec | +| Startup Time | <100ms (cached) | +| Search Latency | <50ms | +| Memory per 1000 files | ~1-2 MB | -### πŸ”Œ Extensible Architecture +### Language Support -- Registry pattern for custom embedding providers -- Support for OpenAI, OpenRouter, Together AI, Fireworks AI, Ollama -- Mock provider for testing +CodeRAG supports 15+ programming languages out of the box: -### πŸ“¦ Batteries Included +- **JavaScript/TypeScript**: JS, JSX, TS, TSX, MJS, CJS +- **Systems**: Python, Go, Rust, Java, C, C++, Ruby, PHP +- **Markup**: Markdown, HTML, XML +- **Data**: JSON, YAML, TOML, Protobuf -- MCP server for AI assistant integration -- Persistent storage with SQLite -- Automatic incremental updates -- Query caching +## Use Cases -## Installation +### AI Assistants + +CodeRAG powers AI coding assistants by providing relevant code context: + +```typescript +// User asks: "How does authentication work?" +const results = await indexer.search('authentication logic') +// Returns: auth.ts:15-45 (login function), middleware/auth.ts:10-30 (JWT validation) +``` -See the [Installation Guide](./installation.md) for detailed setup instructions. +### Code Navigation -## Quick Start +Build IDE-like "Go to Definition" features: -See the [Quick Start Guide](./quick-start.md) to get up and running in 5 minutes. 
+```typescript +const results = await indexer.search('function getUserById', { + limit: 1, + fileExtensions: ['.ts'], +}) +``` + +### Documentation Search + +Find relevant code examples for documentation: + +```typescript +const results = await indexer.search('database connection pool', { + includeContent: true, + contextLines: 5, +}) +``` -## Architecture Overview +## Architecture ``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Hybrid Search Engine β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ TF-IDF β”‚ Vector Search β”‚ -β”‚ (Keyword) β”‚ (Semantic) β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ Code Tokenization β”‚ -β”‚ (StarCoder2) β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ Persistent Storage (SQLite) β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ CodebaseIndexer β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ TF-IDF β”‚ β”‚ Vector β”‚ (optional) β”‚ +β”‚ β”‚ (BM25) β”‚ β”‚ (OpenAI) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β–Ό β”‚ +β”‚ 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ AST Chunking β”‚ β”‚ +β”‚ β”‚ (tree-sitter + synth) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ PersistentStorage β”‚ β”‚ +β”‚ β”‚ (SQLite) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ ``` ## Next Steps -- [Installation](./installation.md) - Install and configure -- [Quick Start](./quick-start.md) - Build your first search index -- [TF-IDF Search](./tfidf.md) - Learn keyword search -- [Vector Search](./vector-search.md) - Learn semantic search -- [Hybrid Search](./hybrid-search.md) - Combine both strategies +- [Installation](/guide/installation) - Install CodeRAG in your project +- [Quick Start](/guide/quick-start) - Build your first search index +- [MCP Server](/mcp/overview) - Use with AI assistants diff --git a/docs/guide/how-search-works.md b/docs/guide/how-search-works.md new file mode 100644 index 0000000..1bd4c98 --- /dev/null +++ b/docs/guide/how-search-works.md @@ -0,0 +1,189 @@ +# How Search Works + +CodeRAG uses chunk-level indexing with BM25 scoring to deliver fast, accurate code search. Unlike traditional file-level search, CodeRAG operates at the granularity of individual code blocks (functions, classes, etc.). + +## Chunk-Level Indexing + +CodeRAG indexes code at the chunk level rather than the file level. 
Each chunk represents a semantic unit extracted through AST parsing. + +**Why chunk-level?** + +- More precise search results pointing to specific functions or classes +- Better relevance scoring (matches terms within the same function, not scattered across a large file) +- Enables line-level navigation with startLine and endLine metadata + +**Example:** + +For a TypeScript file with 3 functions, CodeRAG creates 3 separate searchable chunks: + +```typescript +// File: utils.ts + +// Chunk 1: FunctionDeclaration (lines 1-5) +export function parseQuery(query: string): string[] { + return query.toLowerCase().split(/\s+/) +} + +// Chunk 2: FunctionDeclaration (lines 7-11) +export function calculateScore(tf: number, idf: number): number { + return tf * idf +} + +// Chunk 3: FunctionDeclaration (lines 13-17) +export function normalizeVector(vec: number[]): number[] { + const magnitude = Math.sqrt(vec.reduce((sum, v) => sum + v * v, 0)) + return vec.map(v => v / magnitude) +} +``` + +Each chunk is indexed independently with its own TF-IDF vector. + +## StarCoder2 Tokenization + +CodeRAG uses the StarCoder2 tokenizer for code-aware tokenization. This tokenizer understands code syntax and produces better tokens than generic text tokenizers. + +**Advantages:** + +- Preserves camelCase and snake_case as single tokens (`getUserById` stays intact, not split into `get`, `User`, `By`, `Id`) +- Recognizes common code patterns (operators, keywords, identifiers) +- Language-agnostic (works across 15+ programming languages) + +**Implementation:** + +```typescript +import { tokenize } from '@sylphx/coderag' + +const tokens = await tokenize('function getUserById(id: string)') +// Returns: ['function', 'getUserById', '(', 'id', ':', 'string', ')'] +``` + +Tokenization happens asynchronously due to WASM-based StarCoder2 model. + +## BM25 Scoring Formula + +BM25 (Best Matching 25) improves upon basic TF-IDF with two key enhancements: + +1. 
**Term frequency saturation (k1 parameter)**: Diminishing returns for repeated terms +2. **Document length normalization (b parameter)**: Adjusts for chunk length + +**Formula:** + +``` +score(C, Q) = Ξ£ IDF(qi) * (f(qi, C) * (k1 + 1)) / (f(qi, C) + k1 * (1 - b + b * |C| / avgdl)) +``` + +Where: +- `C` = chunk (document) +- `Q` = query +- `f(qi, C)` = raw frequency of term qi in chunk C +- `|C|` = chunk length (token count) +- `avgdl` = average chunk length across all chunks +- `k1 = 1.2` (term frequency saturation) +- `b = 0.75` (length normalization) + +**Parameters:** + +```typescript +// From packages/core/src/tfidf.ts +const BM25_K1 = 1.2 // Typical range: 1.2-2.0 +const BM25_B = 0.75 // 0 = no normalization, 1 = full normalization +``` + +These are industry-standard values from Elasticsearch and Lucene. + +**How it works:** + +For a query `"async function error"`, BM25 scores each chunk by: + +1. Tokenizing the query: `["async", "function", "error"]` +2. For each chunk, calculating term scores using the formula above +3. Summing term scores to get final chunk score +4. Ranking chunks by score descending + +## Query Caching + +CodeRAG caches search results using an LRU (Least Recently Used) cache to avoid re-executing identical searches. 
+ +**Cache parameters:** + +```typescript +// From packages/core/src/indexer.ts +this.searchCache = new LRUCache(100, 5) +// 100 entries max, 5 minute TTL +``` + +**Cache behavior:** + +- Maximum 100 cached queries +- 5-minute time-to-live (TTL) per entry +- LRU eviction: oldest entries removed when cache is full +- Cache invalidation on index updates (file add/change/delete) + +**Implementation:** + +```typescript +// Cache key includes query + options +const cacheKey = createCacheKey(query, { + limit: 10, + fileExtensions: ['.ts'], + pathFilter: 'src/', + excludePaths: ['node_modules/'] +}) + +const cachedResults = this.searchCache.get(cacheKey) +if (cachedResults) { + return cachedResults // Cache hit +} + +// Execute search... +const results = await searchChunks(query, options) +this.searchCache.set(cacheKey, results) +``` + +**Cache statistics:** + +Query cache performance metrics: + +```typescript +const stats = searchCache.stats() +console.log(`Hit rate: ${stats.hitRate}`) // 0-1 (1 = 100% hits) +console.log(`Size: ${stats.size}/${stats.maxSize}`) +``` + +## Search Flow + +End-to-end search process: + +1. **Query tokenization**: Convert query string to tokens using StarCoder2 +2. **Cache check**: Look up results in LRU cache +3. **SQL candidate retrieval**: Query database for chunks containing any query term +4. **BM25 scoring**: Score each candidate chunk using BM25 formula +5. **Filtering**: Apply file extension, path, and exclusion filters +6. **Ranking**: Sort by BM25 score descending +7. **Limiting**: Return top N results +8. 
**Caching**: Store results in cache for future queries + +**Performance characteristics:** + +- Tokenization: ~1-5ms per query (cached after first use) +- SQL retrieval: ~10-50ms depending on index size +- BM25 scoring: ~1ms per 100 candidates +- Total search time: typically 20-100ms for 10,000 chunks + +**SQL-based search:** + +CodeRAG uses SQL for memory-efficient search: + +```typescript +// Query chunks by terms +const candidates = await storage.searchByTerms(queryTokens, { + limit: limit * 3 // Get 3x candidates for scoring +}) + +// Candidates include: +// - chunkId, filePath, content +// - matched terms with tfidf and rawFreq +// - pre-computed magnitude and tokenCount +``` + +Pre-computed values (magnitude, tokenCount) stored in the database avoid recalculation during search. diff --git a/docs/guide/hybrid-search.md b/docs/guide/hybrid-search.md new file mode 100644 index 0000000..d478dba --- /dev/null +++ b/docs/guide/hybrid-search.md @@ -0,0 +1,355 @@ +# Hybrid Search + +Hybrid search combines keyword-based search (BM25) with semantic search (vector embeddings) to leverage the strengths of both approaches. + +## Weighted Combination Formula + +Hybrid search merges results from BM25 and vector search using a weighted average. 
+
+**Formula:**
+
+```
+hybrid_score = (vectorWeight * normalized_vector_score) + ((1 - vectorWeight) * normalized_bm25_score)
+```
+
+Where:
+- `vectorWeight`: Weight for vector search (0-1, default: 0.7)
+- `normalized_vector_score`: Vector similarity score normalized to 0-1
+- `normalized_bm25_score`: BM25 score normalized to 0-1
+
+**Score normalization:**
+
+Scores are normalized by dividing by the maximum score in each result set:
+
+```typescript
+const maxVectorScore = Math.max(...vectorResults.map(r => r.similarity), 0.01)
+const maxBM25Score = Math.max(...bm25Results.map(r => r.score), 0.01)
+
+const normalizedVectorScore = vectorScore / maxVectorScore
+const normalizedBM25Score = bm25Score / maxBM25Score
+```
+
+**Implementation:**
+
+```typescript
+function mergeSearchResults(
+  vectorResults: VectorSearchResult[],
+  tfidfResults: SearchResult[],
+  vectorWeight: number
+): HybridSearchResult[] {
+  const resultMap = new Map<string, HybridSearchResult>()
+
+  // Normalize scores
+  const maxVectorScore = Math.max(...vectorResults.map(r => r.similarity), 0.01)
+  const maxBM25Score = Math.max(...tfidfResults.map(r => r.score), 0.01)
+
+  // Add vector results
+  for (const result of vectorResults) {
+    const normalizedScore = result.similarity / maxVectorScore
+    const key = getChunkKey(result.path, result.startLine, result.endLine)
+
+    resultMap.set(key, {
+      path: result.path,
+      score: normalizedScore * vectorWeight,
+      method: 'vector',
+      similarity: result.similarity,
+      ... 
+ }) + } + + // Add/merge BM25 results + for (const result of tfidfResults) { + const normalizedScore = result.score / maxBM25Score + const key = getChunkKey(result.path, result.startLine, result.endLine) + const existing = resultMap.get(key) + + if (existing) { + // Combine scores + resultMap.set(key, { + ...existing, + score: existing.score + normalizedScore * (1 - vectorWeight), + method: 'hybrid' + }) + } else { + resultMap.set(key, { + path: result.path, + score: normalizedScore * (1 - vectorWeight), + method: 'tfidf', + ... + }) + } + } + + // Sort by combined score + return Array.from(resultMap.values()).sort((a, b) => b.score - a.score) +} +``` + +## When to Use Each Mode + +CodeRAG supports three search modes: vector-only, BM25-only, and hybrid. + +### Vector Search (vectorWeight = 1.0) + +**Best for:** +- Conceptual queries ("error handling patterns") +- Natural language questions ("how to validate user input") +- Finding similar code by meaning, not exact keywords +- Cross-language searches (similar logic in different languages) + +**Example queries:** +```typescript +"authentication middleware" // Finds auth-related code even without exact terms +"database connection pooling" // Understands concepts +"handle async errors" // Natural language +``` + +**Usage:** + +```typescript +import { semanticSearch } from '@sylphx/coderag/hybrid-search' + +const results = await semanticSearch('error handling patterns', indexer, { + limit: 10 +}) +``` + +### BM25 Search (vectorWeight = 0.0) + +**Best for:** +- Exact keyword matching (function names, class names, identifiers) +- Fast search (2-3x faster than vector search) +- No embedding provider required +- Large codebases where vector search is too slow + +**Example queries:** +```typescript +"fetchUser" // Exact function name +"UserService" // Exact class name +"calculateBM25" // Specific identifier +``` + +**Usage:** + +```typescript +import { keywordSearch } from '@sylphx/coderag/hybrid-search' + +const results = 
await keywordSearch('fetchUser', indexer, { + limit: 10 +}) +``` + +### Hybrid Search (vectorWeight = 0.7, default) + +**Best for:** +- General-purpose search (balances precision and recall) +- Queries with both keywords and concepts +- Production use cases +- Unknown query types + +**Example queries:** +```typescript +"async fetchUser error handling" // Keywords + concepts +"UserService authentication logic" // Class name + concept +"validate email format regex" // Specific + general +``` + +**Usage:** + +```typescript +import { hybridSearch } from '@sylphx/coderag/hybrid-search' + +const results = await hybridSearch('async fetchUser error handling', indexer, { + limit: 10, + vectorWeight: 0.7 // Default +}) +``` + +## Tuning vectorWeight Parameter + +The `vectorWeight` parameter controls the balance between vector and BM25 search. + +**Weight spectrum:** + +``` +0.0 Pure BM25 Exact keywords only +0.3 BM25-heavy Favor keywords, some semantic understanding +0.5 Balanced Equal weight to both approaches +0.7 Vector-heavy Favor semantics, some keyword matching (DEFAULT) +1.0 Pure Vector Meaning only, ignore exact keywords +``` + +**Recommended settings by use case:** + +| Use Case | vectorWeight | Rationale | +|----------|--------------|-----------| +| API search (exact names) | 0.2-0.3 | Favor exact matches | +| Code exploration | 0.7-0.8 | Find related code | +| Documentation search | 0.8-0.9 | Natural language queries | +| Fast lookup | 0.0 | Skip vector search | +| Semantic understanding | 1.0 | Ignore keywords | +| General search | 0.7 | **Default, works for most cases** | + +**Tuning example:** + +```typescript +// Find exact function names (favor keywords) +const results = await hybridSearch('getUserById', indexer, { + vectorWeight: 0.3 +}) + +// Find authentication-related code (favor semantics) +const results = await hybridSearch('authentication logic', indexer, { + vectorWeight: 0.8 +}) +``` + +**Experimental tuning:** + +Test different weights to find 
optimal balance for your queries: + +```typescript +const query = 'error handling' +const weights = [0.0, 0.3, 0.5, 0.7, 1.0] + +for (const w of weights) { + const results = await hybridSearch(query, indexer, { vectorWeight: w, limit: 5 }) + console.log(`Weight ${w}:`) + results.forEach(r => console.log(` ${r.path} (score: ${r.score.toFixed(3)})`)) +} +``` + +## Search Result Structure + +Hybrid search returns unified results with metadata from both approaches. + +**HybridSearchResult interface:** + +```typescript +interface HybridSearchResult { + readonly path: string // File path + readonly score: number // Combined score (0-1+) + readonly method: 'vector' | 'tfidf' | 'hybrid' + readonly matchedTerms?: string[] // From BM25 (keyword matches) + readonly similarity?: number // From vector (cosine similarity) + readonly content?: string // Chunk content or snippet + readonly chunkType?: string // AST node type + readonly startLine?: number // Chunk start line + readonly endLine?: number // Chunk end line + readonly language?: string // Programming language +} +``` + +**Method field:** + +- `'vector'`: Result only from vector search +- `'tfidf'`: Result only from BM25 search +- `'hybrid'`: Result from both (merged) + +**Example result:** + +```typescript +{ + path: 'src/auth/middleware.ts', + score: 0.856, + method: 'hybrid', + matchedTerms: ['authenticate', 'middleware'], // From BM25 + similarity: 0.92, // From vector + content: 'export async function authenticate(req, res, next) {...}', + chunkType: 'FunctionDeclaration', + startLine: 15, + endLine: 25, + language: 'typescript' +} +``` + +## Performance Comparison + +**Search time comparison (10k chunks):** + +| Mode | Time | Quality | +|------|------|---------| +| BM25 only | 10-20ms | Good for exact matches | +| Vector only | 30-50ms | Best semantic understanding | +| Hybrid (0.7) | 40-60ms | Best overall quality | + +**Trade-offs:** + +- BM25: Fastest, but misses semantic matches +- Vector: Best quality, but 
slower and requires embeddings +- Hybrid: Balanced performance and quality (recommended) + +**Optimization tips:** + +1. **Limit candidate size**: Use smaller limits for faster searches + ```typescript + const results = await hybridSearch(query, indexer, { + limit: 5 // Fewer results = faster + }) + ``` + +2. **Cache frequently used queries**: Hybrid search results are cached automatically + ```typescript + // First call: 50ms (executes both searches) + await hybridSearch('authentication', indexer) + + // Second call: <1ms (cache hit) + await hybridSearch('authentication', indexer) + ``` + +3. **Skip vector search for simple queries**: Use BM25 for exact identifiers + ```typescript + if (isSimpleIdentifier(query)) { + return keywordSearch(query, indexer) // Faster + } else { + return hybridSearch(query, indexer) // Better quality + } + ``` + +## Example Queries + +**Hybrid search scenarios:** + +```typescript +import { hybridSearch } from '@sylphx/coderag/hybrid-search' + +// Conceptual query with keywords +await hybridSearch('async error handling middleware', indexer, { + vectorWeight: 0.7, // Default + limit: 10 +}) + +// Class name lookup (favor exact match) +await hybridSearch('UserService', indexer, { + vectorWeight: 0.3, + limit: 5 +}) + +// Natural language question (favor semantics) +await hybridSearch('how to validate email addresses', indexer, { + vectorWeight: 0.9, + limit: 10 +}) + +// File-filtered search +await hybridSearch('authentication', indexer, { + vectorWeight: 0.7, + fileExtensions: ['.ts', '.tsx'], + pathFilter: 'src/auth' +}) +``` + +**Comparing methods:** + +```typescript +const query = 'database connection' + +// BM25 only: finds exact matches of "database" and "connection" +const bm25Results = await keywordSearch(query, indexer) + +// Vector only: finds all database-related code (pool, client, connection, etc.) 
+const vectorResults = await semanticSearch(query, indexer) + +// Hybrid: combines both for best results +const hybridResults = await hybridSearch(query, indexer) +``` diff --git a/docs/guide/installation.md b/docs/guide/installation.md index 272f972..deacfdf 100644 --- a/docs/guide/installation.md +++ b/docs/guide/installation.md @@ -1,18 +1,9 @@ # Installation -## Prerequisites - -- Node.js 18+ or Bun 1.0+ -- TypeScript 5.0+ (for TypeScript projects) - -## Install Core Package +## Package Installation ::: code-group -```bash [bun] -bun add @sylphx/coderag -``` - ```bash [npm] npm install @sylphx/coderag ``` @@ -21,47 +12,69 @@ npm install @sylphx/coderag pnpm add @sylphx/coderag ``` +```bash [bun] +bun add @sylphx/coderag +``` + ```bash [yarn] yarn add @sylphx/coderag ``` ::: -## Install MCP Server (Optional) +## Requirements -For AI assistant RAG integration: +- **Node.js**: 18.0.0 or higher +- **Runtime**: Node.js, Bun, or Deno -::: code-group +## Optional Dependencies -```bash [bun] -bun add @sylphx/coderag-mcp +CodeRAG uses optional dependencies for language-specific AST parsing. 
These are automatically installed when needed but can be pre-installed for faster startup: + +### Language Parsers + +```bash +# All languages (recommended) +npm install @sylphx/synth-js @sylphx/synth-python @sylphx/synth-go \ + @sylphx/synth-rust @sylphx/synth-java @sylphx/synth-c + +# Specific languages only +npm install @sylphx/synth-js # JavaScript/TypeScript +npm install @sylphx/synth-python # Python +npm install @sylphx/synth-go # Go +npm install @sylphx/synth-rust # Rust ``` -```bash [npm] -npm install @sylphx/coderag-mcp +### Vector Search (Optional) + +For semantic search with embeddings: + +```bash +# LanceDB for vector storage +npm install @lancedb/lancedb ``` -::: +## Environment Variables -## Environment Setup +### For Semantic Search -Create a `.env` file in your project root: +To enable vector-based semantic search, set your OpenAI API key: ```bash -# OpenAI API Key (required for vector search) +# Required for semantic search OPENAI_API_KEY=sk-... -# Optional: Custom base URL for OpenAI-compatible endpoints -OPENAI_BASE_URL=https://api.openrouter.ai/api/v1 +# Optional: Custom endpoint (for OpenAI-compatible APIs) +OPENAI_BASE_URL=https://api.openai.com/v1 -# Optional: Custom embedding model +# Optional: Custom model EMBEDDING_MODEL=text-embedding-3-small -# Optional: Custom embedding dimensions +# Optional: Custom dimensions EMBEDDING_DIMENSIONS=1536 ``` -### Supported Providers +### Supported Embedding Providers #### OpenAI (Official) @@ -95,32 +108,76 @@ EMBEDDING_MODEL=nomic-embed-text EMBEDDING_DIMENSIONS=768 ``` -## Verify Installation +## MCP Server Installation + +For AI assistant integration (Claude, Cursor, etc.): + +```bash +# Run directly with npx (no installation needed) +npx @sylphx/coderag-mcp --root=/path/to/project + +# Or install globally +npm install -g @sylphx/coderag-mcp +coderag-mcp --root=/path/to/project +``` + +## Verifying Installation -Create a test file: +Create a test file to verify the installation: ```typescript -// 
test-search.ts -import { CodebaseIndexer } from '@sylphx/coderag'; +// test.ts +import { CodebaseIndexer, PersistentStorage } from '@sylphx/coderag' +const storage = new PersistentStorage({ codebaseRoot: '.' }) const indexer = new CodebaseIndexer({ - codebaseRoot: process.cwd(), - indexPath: '.coderag' -}); + codebaseRoot: '.', + storage, +}) -console.log('βœ… CodeRAG installed successfully!'); +console.log('CodeRAG installed successfully!') ``` -Run it: +Run with: ```bash -bun run test-search.ts +npx tsx test.ts # or -npx tsx test-search.ts +bun test.ts +``` + +## Troubleshooting + +### Native Module Errors (Windows) + +If you encounter errors with native modules on Windows, ensure you're using the latest version which uses WASM-based parsers: + +```bash +npm install @sylphx/coderag@latest +``` + +### Tokenizer Download + +On first run, CodeRAG downloads the StarCoder2 tokenizer (~4.7MB). This is cached locally after the first download. + +``` +[INFO] Loading StarCoder2 tokenizer (4.7MB, one-time download)... +[SUCCESS] Tokenizer loaded in 406ms +``` + +### Memory Issues + +For large codebases, enable low memory mode: + +```typescript +const indexer = new CodebaseIndexer({ + codebaseRoot: './large-project', + storage: new PersistentStorage({ codebaseRoot: './large-project' }), + lowMemoryMode: true, // Uses SQL-based search +}) ``` ## Next Steps -- [Quick Start](./quick-start.md) - Build your first index -- [Embedding Providers](./providers.md) - Configure providers -- [MCP Server](../mcp/installation.md) - Set up AI integration +- [Quick Start](/guide/quick-start) - Build your first search index +- [Performance Tuning](/guide/performance) - Tune for your use case diff --git a/docs/guide/languages.md b/docs/guide/languages.md new file mode 100644 index 0000000..f7adc13 --- /dev/null +++ b/docs/guide/languages.md @@ -0,0 +1,501 @@ +# Language Support + +CodeRAG supports 15+ programming and markup languages through Synth parsers. 
Each language has dedicated AST parsing for accurate chunking. + +## Full List of 15+ Languages + +### Programming Languages (Tier 1) + +**JavaScript** +- Extensions: `.js`, `.mjs`, `.cjs` +- Parser: `@sylphx/synth-js` +- Boundaries: Functions, classes, exports +- Context: Import declarations + +**TypeScript** +- Extensions: `.ts`, `.mts`, `.cts` +- Parser: `@sylphx/synth-js` +- Boundaries: Functions, classes, interfaces, types, enums +- Context: Imports, type aliases, interfaces + +**JSX** +- Extensions: `.jsx` +- Parser: `@sylphx/synth-js` +- Boundaries: Functions, classes, JSX elements (components) +- Context: Import declarations + +**TSX** +- Extensions: `.tsx` +- Parser: `@sylphx/synth-js` +- Boundaries: Functions, classes, interfaces, types, JSX elements +- Context: Imports, type aliases, interfaces + +**Python** +- Extensions: `.py`, `.pyw`, `.pyi` +- Parser: `@sylphx/synth-python` +- Boundaries: Functions, async functions, classes, modules +- Context: Import statements + +**Go** +- Extensions: `.go` +- Parser: `@sylphx/synth-go` +- Boundaries: Functions, methods, types, general declarations +- Context: Import specifications + +**Java** +- Extensions: `.java` +- Parser: `@sylphx/synth-java` +- Boundaries: Methods, constructors, classes, interfaces, enums, annotations +- Context: Imports, package declarations + +**C** +- Extensions: `.c`, `.h` +- Parser: `@sylphx/synth-c` +- Boundaries: Functions, declarations, structs, enums, typedefs +- Context: Preprocessor includes and definitions + +**Rust** +- Extensions: `.rs` +- Parser: `@sylphx/synth-rust` +- Boundaries: Functions, implementations, structs, enums, traits, modules, macros +- Context: Use declarations + +### Markup & Config Languages (Tier 2) + +**Markdown** +- Extensions: `.md`, `.markdown`, `.mdx` +- Parser: `@sylphx/synth-md` +- Boundaries: Headings, paragraphs, code blocks, blockquotes, list items +- Embedded: Code blocks (recursive parsing enabled) + +**HTML** +- Extensions: `.html`, `.htm` +- 
Parser: `@sylphx/synth-html` +- Boundaries: Elements, comments, doctypes +- Embedded: Script tags (JavaScript), style tags (CSS) + +**XML** +- Extensions: `.xml`, `.xsl`, `.xslt`, `.xsd`, `.svg` +- Parser: `@sylphx/synth-xml` +- Boundaries: Elements, comments, processing instructions + +**JSON** +- Extensions: `.json`, `.jsonc`, `.json5` +- Parser: `@sylphx/synth-json` +- Boundaries: Objects, arrays + +**YAML** +- Extensions: `.yaml`, `.yml` +- Parser: `@sylphx/synth-yaml` +- Boundaries: Documents, mappings, sequences + +**TOML** +- Extensions: `.toml` +- Parser: `@sylphx/synth-toml` +- Boundaries: Tables, array of tables, key-value pairs + +**INI** +- Extensions: `.ini`, `.cfg`, `.conf`, `.gitconfig`, `.editorconfig` +- Parser: `@sylphx/synth-ini` +- Boundaries: Sections, properties + +### Specialized Languages (Tier 3) + +**Protocol Buffers** +- Extensions: `.proto` +- Parser: `@sylphx/synth-protobuf` +- Boundaries: Messages, services, enums, RPC definitions + +## Extension Mappings + +CodeRAG automatically detects language from file extension. 
+ +**Extension to language mapping:** + +```typescript +const EXTENSION_TO_LANGUAGE = { + // JavaScript family + '.js': 'javascript', + '.mjs': 'javascript', + '.cjs': 'javascript', + '.jsx': 'jsx', + '.ts': 'typescript', + '.mts': 'typescript', + '.cts': 'typescript', + '.tsx': 'tsx', + + // Python + '.py': 'python', + '.pyw': 'python', + '.pyi': 'python', + + // Go + '.go': 'go', + + // Java + '.java': 'java', + + // C + '.c': 'c', + '.h': 'c', + + // Rust + '.rs': 'rust', + + // Markup + '.md': 'markdown', + '.markdown': 'markdown', + '.mdx': 'markdown', + '.html': 'html', + '.htm': 'html', + '.xml': 'xml', + '.xsl': 'xml', + '.xslt': 'xml', + '.xsd': 'xml', + '.svg': 'xml', + + // Config + '.json': 'json', + '.jsonc': 'json', + '.json5': 'json', + '.yaml': 'yaml', + '.yml': 'yaml', + '.toml': 'toml', + '.ini': 'ini', + '.cfg': 'ini', + '.conf': 'ini', + '.gitconfig': 'ini', + '.editorconfig': 'ini', + + // Specialized + '.proto': 'protobuf', +} +``` + +**Language detection:** + +```typescript +import { getLanguageFromPath } from '@sylphx/coderag/language-config' + +const lang = getLanguageFromPath('src/utils.ts') +// Returns: 'typescript' + +const lang2 = getLanguageFromPath('README.md') +// Returns: 'markdown' + +const lang3 = getLanguageFromPath('unknown.xyz') +// Returns: undefined (not supported) +``` + +## Synth Parser Packages + +Each language has a dedicated Synth parser package. 
+ +**Package naming:** + +All parsers follow the naming convention: `@sylphx/synth-{language}` + +**Installation:** + +```bash +# JavaScript/TypeScript/JSX/TSX (single package) +npm install @sylphx/synth-js + +# Python +npm install @sylphx/synth-python + +# Go +npm install @sylphx/synth-go + +# Java +npm install @sylphx/synth-java + +# C +npm install @sylphx/synth-c + +# Rust +npm install @sylphx/synth-rust + +# Markdown +npm install @sylphx/synth-md + +# HTML +npm install @sylphx/synth-html + +# XML +npm install @sylphx/synth-xml + +# JSON +npm install @sylphx/synth-json + +# YAML +npm install @sylphx/synth-yaml + +# TOML +npm install @sylphx/synth-toml + +# INI +npm install @sylphx/synth-ini + +# Protobuf +npm install @sylphx/synth-protobuf +``` + +**Parser interface:** + +All Synth parsers expose the same interface: + +```typescript +interface SynthParser { + parseAsync: (source: string, options?: Record) => Promise +} + +interface Tree { + meta: { + language: string + source: string + created: number + modified: number + } + root: NodeId + nodes: BaseNode[] +} +``` + +**Parser loading:** + +CodeRAG automatically loads the correct parser based on file extension: + +```typescript +// Auto-discovery +const parser = await loadSynthParser('typescript') +// Loads: @sylphx/synth-js + +const tree = await parser.parseAsync(code) +``` + +## Adding Custom Languages + +CodeRAG's language registry is extensible. You can add custom languages without modifying core code. 
+ +**Language configuration:** + +```typescript +import { LANGUAGE_REGISTRY } from '@sylphx/coderag/language-config' + +// Add a custom language +LANGUAGE_REGISTRY['kotlin'] = { + parser: '@sylphx/synth-kotlin', + extensions: ['.kt', '.kts'], + boundaries: [ + 'FunctionDeclaration', + 'ClassDeclaration', + 'ObjectDeclaration', + 'InterfaceDeclaration' + ], + contextTypes: ['ImportDirective', 'PackageDirective'] +} + +// Now CodeRAG can parse Kotlin files +const chunks = await chunkCodeByAST(kotlinCode, 'Main.kt') +``` + +**Custom parser requirements:** + +1. **Synth-compatible**: Parser must implement `parseAsync(source, options) -> Tree` +2. **Node types**: Define boundary node types for your language's AST +3. **Context types**: (Optional) Define nodes to preserve as context + +**Example custom parser:** + +```typescript +// Custom Swift parser +LANGUAGE_REGISTRY['swift'] = { + parser: '@sylphx/synth-swift', + extensions: ['.swift'], + boundaries: [ + 'FunctionDeclaration', + 'ClassDeclaration', + 'StructDeclaration', + 'EnumDeclaration', + 'ProtocolDeclaration', + 'ExtensionDeclaration' + ], + contextTypes: ['ImportDeclaration'], + parserOptions: { + sourceType: 'module' + } +} +``` + +**Testing custom language:** + +```typescript +import { isLanguageSupported, getSupportedLanguages } from '@sylphx/coderag/language-config' + +// Check if language is supported +console.log(isLanguageSupported('kotlin')) // true (after registration) + +// List all supported languages +console.log(getSupportedLanguages()) +// ['javascript', 'typescript', ..., 'kotlin'] +``` + +## Language-Specific Features + +### Embedded Language Support + +Some languages support embedded content (e.g., JavaScript in HTML, code blocks in Markdown). 
+ +**Markdown code blocks:** + +```typescript +// Markdown with embedded TypeScript +const markdown = ` +# Example + +\`\`\`typescript +function hello(name: string) { + return \`Hello, \${name}\` +} +\`\`\` +` + +const chunks = await chunkCodeByAST(markdown, 'example.md', { + parseEmbedded: true // Enable recursive parsing +}) + +// Results in multiple chunks: +// 1. Heading: "# Example" +// 2. FunctionDeclaration (TypeScript): "function hello..." +``` + +**Configuration:** + +```typescript +markdown: { + parser: '@sylphx/synth-md', + extensions: ['.md', '.markdown', '.mdx'], + boundaries: ['heading', 'paragraph', 'code', 'blockquote'], + embedded: [ + { + nodeType: 'code', // AST node type for code blocks + langAttr: 'lang', // Attribute containing language + recursive: true // Enable recursive parsing + } + ] +} +``` + +### Context Preservation + +Context types are prepended to chunks for better understanding. + +**TypeScript example:** + +```typescript +// File: src/user.ts +import { User } from './types' +import { database } from './db' + +export function getUser(id: string): User { + return database.findById(id) +} + +export function updateUser(id: string, data: Partial): User { + return database.update(id, data) +} +``` + +**With preserveContext: true:** + +```typescript +const chunks = await chunkCodeByAST(code, 'user.ts', { + preserveContext: true +}) + +// Chunk 1: +// import { User } from './types' +// import { database } from './db' +// +// export function getUser(id: string): User { +// return database.findById(id) +// } + +// Chunk 2: +// import { User } from './types' +// import { database } from './db' +// +// export function updateUser(id: string, data: Partial): User { +// return database.update(id, data) +// } +``` + +Each chunk includes import statements for context. 
+ +**Without preserveContext:** + +```typescript +// Chunk 1: +// export function getUser(id: string): User { +// return database.findById(id) +// } + +// Chunk 2: +// export function updateUser(id: string, data: Partial): User { +// return database.update(id, data) +// } +``` + +Chunks are more concise but lose context. + +## Supported Extensions Summary + +**Quick reference:** + +| Language | Extensions | Parser | +|----------|-----------|--------| +| JavaScript | .js, .mjs, .cjs | @sylphx/synth-js | +| TypeScript | .ts, .mts, .cts | @sylphx/synth-js | +| JSX | .jsx | @sylphx/synth-js | +| TSX | .tsx | @sylphx/synth-js | +| Python | .py, .pyw, .pyi | @sylphx/synth-python | +| Go | .go | @sylphx/synth-go | +| Java | .java | @sylphx/synth-java | +| C | .c, .h | @sylphx/synth-c | +| Rust | .rs | @sylphx/synth-rust | +| Markdown | .md, .markdown, .mdx | @sylphx/synth-md | +| HTML | .html, .htm | @sylphx/synth-html | +| XML | .xml, .xsl, .xslt, .xsd, .svg | @sylphx/synth-xml | +| JSON | .json, .jsonc, .json5 | @sylphx/synth-json | +| YAML | .yaml, .yml | @sylphx/synth-yaml | +| TOML | .toml | @sylphx/synth-toml | +| INI | .ini, .cfg, .conf, .gitconfig, .editorconfig | @sylphx/synth-ini | +| Protobuf | .proto | @sylphx/synth-protobuf | + +**API:** + +```typescript +import { + getSupportedLanguages, + getSupportedExtensions, + getLanguageFromPath, + getLanguageConfig +} from '@sylphx/coderag/language-config' + +// Get all supported languages +const languages = getSupportedLanguages() +// ['javascript', 'typescript', 'python', 'go', ...] + +// Get all supported extensions +const extensions = getSupportedExtensions() +// ['.js', '.ts', '.py', '.go', ...] + +// Detect language from file path +const lang = getLanguageFromPath('src/App.tsx') +// 'tsx' + +// Get language configuration +const config = getLanguageConfig('typescript') +// { parser: '@sylphx/synth-js', extensions: ['.ts', ...], ... 
} +``` diff --git a/docs/guide/performance.md b/docs/guide/performance.md new file mode 100644 index 0000000..123e4ea --- /dev/null +++ b/docs/guide/performance.md @@ -0,0 +1,511 @@ +# Performance Tuning + +CodeRAG is optimized for speed and memory efficiency. This guide covers optimization strategies for large codebases. + +## Memory Optimization + +CodeRAG provides multiple strategies to reduce memory usage. + +### Low Memory Mode + +Low memory mode uses SQL-based search instead of loading the entire index into RAM. + +**Enabling:** + +```typescript +import { CodebaseIndexer } from '@sylphx/coderag' +import { PersistentStorage } from '@sylphx/coderag/storage-persistent' + +const storage = new PersistentStorage() + +const indexer = new CodebaseIndexer({ + storage, + lowMemoryMode: true // Default: true with PersistentStorage +}) +``` + +**Memory comparison:** + +| Codebase Size | In-Memory | Low Memory | Reduction | +|---------------|-----------|------------|-----------| +| 1k chunks | 10 MB | 5 MB | 50% | +| 10k chunks | 100 MB | 10 MB | 90% | +| 100k chunks | 1 GB | 15 MB | 98.5% | + +**Trade-offs:** + +- Memory: 90% reduction for large codebases +- Search speed: ~50% slower (15-30ms vs 10-20ms) +- Indexing speed: Unchanged (same time to build index) + +**When to use:** + +- Large codebases (10k+ files) +- Memory-constrained environments +- Long-running processes (servers, daemons) + +**When to avoid:** + +- Small codebases (<1k files) where memory isn't a concern +- Latency-critical applications requiring <10ms search +- Development environments with plenty of RAM + +### Chunk-Based Indexing + +CodeRAG indexes at the chunk level rather than file level, reducing memory for large files. 
+ +**Example:** + +```typescript +// Large file: 10,000 lines, 500 KB + +// File-level indexing: +// - 1 document vector (500 KB content in memory) +// - Large TF-IDF vector (many unique terms) + +// Chunk-level indexing: +// - 50 chunks (average 200 lines each) +// - 50 smaller TF-IDF vectors (fewer terms per chunk) +// - More precise search (function-level granularity) +``` + +**Memory savings:** + +For a 1 MB file split into 10 chunks: + +``` +File-level: 1 MB content + 100 KB TF-IDF vector = 1.1 MB +Chunk-level: 1 MB content + 10 * 10 KB TF-IDF vectors = 1.1 MB (similar) + +But with low memory mode: +File-level: 100 KB TF-IDF vector in RAM +Chunk-level: Only query candidates loaded (typically 10-50 chunks) +``` + +### Streaming Indexing + +CodeRAG processes files in batches to avoid loading everything at once. + +**Batch processing:** + +```typescript +const indexer = new CodebaseIndexer({ + indexingBatchSize: 50 // Process 50 files at a time (default) +}) +``` + +**Memory profile during indexing:** + +``` +Without batching (10k files): + Peak memory: ~2 GB (all files loaded) + +With batching (10k files, batch=50): + Peak memory: ~100 MB (50 files at a time) +``` + +**Batch size tuning:** + +| Batch Size | Memory Usage | Indexing Speed | Recommendation | +|------------|--------------|----------------|----------------| +| 10 | Very low | Slow (many DB writes) | Memory-critical | +| 50 | Low | Fast | **Default** | +| 100 | Medium | Faster | High-memory systems | +| 500 | High | Fastest | RAM-abundant | + +**Example:** + +```typescript +// Low memory environment (512 MB RAM) +const indexer = new CodebaseIndexer({ + indexingBatchSize: 10 +}) + +// High memory environment (16 GB RAM) +const indexer = new CodebaseIndexer({ + indexingBatchSize: 200 +}) +``` + +## Batch Sizes + +CodeRAG uses batching at multiple levels for efficiency. + +### File Batch Size + +Controls how many files are processed together during indexing. 
+ +**Configuration:** + +```typescript +const indexer = new CodebaseIndexer({ + indexingBatchSize: 50 // Files per batch +}) +``` + +**Impact:** + +- **Memory**: Smaller batches = lower peak memory +- **Speed**: Larger batches = fewer DB transactions (faster) +- **I/O**: Larger batches = better disk cache utilization + +**Tuning guide:** + +```typescript +// For 1 GB RAM or less +indexingBatchSize: 10 + +// For 4 GB RAM (typical laptop) +indexingBatchSize: 50 // Default + +// For 16 GB RAM or more +indexingBatchSize: 100-200 + +// For 64 GB RAM (server) +indexingBatchSize: 500 +``` + +### Vector Batch Size + +Controls how many embeddings are generated at once. + +**Configuration:** + +```typescript +const indexer = new CodebaseIndexer({ + embeddingProvider: provider, + vectorBatchSize: 10 // Embeddings per API call (default) +}) +``` + +**Impact:** + +- **API calls**: Larger batches = fewer API requests (lower cost) +- **Latency**: Larger batches = longer per-request latency +- **Rate limits**: Too large batches may hit rate limits + +**Tuning guide:** + +```typescript +// Conservative (avoid rate limits) +vectorBatchSize: 5 + +// Default (good balance) +vectorBatchSize: 10 + +// Aggressive (faster but may hit limits) +vectorBatchSize: 50 + +// Maximum (check provider limits) +vectorBatchSize: 100 +``` + +**OpenAI rate limits:** + +``` +text-embedding-3-small: + - Tier 1: 3,000 RPM, 1M TPM + - Tier 2: 5,000 RPM, 5M TPM + - Tier 3: 5,000 RPM, 5M TPM + +Recommended batch size: + - Tier 1: 10 (safe) + - Tier 2+: 20-50 (faster) +``` + +### SQL Batch Size + +Internal batch sizes for database operations. + +**Document vectors:** + +```typescript +// Internal constant in storage-persistent.ts +const BATCH_SIZE = 199 // SQLite variable limit: ~999 / 5 fields = 199 rows +``` + +SQLite has a limit of ~999 bind variables. With 5 fields per row (chunkId, term, tf, tfidf, rawFreq), the maximum batch is 199 rows. 
+ +**IDF scores:** + +```typescript +// Internal constant +const BATCH_SIZE = 300 // 3 fields per row: term, idf, documentFrequency +``` + +**These are internal optimizations and cannot be configured.** + +## Caching Strategies + +CodeRAG uses multiple cache layers for performance. + +### Query Token Cache + +Caches tokenized queries to avoid re-tokenization. + +**Configuration:** + +```typescript +// Internal cache in tfidf.ts +const queryTokenCache = new Map() +const QUERY_CACHE_MAX_SIZE = 100 +``` + +**Behavior:** + +- Stores up to 100 unique queries +- LRU eviction (oldest removed when full) +- No TTL (persists until evicted or cleared) + +**Performance impact:** + +``` +Without cache: + Query "async function" β†’ tokenize (~5ms) β†’ search (15ms) = 20ms total + +With cache (hit): + Query "async function" β†’ cache hit (~0.01ms) β†’ search (15ms) = 15ms total + +Speedup: 25% for repeated queries +``` + +### Search Result Cache + +Caches complete search results including scores and snippets. + +**Configuration:** + +```typescript +// In CodebaseIndexer +this.searchCache = new LRUCache(100, 5) +// 100 entries, 5 minute TTL +``` + +**Parameters:** + +- **Max size**: 100 queries +- **TTL**: 5 minutes +- **Eviction**: LRU (least recently used) + +**Cache key:** + +Cache key includes all search parameters: + +```typescript +const cacheKey = createCacheKey(query, { + limit: 10, + fileExtensions: ['.ts'], + pathFilter: 'src/', + excludePaths: ['node_modules/'] +}) +``` + +Different options = different cache entry. 
+ +**Invalidation:** + +Cache is invalidated on index updates: + +```typescript +// File changed +await storage.storeFile(file) +searchCache.invalidate() // Clear all cached results +``` + +**Performance impact:** + +``` +Cache miss (first query): + "async function" β†’ search (20ms) + +Cache hit (repeated query): + "async function" β†’ cache (<1ms) + +Speedup: 20x for repeated queries +``` + +### Vector Storage Cache + +LanceDB (vector storage) has internal caching. + +**Behavior:** + +- Recently accessed vectors cached in memory +- Automatic cache management (no configuration needed) +- Typical cache size: 10-50 MB + +**No user configuration required.** + +## Optimization Checklist + +**For large codebases (10k+ files):** + +- [x] Enable low memory mode +- [x] Use persistent storage (PersistentStorage) +- [x] Tune indexing batch size (10-50 files) +- [x] Reduce vector batch size if hitting rate limits (5-10) +- [x] Monitor memory usage and adjust accordingly + +**For memory-constrained environments:** + +- [x] `lowMemoryMode: true` +- [x] `indexingBatchSize: 10` +- [x] Skip vector search (no embeddingProvider) +- [x] Use BM25-only search (faster, no embeddings) + +**For high-performance requirements:** + +- [x] Increase indexing batch size (100-200) +- [x] Increase vector batch size (20-50) +- [x] Use in-memory mode if RAM permits (lowMemoryMode: false) +- [x] Enable hybrid search with tuned vectorWeight + +## Performance Benchmarks + +**Indexing speed:** + +| Files | Chunks | Time | Throughput | +|-------|--------|------|------------| +| 100 | 500 | 2s | 50 files/s | +| 1,000 | 5,000 | 15s | 67 files/s | +| 10,000 | 50,000 | 180s | 56 files/s | +| 100,000 | 500,000 | 2,400s | 42 files/s | + +Performance degrades slightly for very large codebases due to IDF recalculation. 
+ +**Search speed:** + +| Mode | Chunks | Time | Notes | +|------|--------|------|-------| +| BM25 (in-memory) | 10k | 10ms | Fastest | +| BM25 (low-memory) | 10k | 20ms | SQL overhead | +| Vector | 10k | 40ms | Embedding + search | +| Hybrid (0.7) | 10k | 50ms | Both searches | + +**Memory usage:** + +| Mode | Chunks | RAM | +|------|--------|-----| +| In-memory | 1k | 10 MB | +| In-memory | 10k | 100 MB | +| In-memory | 100k | 1 GB | +| Low-memory | 1k | 5 MB | +| Low-memory | 10k | 10 MB | +| Low-memory | 100k | 15 MB | + +## Example Configurations + +**Default (balanced):** + +```typescript +const indexer = new CodebaseIndexer({ + storage: new PersistentStorage(), + lowMemoryMode: true, // Use SQL-based search + indexingBatchSize: 50, // 50 files per batch + vectorBatchSize: 10, // 10 embeddings per API call + embeddingProvider: provider +}) +``` + +**Memory-optimized:** + +```typescript +const indexer = new CodebaseIndexer({ + storage: new PersistentStorage(), + lowMemoryMode: true, + indexingBatchSize: 10, // Smaller batches + vectorBatchSize: 5, // Fewer API calls + maxFileSize: 524288 // 512 KB limit (skip large files) +}) +``` + +**Speed-optimized:** + +```typescript +const indexer = new CodebaseIndexer({ + storage: new PersistentStorage(), + lowMemoryMode: false, // In-memory search + indexingBatchSize: 200, // Large batches + vectorBatchSize: 50, // Large embedding batches + embeddingProvider: provider +}) +``` + +**BM25-only (no embeddings):** + +```typescript +const indexer = new CodebaseIndexer({ + storage: new PersistentStorage(), + lowMemoryMode: true, + indexingBatchSize: 50 + // No embeddingProvider = BM25-only search +}) + +// Fast keyword search +const results = await indexer.search('async function') +``` + +**Hybrid with custom weights:** + +```typescript +import { hybridSearch } from '@sylphx/coderag/hybrid-search' + +const indexer = new CodebaseIndexer({ + storage: new PersistentStorage(), + embeddingProvider: provider, + vectorBatchSize: 20 
+}) + +// Tune vectorWeight for your use case +const results = await hybridSearch(query, indexer, { + vectorWeight: 0.8, // Favor semantic search + limit: 20 +}) +``` + +## Profiling + +**Measure indexing time:** + +```typescript +const start = Date.now() + +await indexer.index({ + onProgress: (current, total, file) => { + const elapsed = Date.now() - start + const rate = current / (elapsed / 1000) + console.log(`[${current}/${total}] ${file} (${rate.toFixed(1)} files/s)`) + } +}) + +const elapsed = Date.now() - start +console.log(`Indexing complete: ${elapsed}ms`) +``` + +**Measure search time:** + +```typescript +const start = Date.now() +const results = await indexer.search('async function') +const elapsed = Date.now() - start + +console.log(`Search time: ${elapsed}ms (${results.length} results)`) +``` + +**Monitor cache hit rate:** + +```typescript +const stats = indexer.searchCache.stats() +console.log(`Cache hit rate: ${(stats.hitRate * 100).toFixed(1)}%`) +console.log(`Cache size: ${stats.size}/${stats.maxSize}`) +``` + +**Memory profiling:** + +```typescript +// Node.js memory usage +const used = process.memoryUsage() +console.log(`Heap used: ${(used.heapUsed / 1024 / 1024).toFixed(2)} MB`) +console.log(`Heap total: ${(used.heapTotal / 1024 / 1024).toFixed(2)} MB`) +console.log(`RSS: ${(used.rss / 1024 / 1024).toFixed(2)} MB`) +``` diff --git a/docs/guide/quick-start.md b/docs/guide/quick-start.md index ed1ce86..cdbb9cf 100644 --- a/docs/guide/quick-start.md +++ b/docs/guide/quick-start.md @@ -1,218 +1,226 @@ # Quick Start -Get started with CodeRAG in 5 minutes. +Get up and running with CodeRAG in 5 minutes. -## 1. Install +## Basic Usage -```bash -bun add @sylphx/coderag -``` +### 1. Create an Indexer -## 2. 
Set Up Environment +```typescript +import { CodebaseIndexer, PersistentStorage } from '@sylphx/coderag' -Create `.env`: +// Create persistent storage (SQLite database) +const storage = new PersistentStorage({ + codebaseRoot: './my-project', +}) -```bash -OPENAI_API_KEY=sk-... +// Create the indexer +const indexer = new CodebaseIndexer({ + codebaseRoot: './my-project', + storage, +}) ``` -## 3. Create Indexer +### 2. Index Your Codebase ```typescript -// index.ts -import { CodebaseIndexer } from '@sylphx/coderag'; - -const indexer = new CodebaseIndexer({ - codebaseRoot: '/path/to/your/project', - indexPath: '.coderag' -}); - -// Index your codebase -await indexer.index(); - -console.log('βœ… Indexing complete!'); +// Index all files +await indexer.index() + +// Or index with progress reporting +await indexer.index({ + onProgress: (current, total, file) => { + console.log(`[${current}/${total}] ${file}`) + }, +}) ``` -Run it: +### 3. Search -```bash -bun run index.ts +```typescript +// Search for code +const results = await indexer.search('authentication middleware', { + limit: 10, + includeContent: true, +}) + +// Results include file path, score, and code snippet +for (const result of results) { + console.log(`${result.path}:${result.startLine}-${result.endLine}`) + console.log(`Score: ${result.score}`) + console.log(`Type: ${result.chunkType}`) + console.log(result.snippet) + console.log('---') +} ``` -## 4. Search Your Codebase +## Complete Example ```typescript -// search.ts -import { CodebaseIndexer } from '@sylphx/coderag'; - -const indexer = new CodebaseIndexer({ - codebaseRoot: '/path/to/your/project', - indexPath: '.coderag' -}); - -// Hybrid search (keyword + semantic) -const results = await indexer.search('user authentication', { - limit: 10, - vectorWeight: 0.7, // 70% semantic, 30% keyword - includeContent: true -}); - -console.log(`Found ${results.length} results:`); -results.forEach((result, i) => { - console.log(`\n${i + 1}. 
${result.path} (score: ${result.score.toFixed(3)})`); - console.log(` ${result.content?.slice(0, 100)}...`); -}); -``` +import { CodebaseIndexer, PersistentStorage } from '@sylphx/coderag' + +async function main() { + // Setup + const storage = new PersistentStorage({ codebaseRoot: '.' }) + const indexer = new CodebaseIndexer({ + codebaseRoot: '.', + storage, + maxFileSize: 1024 * 1024, // 1MB max file size + }) + + // Index with file watching + console.log('Indexing codebase...') + await indexer.index({ watch: true }) + console.log(`Indexed ${await indexer.getIndexedCount()} files`) + + // Search + const query = 'database connection' + console.log(`\nSearching for: "${query}"`) + + const results = await indexer.search(query, { + limit: 5, + includeContent: true, + fileExtensions: ['.ts', '.js'], + }) + + // Display results + for (const result of results) { + console.log(`\nπŸ“„ ${result.path}:${result.startLine || 0}`) + console.log(` Score: ${result.score.toFixed(3)}`) + console.log(` Type: ${result.chunkType || 'unknown'}`) + if (result.snippet) { + console.log(` Preview: ${result.snippet.slice(0, 100)}...`) + } + } -Run it: + // Keep watching for changes + console.log('\nπŸ‘οΈ Watching for file changes...') +} -```bash -bun run search.ts +main().catch(console.error) ``` -## Search Strategies +## With Semantic Search -### Hybrid Search (Recommended) - -Best of both worlds - combines keyword precision with semantic understanding: +Enable vector-based semantic search for meaning-based results: ```typescript -const results = await indexer.search('authentication logic', { - vectorWeight: 0.7 // 70% semantic, 30% keyword -}); -``` - -### Keyword Search (Fast) +import { + CodebaseIndexer, + PersistentStorage, + createEmbeddingProvider, + hybridSearch, +} from '@sylphx/coderag' + +// Create embedding provider (requires OPENAI_API_KEY) +const embeddingProvider = await createEmbeddingProvider({ + provider: 'openai', + model: 'text-embedding-3-small', + dimensions: 
1536, +}) + +const storage = new PersistentStorage({ codebaseRoot: '.' }) +const indexer = new CodebaseIndexer({ + codebaseRoot: '.', + storage, + embeddingProvider, +}) -Traditional TF-IDF for exact term matching: +await indexer.index() -```typescript -const results = await indexer.keywordSearch('getUserData', { - limit: 10 -}); +// Hybrid search: 70% semantic, 30% keyword +const results = await hybridSearch('user authentication flow', indexer, { + vectorWeight: 0.7, + limit: 10, +}) ``` -### Semantic Search (Smart) - -Vector search for understanding meaning: +## Search Options ```typescript -const results = await indexer.semanticSearch('database connection pool', { - limit: 10 -}); -``` +interface SearchOptions { + // Number of results to return + limit?: number // default: 10 -## Incremental Updates + // Include code snippets in results + includeContent?: boolean // default: true -Only reindex changed files: + // Filter by file extension + fileExtensions?: string[] // e.g., ['.ts', '.tsx'] -```typescript -// Initial index -await indexer.index(); + // Filter by path pattern + pathFilter?: string // e.g., 'src/components' -// ... make changes to your codebase ... + // Exclude paths + excludePaths?: string[] // e.g., ['node_modules', 'dist'] -// Incremental update (166x faster!) 
-await indexer.index(); // Automatically detects changes -``` + // Context lines around matches + contextLines?: number // default: 3 -## Search Options + // Max characters per snippet + maxSnippetChars?: number // default: 2000 -```typescript -interface SearchOptions { - limit?: number; // Max results (default: 10) - minScore?: number; // Minimum relevance score (0-1) - includeContent?: boolean; // Include file content in results - vectorWeight?: number; // Semantic vs keyword balance (0-1) + // Max snippet blocks per file + maxSnippetBlocks?: number // default: 4 } ``` -## Example: Full-Featured Search +## File Watching -```typescript -import { CodebaseIndexer } from '@sylphx/coderag'; - -const indexer = new CodebaseIndexer({ - codebaseRoot: process.cwd(), - indexPath: '.coderag' -}); - -// Build index -console.log('πŸ“¦ Indexing codebase...'); -await indexer.index(); - -// Search with all options -const results = await indexer.search('error handling middleware', { - limit: 5, - minScore: 0.5, - includeContent: true, - vectorWeight: 0.8 // Favor semantic understanding -}); +Enable automatic re-indexing when files change: -// Display results -console.log(`\nπŸ” Found ${results.length} results:\n`); +```typescript +// Start watching +await indexer.index({ watch: true }) -results.forEach((result, i) => { - console.log(`${i + 1}. ${result.path}`); - console.log(` Score: ${result.score.toFixed(3)}`); - console.log(` Language: ${result.language}`); +// Or start/stop manually +await indexer.startWatch() +await indexer.stopWatch() - if (result.content) { - const preview = result.content.slice(0, 150).replace(/\n/g, ' '); - console.log(` Preview: ${preview}...`); - } - - console.log(''); -}); +// Check status +console.log(indexer.isWatchEnabled()) // true/false ``` -## Performance Tips +## Keyword vs Semantic Search -### 1. 
Use Query Caching +### Keyword Search (Default) -Caching is automatic and provides 100x speedup for repeated queries: +Best for exact matches and code symbols: ```typescript -// First query: ~130ms -const results1 = await indexer.search('authentication'); - -// Cached query: ~1.3ms (100x faster!) -const results2 = await indexer.search('authentication'); +// Good for: function names, variable names, exact matches +const results = await indexer.search('getUserById') +const results = await indexer.search('handleSubmit') ``` -### 2. Incremental Updates +### Semantic Search -Only reindex changed files instead of full rebuild: +Best for conceptual queries: ```typescript -// Initial index: ~13s for 250 files -await indexer.index(); - -// Change 5 files... +import { hybridSearch } from '@sylphx/coderag' -// Incremental update: ~2.6s (166x faster!) -await indexer.index(); +// Good for: concepts, descriptions, "how does X work" +const results = await hybridSearch('authentication flow', indexer, { + vectorWeight: 0.9, // Almost pure semantic +}) ``` -### 3. 
Tune Vector Weight +### Hybrid Search (Recommended) -Adjust based on your search needs: +Best for general queries: ```typescript -// Favor keyword precision (code symbols) -const results1 = await indexer.search('getUserData', { vectorWeight: 0.3 }); - -// Favor semantic understanding (concepts) -const results2 = await indexer.search('user authentication', { vectorWeight: 0.8 }); +import { hybridSearch } from '@sylphx/coderag' -// Balanced (default) -const results3 = await indexer.search('error handling', { vectorWeight: 0.5 }); +// Balanced keyword + semantic +const results = await hybridSearch('user login validation', indexer, { + vectorWeight: 0.7, // 70% semantic, 30% keyword +}) ``` ## Next Steps -- [TF-IDF Search](./tfidf.md) - Deep dive into keyword search -- [Vector Search](./vector-search.md) - Understand semantic search -- [Hybrid Search](./hybrid-search.md) - Master combined strategies -- [Embedding Providers](./providers.md) - Configure providers -- [Performance Tuning](./performance.md) - Optimize for your use case +- [How Search Works](/guide/how-search-works) - Understand the search algorithm +- [AST Chunking](/guide/ast-chunking) - Learn about semantic chunking +- [MCP Server](/mcp/overview) - Use with AI assistants diff --git a/docs/guide/storage.md b/docs/guide/storage.md new file mode 100644 index 0000000..91891dc --- /dev/null +++ b/docs/guide/storage.md @@ -0,0 +1,477 @@ +# Persistent Storage + +CodeRAG uses SQLite with LibSQL for persistent, memory-efficient storage. This enables incremental updates and low-memory operation. + +## SQLite with LibSQL + +LibSQL is a fork of SQLite optimized for embedded applications with WASM support. 
+ +**Why LibSQL?** + +- Embedded database (no separate server) +- ACID transactions (reliable updates) +- Fast reads/writes +- Low memory footprint +- WASM-compatible (works in browsers and edge environments) +- Drizzle ORM integration + +**Database location:** + +Storage is in `~/.coderag/projects//coderag.db`: + +```typescript +import { getCoderagDataDir } from '@sylphx/coderag/db/client' + +const dataDir = getCoderagDataDir('/path/to/codebase') +// Returns: /Users/username/.coderag/projects/abc123/ + +const dbPath = path.join(dataDir, 'coderag.db') +// Returns: /Users/username/.coderag/projects/abc123/coderag.db +``` + +The hash is based on the codebase absolute path, ensuring each project has its own isolated database. + +**Creating storage:** + +```typescript +import { PersistentStorage } from '@sylphx/coderag/storage-persistent' + +// Default: creates database in ~/.coderag/projects// +const storage = new PersistentStorage() + +// Custom path +const storage = new PersistentStorage({ + url: 'file:///custom/path/coderag.db' +}) + +// In-memory (for testing) +const storage = new PersistentStorage({ + url: ':memory:' +}) +``` + +## Database Schema + +CodeRAG uses a chunk-based schema optimized for granular search. + +**Schema overview:** + +``` +files + └─ chunks (1:N) + └─ document_vectors (1:N) + +idf_scores (global) +index_metadata (global) +``` + +**Files table:** + +Stores file metadata and full content. + +```sql +CREATE TABLE files ( + id INTEGER PRIMARY KEY, + path TEXT NOT NULL UNIQUE, + content TEXT NOT NULL, + hash TEXT NOT NULL, + size INTEGER NOT NULL, + mtime INTEGER NOT NULL, + language TEXT, + indexed_at INTEGER NOT NULL +) +``` + +**Chunks table:** + +Stores code chunks extracted via AST parsing. 
+ +```sql +CREATE TABLE chunks ( + id INTEGER PRIMARY KEY, + file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, + content TEXT NOT NULL, + type TEXT NOT NULL, + start_line INTEGER NOT NULL, + end_line INTEGER NOT NULL, + metadata TEXT, + token_count INTEGER, + magnitude REAL, + FOREIGN KEY(file_id) REFERENCES files(id) +) +``` + +Key fields: +- `type`: AST node type (e.g., `FunctionDeclaration`) +- `start_line`, `end_line`: Line numbers for navigation +- `token_count`: Used for BM25 length normalization +- `magnitude`: Pre-computed for cosine similarity + +**Document vectors table:** + +Stores TF-IDF vectors for each chunk. + +```sql +CREATE TABLE document_vectors ( + id INTEGER PRIMARY KEY, + chunk_id INTEGER NOT NULL REFERENCES chunks(id) ON DELETE CASCADE, + term TEXT NOT NULL, + tf REAL NOT NULL, + tfidf REAL NOT NULL, + raw_freq INTEGER NOT NULL, + FOREIGN KEY(chunk_id) REFERENCES chunks(id) +) + +CREATE INDEX idx_vectors_chunk ON document_vectors(chunk_id) +CREATE INDEX idx_vectors_term ON document_vectors(term) +``` + +One row per (chunk, term) pair. Enables efficient term-based search. + +**IDF scores table:** + +Stores global IDF scores for all terms. + +```sql +CREATE TABLE idf_scores ( + term TEXT PRIMARY KEY, + idf REAL NOT NULL, + document_frequency INTEGER NOT NULL +) +``` + +Shared across all chunks for IDF calculation. + +**Index metadata table:** + +Stores key-value metadata. + +```sql +CREATE TABLE index_metadata ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated_at INTEGER NOT NULL +) +``` + +Used for: +- Average document length (`avgDocLength`) +- Index version +- Last update timestamp + +## Low Memory Mode + +Low memory mode uses SQL-based search instead of loading the entire index into memory. 
+ +**Enabling low memory mode:** + +```typescript +import { CodebaseIndexer } from '@sylphx/coderag' +import { PersistentStorage } from '@sylphx/coderag/storage-persistent' + +const storage = new PersistentStorage() + +const indexer = new CodebaseIndexer({ + storage, + lowMemoryMode: true // Default: true when using PersistentStorage +}) +``` + +**Memory comparison:** + +| Mode | Memory Usage | Search Speed | +|------|--------------|--------------| +| In-memory | ~100MB per 10k chunks | 10-20ms | +| Low memory | ~10MB baseline | 15-30ms | + +Low memory mode sacrifices 50% search speed for 90% memory reduction. + +**How it works:** + +**In-memory mode:** +1. Load all chunks and TF-IDF vectors into RAM +2. Build in-memory search index +3. Search uses in-memory data structures + +**Low memory mode:** +1. Query database for chunks matching query terms +2. Compute BM25 scores on-the-fly +3. Return top results + +**SQL-based search:** + +```typescript +// Get chunks containing query terms +const candidates = await storage.searchByTerms(queryTokens, { limit: 100 }) + +// Candidates include: +// - chunkId, filePath, content +// - matchedTerms (with tfidf and rawFreq) +// - pre-computed magnitude and tokenCount + +// Score candidates using BM25 (in memory, only for candidates) +for (const candidate of candidates) { + let score = 0 + for (const term of matchedTerms) { + const termData = candidate.matchedTerms.get(term) + const tf = termData.rawFreq + const idf = idfScores.get(term) + score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * docLen / avgDocLen)) + } +} +``` + +Only top candidates are loaded into memory, not the entire index. + +## Migrations + +CodeRAG uses Drizzle ORM for schema migrations. + +**Migration system:** + +Migrations ensure database schema stays up-to-date across versions. 
+ +```typescript +import { runMigrations } from '@sylphx/coderag/db/migrations' + +// Auto-runs on storage initialization +const storage = new PersistentStorage() +// Migrations applied automatically +``` + +**Migration tracking:** + +Migrations are tracked in the `__drizzle_migrations` table: + +```sql +CREATE TABLE __drizzle_migrations ( + id INTEGER PRIMARY KEY, + hash TEXT NOT NULL, + created_at INTEGER +) +``` + +Each migration runs once, identified by its hash. + +**Current migrations:** + +```typescript +// packages/core/src/db/migrations.ts + +export async function runMigrations(client: Client) { + // Create files table + await client.execute(` + CREATE TABLE IF NOT EXISTS files (...) + `) + + // Create chunks table + await client.execute(` + CREATE TABLE IF NOT EXISTS chunks (...) + `) + + // Create document_vectors table with indexes + await client.execute(` + CREATE TABLE IF NOT EXISTS document_vectors (...) + `) + await client.execute(` + CREATE INDEX IF NOT EXISTS idx_vectors_chunk ON document_vectors(chunk_id) + `) + await client.execute(` + CREATE INDEX IF NOT EXISTS idx_vectors_term ON document_vectors(term) + `) + + // Create idf_scores table + await client.execute(` + CREATE TABLE IF NOT EXISTS idf_scores (...) + `) + + // Create index_metadata table + await client.execute(` + CREATE TABLE IF NOT EXISTS index_metadata (...) + `) +} +``` + +Migrations are idempotent (safe to run multiple times). + +## Batch Operations + +CodeRAG uses batch operations for efficient bulk updates. + +**Batch file storage:** + +```typescript +const files: CodebaseFile[] = [...] 
// 1000 files + +// Bad: One transaction per file (slow) +for (const file of files) { + await storage.storeFile(file) // 1000 transactions +} + +// Good: Batch transaction (fast) +await storage.storeFiles(files) // 1 batch transaction +``` + +**Implementation:** + +```typescript +async storeFiles(files: CodebaseFile[]): Promise { + await this.client.batch( + files.map(file => ({ + sql: `INSERT INTO files (...) VALUES (?, ?, ?, ...) + ON CONFLICT(path) DO UPDATE SET ...`, + args: [file.path, file.content, file.hash, ...] + })), + 'write' + ) +} +``` + +Batch size limit: ~500-1000 files per batch (SQLite variable limit). + +**Batch chunk vectors:** + +```typescript +const chunkVectors: Array<{ + chunkId: number + terms: Map + tokenCount: number +}> = [...] + +// Store all vectors in batches +await storage.storeManyChunkVectors(chunkVectors) +``` + +Vectors are batched in groups of 199 (SQLite has ~999 bind variable limit, 5 fields per row = 199 rows max). + +**Batch IDF updates:** + +```typescript +// Rebuild IDF for all terms using SQL +await storage.rebuildIdfScoresFromVectors() + +// SQL aggregation (no need to load data into memory) +const dfResults = await db.select({ + term: documentVectors.term, + df: sql`COUNT(DISTINCT ${documentVectors.chunkId})` +}) +.from(documentVectors) +.groupBy(documentVectors.term) + +// Batch insert IDF scores +const scores = dfResults.map(row => ({ + term: row.term, + idf: Math.log((totalChunks + 1) / (row.df + 1)) + 1, + documentFrequency: row.df +})) + +// Insert in batches of 300 +for (let i = 0; i < scores.length; i += 300) { + await db.insert(idfScores).values(scores.slice(i, i + 300)) +} +``` + +## Incremental Updates + +CodeRAG supports incremental updates for efficient file watching. + +**Update flow:** + +1. **Detect changes**: Compare filesystem with database +2. **Delete old data**: Remove chunks and vectors for changed/deleted files +3. **Insert new data**: Add chunks and vectors for new/changed files +4. 
**Rebuild IDF**: Recalculate IDF scores globally +5. **Update TF-IDF**: Recalculate TF-IDF scores using new IDF +6. **Update metadata**: Recalculate chunk magnitudes and average doc length + +**Example:** + +```typescript +// User edits src/utils.ts +// CodeRAG detects change via file watcher + +// 1. Get terms for old chunks (for IDF recalculation) +const affectedTerms = await storage.getTermsForFiles(['src/utils.ts']) + +// 2. Delete old chunks +await storage.deleteFiles(['src/utils.ts']) + +// 3. Re-chunk file +const chunks = await chunkCodeByAST(newContent, 'src/utils.ts') + +// 4. Store file and chunks +await storage.storeFile({ path: 'src/utils.ts', content: newContent, ... }) +const chunkIds = await storage.storeChunks('src/utils.ts', chunks) + +// 5. Build TF-IDF vectors for new chunks +const chunkVectors = buildVectors(chunks) +await storage.storeManyChunkVectors(chunkVectors) + +// 6. Rebuild global IDF scores +await storage.rebuildIdfScoresFromVectors() + +// 7. Recalculate TF-IDF scores for all chunks +await storage.recalculateTfidfScores() + +// 8. 
Update pre-computed magnitudes +await storage.updateChunkMagnitudes() +``` + +Incremental updates are significantly faster than full reindex: + +| Operation | Full Rebuild | Incremental | +|-----------|--------------|-------------| +| 1 file changed | 30 seconds | 0.5 seconds | +| 10 files changed | 30 seconds | 3 seconds | +| 100 files changed | 30 seconds | 15 seconds | + +## Storage API + +**Key methods:** + +```typescript +class PersistentStorage { + // File operations + async storeFile(file: CodebaseFile): Promise + async storeFiles(files: CodebaseFile[]): Promise + async getFile(path: string): Promise + async getAllFiles(): Promise + async deleteFile(path: string): Promise + async deleteFiles(paths: string[]): Promise + async count(): Promise + + // Chunk operations + async storeChunks(filePath: string, chunks: ChunkData[]): Promise + async storeManyChunks(fileChunks: Array<{ filePath, chunks }>): Promise> + async getChunksForFile(filePath: string): Promise + async getAllChunks(): Promise + async getChunkCount(): Promise + + // Vector operations + async storeChunkVectors(chunkId, terms, tokenCount): Promise + async storeManyChunkVectors(chunkVectors): Promise + async getChunkVectors(chunkId): Promise> + async getAllChunkVectors(): Promise>> + + // IDF operations + async storeIdfScores(idf, docFreq): Promise + async getIdfScores(): Promise> + async getIdfScoresForTerms(terms): Promise> + async rebuildIdfScoresFromVectors(): Promise + + // Search operations + async searchByTerms(queryTerms, options): Promise + async getAverageDocLength(): Promise + async updateAverageDocLength(): Promise + + // Metadata operations + async setMetadata(key, value): Promise + async getMetadata(key): Promise + + // Index maintenance + async recalculateTfidfScores(): Promise + async updateChunkMagnitudes(): Promise + async clear(): Promise + close(): void +} +``` diff --git a/docs/guide/tfidf.md b/docs/guide/tfidf.md new file mode 100644 index 0000000..e72a337 --- /dev/null +++ 
b/docs/guide/tfidf.md @@ -0,0 +1,335 @@ +# TF-IDF and BM25 + +CodeRAG uses BM25, an improved version of TF-IDF, for keyword-based search. This page explains the mathematical foundations and implementation details. + +## Term Frequency (TF) + +Term frequency measures how often a term appears in a document (chunk). + +**Formula:** + +``` +TF(t, d) = count(t in d) / total_terms(d) +``` + +Where: +- `t` = term (token) +- `d` = document (chunk) +- `count(t in d)` = number of times term t appears in document d +- `total_terms(d)` = total number of tokens in document d + +**Example:** + +```typescript +// Chunk: "async function fetchUser(userId) { return await api.get(userId) }" +// Tokens: ["async", "function", "fetchUser", "(", "userId", ")", "{", "return", "await", "api", ".", "get", "(", "userId", ")", "}"] +// Total: 16 tokens + +TF("userId", chunk) = 2 / 16 = 0.125 +TF("async", chunk) = 1 / 16 = 0.0625 +TF("return", chunk) = 1 / 16 = 0.0625 +``` + +**Implementation:** + +```typescript +function calculateTF(termFrequency: Map): Map { + const totalTerms = Array.from(termFrequency.values()).reduce((sum, freq) => sum + freq, 0) + const tf = new Map() + + for (const [term, freq] of termFrequency.entries()) { + tf.set(term, freq / totalTerms) + } + + return tf +} +``` + +## Inverse Document Frequency (IDF) + +IDF measures how rare a term is across all documents. Rare terms are more informative than common terms. + +**Formula (smoothed):** + +``` +IDF(t) = log((N + 1) / (df(t) + 1)) + 1 +``` + +Where: +- `N` = total number of chunks +- `df(t)` = document frequency (number of chunks containing term t) +- `+1` terms provide smoothing (prevent zero/negative values) + +**Why smoothing?** + +Standard IDF `log(N / df)` becomes 0 when a term appears in all documents. Smoothed IDF ensures every term has a positive score. 
+ +**Example:** + +```typescript +// Index: 1000 chunks total + +IDF("fetchUser") = log((1000 + 1) / (5 + 1)) + 1 = 4.22 // Rare +IDF("async") = log((1000 + 1) / (200 + 1)) + 1 = 2.61 // Common +IDF("function") = log((1000 + 1) / (800 + 1)) + 1 = 1.25 // Very common +``` + +Rare terms like `fetchUser` get higher IDF scores. + +**Implementation:** + +```typescript +function calculateIDF( + documents: Map[], + totalDocuments: number +): Map { + const documentFrequency = new Map() + + // Count chunks containing each term + for (const doc of documents) { + const uniqueTerms = new Set(doc.keys()) + for (const term of uniqueTerms) { + documentFrequency.set(term, (documentFrequency.get(term) || 0) + 1) + } + } + + // Calculate IDF for each term + const idf = new Map() + for (const [term, docFreq] of documentFrequency.entries()) { + idf.set(term, Math.log((totalDocuments + 1) / (docFreq + 1)) + 1) + } + + return idf +} +``` + +## TF-IDF Calculation + +TF-IDF combines TF and IDF to score term importance in a document. + +**Formula:** + +``` +TF-IDF(t, d) = TF(t, d) * IDF(t) +``` + +**Example:** + +```typescript +// Term: "fetchUser" +TF("fetchUser", chunk) = 2 / 16 = 0.125 +IDF("fetchUser") = 4.22 + +TF-IDF("fetchUser", chunk) = 0.125 * 4.22 = 0.5275 +``` + +**Implementation:** + +```typescript +function calculateTFIDF(tf: Map, idf: Map): Map { + const tfidf = new Map() + + for (const [term, tfScore] of tf.entries()) { + const idfScore = idf.get(term) || 0 + tfidf.set(term, tfScore * idfScore) + } + + return tfidf +} +``` + +Each chunk is represented as a TF-IDF vector (map of term to score). + +## BM25 Formula + +BM25 (Best Matching 25) improves TF-IDF with saturation and normalization. 
+ +**Full formula:** + +``` +BM25(d, q) = Ξ£ IDF(qi) * (f(qi, d) * (k1 + 1)) / (f(qi, d) + k1 * (1 - b + b * |d| / avgdl)) + for qi in q +``` + +Where: +- `d` = document (chunk) +- `q` = query +- `qi` = query term i +- `f(qi, d)` = raw frequency of qi in d +- `|d|` = document length (token count) +- `avgdl` = average document length across all chunks +- `k1` = term frequency saturation parameter (default: 1.2) +- `b` = length normalization parameter (default: 0.75) +- `IDF(qi)` = inverse document frequency of qi + +**Key improvements over TF-IDF:** + +1. **Term frequency saturation (k1)**: Diminishing returns for repeated terms + - A term appearing 10 times is not 10x more important than appearing once + - k1 = 1.2 means TF plateaus around 2.2x boost + +2. **Length normalization (b)**: Penalizes long chunks + - Prevents long chunks from dominating just by containing more terms + - b = 0.75 means 75% length normalization, 25% unchanged + +**BM25 Parameters:** + +```typescript +const BM25_K1 = 1.2 // Saturation: 1.2-2.0 typical +const BM25_B = 0.75 // Normalization: 0 = none, 1 = full +``` + +These values are industry standards from Elasticsearch and Lucene. + +**Implementation:** + +```typescript +export async function searchDocumentsFromStorage( + query: string, + candidates: StorageSearchResult[], + idf: Map, + options: { avgDocLength?: number } = {} +): Promise { + const queryTokens = await getQueryTokens(query) + + let avgDocLength = options.avgDocLength + if (!avgDocLength) { + const totalTokens = candidates.reduce((sum, c) => sum + (c.tokenCount || 0), 0) + avgDocLength = candidates.length > 0 ? 
totalTokens / candidates.length : 1 + } + avgDocLength = Math.max(avgDocLength, 1) + + const results = [] + + for (const candidate of candidates) { + const matchedTerms = [] + for (const term of queryTokens) { + if (candidate.matchedTerms.has(term)) { + matchedTerms.push(term) + } + } + + if (matchedTerms.length === 0) continue + + const docLen = candidate.tokenCount || 1 + let score = 0 + + for (const term of matchedTerms) { + const termFreq = candidate.matchedTerms.get(term).rawFreq + const termIdf = idf.get(term) || 0 + + // BM25 term score + const numerator = termFreq * (BM25_K1 + 1) + const denominator = termFreq + BM25_K1 * (1 - BM25_B + (BM25_B * docLen) / avgDocLength) + score += termIdf * (numerator / denominator) + } + + results.push({ uri: `file://${candidate.path}`, score, matchedTerms }) + } + + return results.sort((a, b) => b.score - a.score) +} +``` + +## Code-Aware Tokenization + +CodeRAG uses StarCoder2, a code-aware tokenizer that understands programming syntax. + +**Why StarCoder2?** + +Generic tokenizers split code incorrectly: + +```typescript +// Generic tokenizer (word-based): +"getUserById" β†’ ["get", "User", "By", "Id"] // Broken +"snake_case" β†’ ["snake", "case"] // Lost underscore + +// StarCoder2: +"getUserById" β†’ ["getUserById"] // Preserved +"snake_case" β†’ ["snake_case"] // Preserved +``` + +**Tokenization interface:** + +```typescript +import { tokenize } from '@sylphx/coderag' + +const tokens = await tokenize('async function getUserById(id: string)') +// Returns: ['async', 'function', 'getUserById', '(', 'id', ':', 'string', ')'] +``` + +**Tokenization caching:** + +Query tokens are cached to avoid re-tokenization: + +```typescript +// Query token cache (LRU) +const queryTokenCache = new Map() +const QUERY_CACHE_MAX_SIZE = 100 + +async function getCachedQueryTokens(query: string): Promise { + const cached = queryTokenCache.get(query) + if (cached) return cached + + const tokens = [...new Set(await tokenize(query))] + + // LRU 
eviction + if (queryTokenCache.size >= QUERY_CACHE_MAX_SIZE) { + const firstKey = queryTokenCache.keys().next().value + if (firstKey) queryTokenCache.delete(firstKey) + } + + queryTokenCache.set(query, tokens) + return tokens +} +``` + +Cache stores up to 100 unique queries, evicting oldest when full. + +## Vector Magnitude + +For cosine similarity search (used as TF-IDF fallback), vectors need normalized magnitude. + +**Magnitude formula:** + +``` +magnitude(v) = sqrt(Ξ£ vi^2) +``` + +**Implementation:** + +```typescript +function calculateMagnitude(vector: Map): number { + let sum = 0 + for (const value of vector.values()) { + sum += value * value + } + return Math.sqrt(sum) +} +``` + +**Cosine similarity:** + +```typescript +export function calculateCosineSimilarity( + queryVector: Map, + docVector: DocumentVector +): number { + let dotProduct = 0 + + for (const [term, queryScore] of queryVector.entries()) { + const docScore = docVector.terms.get(term) || 0 + dotProduct += queryScore * docScore + } + + const queryMagnitude = calculateMagnitude(queryVector) + + if (queryMagnitude === 0 || docVector.magnitude === 0) { + return 0 + } + + return dotProduct / (queryMagnitude * docVector.magnitude) +} +``` + +Cosine similarity ranges from 0 (orthogonal) to 1 (identical). diff --git a/docs/guide/vector-search.md b/docs/guide/vector-search.md new file mode 100644 index 0000000..3f33783 --- /dev/null +++ b/docs/guide/vector-search.md @@ -0,0 +1,376 @@ +# Vector Search + +CodeRAG supports semantic search through vector embeddings. Unlike keyword search (BM25), vector search understands meaning and context. + +## How Embeddings Work + +Embeddings convert text into high-dimensional vectors (arrays of numbers) that capture semantic meaning. + +**Key concept:** + +Similar code has similar vectors. Vectors are compared using cosine similarity or distance metrics. 
+ +**Example:** + +```typescript +// These have similar embeddings (close in vector space): +"async function fetchUser(id)" +"function getUser(userId)" +"async getUserById(id)" + +// These are dissimilar (far apart): +"async function fetchUser(id)" +"render UI component" +``` + +**Vector dimensions:** + +- OpenAI `text-embedding-3-small`: 1536 dimensions +- OpenAI `text-embedding-3-large`: 3072 dimensions +- Custom models: configurable + +Higher dimensions capture more nuance but cost more storage and compute. + +**Embedding representation:** + +```typescript +const embedding = [0.023, -0.015, 0.042, ..., 0.011] // 1536 numbers for text-embedding-3-small +``` + +## OpenAI Provider Setup + +CodeRAG uses the Vercel AI SDK with OpenAI for embeddings. + +**Installation:** + +```bash +npm install @sylphx/coderag @ai-sdk/openai ai +``` + +**Environment variables:** + +```bash +# .env +OPENAI_API_KEY=sk-... # Required +EMBEDDING_MODEL=text-embedding-3-small # Optional (default) +EMBEDDING_DIMENSIONS=1536 # Optional (auto-detected for known models) +OPENAI_BASE_URL=https://api.openai.com/v1 # Optional (for OpenAI-compatible endpoints) +``` + +**Programmatic configuration:** + +```typescript +import { CodebaseIndexer } from '@sylphx/coderag' +import { createEmbeddingProvider, createDefaultConfig } from '@sylphx/coderag/embeddings' + +// Auto-detect from environment +const provider = await getDefaultEmbeddingProvider() + +// Custom configuration +const customProvider = createEmbeddingProvider({ + provider: 'openai', + model: 'text-embedding-3-large', + dimensions: 3072, + apiKey: process.env.OPENAI_API_KEY, + batchSize: 10 +}) + +const indexer = new CodebaseIndexer({ + embeddingProvider: customProvider, + vectorBatchSize: 10 +}) +``` + +**Supported providers:** + +- `openai`: Official OpenAI API +- `openai-compatible`: OpenAI-compatible endpoints (OpenRouter, Together AI, etc.) 
+- `mock`: Deterministic mock embeddings for testing + +**OpenAI-compatible example:** + +```typescript +const provider = createEmbeddingProvider({ + provider: 'openai-compatible', + model: 'custom-embedding-model', + dimensions: 768, + apiKey: process.env.API_KEY, + baseURL: 'https://api.together.xyz/v1' +}) +``` + +## Vector Storage (LanceDB) + +CodeRAG uses LanceDB for efficient vector storage and retrieval. + +**Why LanceDB?** + +- Embedded database (no separate server) +- Fast vector search with ANN (approximate nearest neighbor) +- Disk-based storage (low memory footprint) +- Native support for filtering and metadata + +**Storage location:** + +Vectors are stored in `~/.coderag/projects//vectors.lance`: + +```typescript +import { getCoderagDataDir } from '@sylphx/coderag/db/client' + +const dataDir = getCoderagDataDir('/path/to/codebase') +// Returns: /Users/username/.coderag/projects/abc123/ + +const vectorDbPath = path.join(dataDir, 'vectors.lance') +// Returns: /Users/username/.coderag/projects/abc123/vectors.lance +``` + +**Vector document structure:** + +```typescript +interface VectorDocument { + readonly id: string // Unique identifier + readonly embedding: number[] // Vector (1536 or 3072 dims) + readonly metadata: { + readonly type: 'code' | 'knowledge' + readonly language?: string // e.g., 'typescript' + readonly content?: string // Code snippet (preview) + readonly chunkType?: string // e.g., 'FunctionDeclaration' + readonly path?: string // File path + readonly startLine?: number + readonly endLine?: number + } +} +``` + +**Chunk-level embeddings:** + +CodeRAG generates one embedding per chunk (not per file): + +```typescript +// Vector ID format: chunk://path:startLine-endLine +const doc: VectorDocument = { + id: 'chunk://src/utils.ts:5-10', + embedding: [0.023, -0.015, ...], + metadata: { + type: 'code', + chunkType: 'FunctionDeclaration', + language: 'typescript', + content: 'export function parseQuery(query: string): string[] {...}', + path: 
'src/utils.ts', + startLine: 5, + endLine: 10 + } +} +``` + +**Batch insertion:** + +Embeddings are added in batches for performance: + +```typescript +const batchSize = 10 +const chunks = [...] // Array of chunks + +for (let i = 0; i < chunks.length; i += batchSize) { + const batch = chunks.slice(i, i + batchSize) + + // Generate embeddings for batch + const embeddings = await embeddingProvider.generateEmbeddings( + batch.map(c => c.content) + ) + + // Add to vector storage + for (let j = 0; j < batch.length; j++) { + await vectorStorage.addDocument({ + id: `chunk://${batch[j].path}:${batch[j].startLine}-${batch[j].endLine}`, + embedding: embeddings[j], + metadata: { ... } + }) + } +} +``` + +Default batch size is 10 (configurable via `vectorBatchSize` option). + +## Cosine Similarity Scoring + +Vector similarity is measured using cosine similarity, which compares the angle between two vectors. + +**Formula:** + +``` +cosine_similarity(A, B) = (A Β· B) / (||A|| * ||B||) +``` + +Where: +- `A Β· B` = dot product of A and B +- `||A||` = magnitude (length) of vector A +- `||B||` = magnitude (length) of vector B + +**Range:** + +- 1.0 = identical (same direction) +- 0.0 = orthogonal (no similarity) +- -1.0 = opposite (rarely happens with embeddings) + +**Implementation:** + +```typescript +export const cosineSimilarity = (vecA: number[], vecB: number[]): number => { + if (vecA.length !== vecB.length) { + throw new Error(`Vector dimension mismatch`) + } + + const { dotProduct, normA, normB } = vecA.reduce( + (acc, aVal, i) => { + const bVal = vecB[i] + return { + dotProduct: acc.dotProduct + aVal * bVal, + normA: acc.normA + aVal * aVal, + normB: acc.normB + bVal * bVal, + } + }, + { dotProduct: 0, normA: 0, normB: 0 } + ) + + if (normA === 0 || normB === 0) { + return 0 + } + + return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)) +} +``` + +**LanceDB distance:** + +LanceDB uses L2 (Euclidean) distance by default. 
CodeRAG converts to similarity: + +```typescript +// LanceDB returns L2 distance +const distance = result._distance // e.g., 0.5 + +// Convert to similarity score +const similarity = 1 / (1 + distance) // 0.67 +``` + +**Search example:** + +```typescript +import { VectorStorage } from '@sylphx/coderag/vector-storage' + +const vectorStorage = new VectorStorage({ + dimensions: 1536, + dbPath: './vectors.lance' +}) + +// Generate query embedding +const queryEmbedding = await embeddingProvider.generateEmbedding( + 'async function to fetch user data' +) + +// Search for similar vectors +const results = await vectorStorage.search(queryEmbedding, { + k: 10, // Top 10 results + minScore: 0.7 // Minimum similarity threshold +}) + +for (const result of results) { + console.log(`${result.doc.metadata.path} (similarity: ${result.similarity})`) + console.log(result.doc.metadata.content) +} +``` + +## Embedding Generation + +**Single embedding:** + +```typescript +const embedding = await embeddingProvider.generateEmbedding( + 'async function fetchUser(id: string)' +) +// Returns: number[] (1536 dimensions) +``` + +**Batch embeddings:** + +```typescript +const texts = [ + 'function fetchUser(id)', + 'class UserService', + 'interface User' +] + +const embeddings = await embeddingProvider.generateEmbeddings(texts) +// Returns: number[][] (array of 1536-dim vectors) +``` + +Batch generation is more efficient (single API call). + +## Mock Provider + +For development and testing, CodeRAG includes a mock embedding provider. 
+ +**When to use:** + +- Testing without OpenAI API key +- CI/CD pipelines +- Development environments + +**Behavior:** + +- Generates deterministic embeddings using hash functions +- Same input always produces same vector +- No API calls (instant, free) +- Not semantically meaningful (use for structure testing only) + +**Usage:** + +```typescript +import { createMockProvider } from '@sylphx/coderag/embeddings' + +const mockProvider = createMockProvider(1536) + +const embedding = await mockProvider.generateEmbedding('test') +// Returns: deterministic 1536-dim vector +``` + +**Auto-detection:** + +If `OPENAI_API_KEY` is not set, CodeRAG automatically uses mock provider: + +```typescript +const provider = await getDefaultEmbeddingProvider() +// Uses 'mock' if no API key, 'openai' if key present +``` + +## Performance Considerations + +**Embedding generation cost:** + +- OpenAI `text-embedding-3-small`: ~$0.02 per 1M tokens +- 1000 chunks * ~100 tokens each = 100k tokens = $0.002 +- Generation time: ~1-2 seconds per batch of 10 + +**Optimization strategies:** + +1. **Batch processing**: Generate embeddings in batches of 10-50 +2. **Caching**: Reuse embeddings for unchanged chunks +3. **Incremental updates**: Only generate embeddings for new/changed chunks +4. **Model selection**: Use `text-embedding-3-small` for lower cost/latency + +**Storage requirements:** + +``` +Chunks: 1000 +Dimensions: 1536 +Bytes per float: 4 +Size: 1000 * 1536 * 4 = 6.14 MB +``` + +LanceDB compresses vectors, actual disk usage is lower. 
+ +**Search performance:** + +- Vector search (k=10): ~20-50ms for 10k chunks +- Slower than BM25 (~10-20ms) but provides semantic understanding +- Use hybrid search to combine speed of BM25 with accuracy of vectors diff --git a/docs/index.md b/docs/index.md index 326e17c..aa10240 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,111 +2,185 @@ layout: home hero: - name: "CodeRAG" - text: "Intelligent Code Search" - tagline: Lightning-fast hybrid search (TF-IDF + Vector) - RAG-ready for AI assistants + name: CodeRAG + text: Semantic Code Search + tagline: Lightning-fast hybrid search with AST chunking - RAG-ready for AI assistants + image: + src: /logo.svg + alt: CodeRAG actions: - theme: brand text: Get Started link: /guide/getting-started + - theme: alt + text: MCP Server + link: /mcp/overview - theme: alt text: View on GitHub - link: https://github.com/sylphlab/coderag + link: https://github.com/SylphxAI/coderag features: - - icon: ⚑ + - icon: + svg: '' title: Blazing Fast - details: 2.7x faster initial indexing, 166x faster incremental updates, and 100x faster cached queries compared to traditional approaches. - - - icon: 🧠 + details: Index 1000-2000 files/sec with instant startup (<100ms). Incremental updates only reindex changed files. + link: /guide/performance + linkText: Learn more + + - icon: + svg: '' + title: AST Chunking + details: Split code at semantic boundaries (functions, classes) using tree-sitter. 15+ languages supported. + link: /guide/ast-chunking + linkText: Learn more + + - icon: + svg: '' title: Hybrid Search - details: Combines TF-IDF keyword search with semantic vector search for best-of-both-worlds accuracy. - - - icon: πŸ”Œ - title: Extensible Providers - details: Built-in support for OpenAI, OpenRouter, and custom embedding providers via registry pattern. + details: Combines TF-IDF keyword search with optional vector embeddings for best-of-both-worlds accuracy. 
+ link: /guide/hybrid-search + linkText: Learn more - - icon: πŸ“¦ + - icon: + svg: '' title: Zero Config - details: Works out of the box with sensible defaults. Advanced configuration available when needed. + details: Works out of the box with sensible defaults. Just point to your codebase and start searching. + link: /guide/quick-start + linkText: Learn more - - icon: 🎯 + - icon: + svg: '' title: Code-Aware - details: StarCoder2 tokenization handles camelCase, snake_case, and code-specific patterns. + details: StarCoder2 tokenization handles camelCase, snake_case, and code-specific patterns correctly. + link: /guide/tfidf + linkText: Learn more - - icon: πŸ€– + - icon: + svg: '' title: MCP Integration - details: Built-in Model Context Protocol server for seamless AI assistant integration. + details: Built-in Model Context Protocol server for Claude, Cursor, VS Code, and other AI assistants. + link: /mcp/overview + linkText: Learn more --- + + ## Quick Example ```typescript -import { CodebaseIndexer } from '@sylphx/coderag'; +import { CodebaseIndexer, PersistentStorage } from '@sylphx/coderag' + +// Create persistent storage (SQLite) +const storage = new PersistentStorage({ codebaseRoot: './my-project' }) // Initialize indexer const indexer = new CodebaseIndexer({ - codebaseRoot: '/path/to/project', - indexPath: '.codebase-search' -}); + codebaseRoot: './my-project', + storage, +}) -// Index codebase -await indexer.index(); +// Index codebase with file watching +await indexer.index({ watch: true }) -// Hybrid search (TF-IDF + Vector) +// Search for code const results = await indexer.search('authentication logic', { limit: 10, - vectorWeight: 0.7 // 70% semantic, 30% keyword -}); - -// Pure keyword search -const keywordResults = await indexer.keywordSearch('getUserData'); + includeContent: true, +}) -// Pure semantic search -const semanticResults = await indexer.semanticSearch('database connection pool'); +console.log(results) +// [{ path: 'src/auth.ts', score: 
0.85, snippet: '...', chunkType: 'FunctionDeclaration' }] ``` ## Why CodeRAG? +
+ ### Performance First Built with performance in mind from day one: -- **Incremental Updates**: Only reindex changed files (166x faster) -- **Query Caching**: LRU cache for frequently searched queries (100x faster) -- **Efficient Storage**: SQLite + HNSW for optimal disk usage +- **Fast Indexing**: 1000-2000 files/second +- **Instant Startup**: <100ms with cached index +- **Low Memory**: SQL-based search mode available +- **Incremental Updates**: Only reindex changed files + +### Chunk-Level Search -### Pure Functional Design +Returns semantic chunks, not entire files: -- Immutable data structures -- Composable functions -- Easy to test and reason about +- **Functions**: Find specific function implementations +- **Classes**: Locate class definitions +- **Methods**: Search within class methods +- **Imports**: Track dependencies ### Production Ready -- βœ… 396 tests passing -- βœ… Full TypeScript support -- βœ… Comprehensive documentation -- βœ… MCP server included +- 400+ tests passing +- Full TypeScript support +- Comprehensive documentation +- MCP server included -## Performance Benchmarks +
-| Operation | CodeRAG | Traditional | Improvement | -|-----------|----------------|-------------|-------------| -| Initial Indexing | 13.4s | 36.2s | **2.7x faster** | -| Incremental Update | 2.6s | 431.2s | **166x faster** | -| Cached Query | 0.0013s | 0.13s | **100x faster** | +## Installation -*Benchmarks on 250-file codebase, OpenAI embeddings* +::: code-group -## Get Started +```bash [npm] +npm install @sylphx/coderag +``` -
+```bash [pnpm] +pnpm add @sylphx/coderag +``` -Ready to supercharge your code search with RAG? +```bash [bun] +bun add @sylphx/coderag +``` + +::: + +## MCP Server (for AI Assistants) + +Use CodeRAG with Claude, Cursor, or any MCP-compatible AI assistant: ```bash -bun add @sylphx/coderag +npx @sylphx/coderag-mcp --root=/path/to/project ``` +Or add to your `claude_desktop_config.json`: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"] + } + } +} +``` + +
+ +Ready to get started? Check out the [Quick Start Guide](/guide/quick-start) or learn about the [MCP Server](/mcp/overview). +
+ + diff --git a/docs/mcp/configuration.md b/docs/mcp/configuration.md new file mode 100644 index 0000000..4cd517a --- /dev/null +++ b/docs/mcp/configuration.md @@ -0,0 +1,509 @@ +# Configuration Guide + +This guide covers how to configure CodeRAG MCP for different AI assistants and use cases. + +## Claude Desktop Configuration + +Claude Desktop is Anthropic's official desktop application for Claude AI. + +### Configuration File Location + +**macOS:** +``` +~/Library/Application Support/Claude/claude_desktop_config.json +``` + +**Windows:** +``` +%APPDATA%\Claude\claude_desktop_config.json +``` + +**Linux:** +``` +~/.config/Claude/claude_desktop_config.json +``` + +### Basic Configuration + +Edit `claude_desktop_config.json`: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"] + } + } +} +``` + +Replace `/path/to/project` with your project's absolute path. + +### With Semantic Search + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"], + "env": { + "OPENAI_API_KEY": "sk-..." + } + } + } +} +``` + +### Advanced Configuration + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": [ + "-y", + "@sylphx/coderag-mcp", + "--root=/path/to/project", + "--max-size=2097152" + ], + "env": { + "OPENAI_API_KEY": "sk-...", + "EMBEDDING_MODEL": "text-embedding-3-large" + } + } + } +} +``` + +### Applying Changes + +1. Save `claude_desktop_config.json` +2. Quit Claude Desktop completely +3. Restart Claude Desktop +4. Server starts automatically on first tool use + +## Cursor Configuration + +Cursor is an AI-powered code editor built on VS Code. 
+ +### Configuration File Location + +**macOS:** +``` +~/.cursor/mcp.json +``` + +**Windows:** +``` +%USERPROFILE%\.cursor\mcp.json +``` + +**Linux:** +``` +~/.cursor/mcp.json +``` + +### Basic Configuration + +Create or edit `~/.cursor/mcp.json`: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"] + } + } +} +``` + +### Workspace-Relative Path + +Use current workspace folder: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=${workspaceFolder}"] + } + } +} +``` + +Note: `${workspaceFolder}` support depends on Cursor's MCP implementation. If unsupported, use absolute paths. + +### Applying Changes + +1. Save `mcp.json` +2. Restart Cursor +3. Server starts when MCP client initializes + +## VS Code Configuration + +VS Code supports MCP through extensions like Continue. + +### Using Continue Extension + +Install the [Continue extension](https://marketplace.visualstudio.com/items?itemName=Continue.continue) from VS Code Marketplace. + +**Configuration File Location:** + +**macOS/Linux:** +``` +~/.continue/config.json +``` + +**Windows:** +``` +%USERPROFILE%\.continue\config.json +``` + +Edit `config.json` to add MCP servers: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=${workspaceFolder}"] + } + } +} +``` + +### Workspace Configuration + +Create `.vscode/mcp.json` in your project: + +```json +{ + "mcp": { + "servers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=${workspaceFolder}"] + } + } + } +} +``` + +This configuration is project-specific and checked into version control. + +### Applying Changes + +1. Save configuration file +2. Reload VS Code window (Cmd/Ctrl+R) +3. Continue extension loads MCP servers automatically + +## Windsurf Configuration + +Windsurf is an AI-powered development environment. 
+ +### Configuration File Location + +**macOS:** +``` +~/.codeium/windsurf/mcp_config.json +``` + +**Windows:** +``` +%USERPROFILE%\.codeium\windsurf\mcp_config.json +``` + +**Linux:** +``` +~/.codeium/windsurf/mcp_config.json +``` + +### Basic Configuration + +Create or edit `mcp_config.json`: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"] + } + } +} +``` + +### Applying Changes + +1. Save `mcp_config.json` +2. Restart Windsurf +3. Server initializes on startup + +## Multiple Project Setup + +Configure multiple CodeRAG instances for different projects. + +### Separate Servers per Project + +```json +{ + "mcpServers": { + "coderag-frontend": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/frontend"] + }, + "coderag-backend": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/backend"] + }, + "coderag-mobile": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/mobile"] + } + } +} +``` + +**Benefits:** +- Separate indexes for faster search +- Different configurations per project +- AI can specify which project to search + +**Usage:** +``` +Human: "Search the frontend codebase for authentication components" +AI: Uses coderag-frontend server +``` + +### Monorepo Configuration + +For monorepos, index the entire repository: + +```json +{ + "mcpServers": { + "coderag-monorepo": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/monorepo"] + } + } +} +``` + +Use path filters in search queries: + +```json +{ + "query": "authentication", + "path_filter": "packages/frontend" +} +``` + +## Environment-Specific Configuration + +### Development Environment + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"], + "env": { + "OPENAI_API_KEY": "sk-dev-key" + } + } + } +} +``` + +### Production Environment 
+ +```json +{ + "mcpServers": { + "coderag": { + "command": "/usr/local/bin/coderag-mcp", + "args": [ + "--root=/var/www/project", + "--max-size=5242880", + "--no-auto-index" + ], + "env": { + "OPENAI_API_KEY": "sk-prod-key", + "EMBEDDING_MODEL": "text-embedding-3-large" + } + } + } +} +``` + +## Custom Embedding Providers + +### OpenRouter + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"], + "env": { + "OPENAI_API_KEY": "sk-or-v1-...", + "OPENAI_BASE_URL": "https://openrouter.ai/api/v1" + } + } + } +} +``` + +### Together AI + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"], + "env": { + "OPENAI_API_KEY": "your-together-api-key", + "OPENAI_BASE_URL": "https://api.together.xyz/v1", + "EMBEDDING_MODEL": "togethercomputer/m2-bert-80M-8k-retrieval", + "EMBEDDING_DIMENSIONS": "768" + } + } + } +} +``` + +### Azure OpenAI + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"], + "env": { + "OPENAI_API_KEY": "your-azure-key", + "OPENAI_BASE_URL": "https://your-resource.openai.azure.com/openai/deployments/your-deployment", + "EMBEDDING_MODEL": "text-embedding-3-small" + } + } + } +} +``` + +## Performance Tuning + +### Large Codebases + +For projects with 10,000+ files: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": [ + "-y", + "@sylphx/coderag-mcp", + "--root=/path/to/large-project", + "--max-size=524288" + ] + } + } +} +``` + +**Tips:** +- Reduce `--max-size` to skip large generated files +- Exclude build directories (handled automatically) +- First indexing takes longer, subsequent startups are fast (<100ms) + +### Resource-Constrained Environments + +For limited memory or CPU: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": [ + "-y", + "@sylphx/coderag-mcp", + 
"--root=/path/to/project", + "--max-size=262144" + ] + } + } +} +``` + +**Settings:** +- Lower `--max-size` (256 KB) +- Use keyword search only (no OPENAI_API_KEY) +- Index is stored in SQLite for low memory usage + +## Configuration Validation + +### Check Configuration Syntax + +Ensure your JSON configuration is valid: + +```bash +# macOS/Linux +cat ~/Library/Application\ Support/Claude/claude_desktop_config.json | python -m json.tool + +# Windows PowerShell +Get-Content $env:APPDATA\Claude\claude_desktop_config.json | ConvertFrom-Json +``` + +### Test Server Manually + +Test CodeRAG MCP outside of your AI assistant: + +```bash +npx @sylphx/coderag-mcp --root=/path/to/project +``` + +You should see: +``` +[INFO] Starting MCP Codebase Search Server... +[INFO] Codebase root: /path/to/project +[INFO] Max file size: 1.00 MB +[INFO] Auto-index: enabled +[SUCCESS] Indexed 1234 files +[INFO] Watching for file changes... +``` + +Press Ctrl+C to stop. + +## Troubleshooting + +**Configuration not loading:** +- Verify JSON syntax (no trailing commas, proper quotes) +- Check file path is correct for your OS +- Restart AI assistant after changes + +**Server not starting:** +- Test command manually in terminal +- Check Node.js is installed (`node --version`) +- Verify `--root` path exists + +**Multiple servers conflict:** +- Give each server a unique name +- Ensure different `--root` paths +- Check logs for port conflicts + +## Next Steps + +- [Tools Reference](./tools.md) - Learn about codebase_search parameters +- [IDE Integration](./ide-integration.md) - Detailed setup for specific IDEs +- [Installation Guide](./installation.md) - CLI arguments and environment variables diff --git a/docs/mcp/ide-integration.md b/docs/mcp/ide-integration.md new file mode 100644 index 0000000..ed5b832 --- /dev/null +++ b/docs/mcp/ide-integration.md @@ -0,0 +1,682 @@ +# IDE Integration Guide + +This guide provides step-by-step setup instructions for using CodeRAG MCP with different AI-powered 
development tools. + +## Claude Desktop + +Claude Desktop is Anthropic's official desktop application for Claude AI. + +### Prerequisites + +- Claude Desktop installed ([download here](https://claude.ai/download)) +- Node.js installed (v16 or later) + +### Setup Steps + +1. **Locate Configuration File** + + **macOS:** + ```bash + open ~/Library/Application\ Support/Claude/ + ``` + + **Windows:** + ```powershell + explorer %APPDATA%\Claude + ``` + + **Linux:** + ```bash + cd ~/.config/Claude/ + ``` + +2. **Edit claude_desktop_config.json** + + Create or edit `claude_desktop_config.json`: + + ```json + { + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/your/project"] + } + } + } + ``` + + Replace `/path/to/your/project` with your project's absolute path. + +3. **Restart Claude Desktop** + + Quit Claude Desktop completely and restart it. + +4. **Verify Setup** + + In Claude Desktop, ask: "Search the codebase for authentication" + + Claude should use the `codebase_search` tool and return results. + +### Enable Semantic Search + +Add your OpenAI API key to enable natural language queries: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"], + "env": { + "OPENAI_API_KEY": "sk-..." 
+ } + } + } +} +``` + +### Multiple Projects + +Configure multiple CodeRAG instances: + +```json +{ + "mcpServers": { + "frontend": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/frontend"] + }, + "backend": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/backend"] + } + } +} +``` + +Tell Claude which project to search: +``` +"Search the frontend codebase for authentication components" +``` + +### Troubleshooting + +**Server doesn't start:** +- Check logs: `~/Library/Logs/Claude/mcp*.log` (macOS) +- Verify Node.js is installed: `node --version` +- Test manually: `npx @sylphx/coderag-mcp --root=/path/to/project` + +**No results returned:** +- Wait for initial indexing (check logs) +- Verify project path exists +- Try a broader search query + +## Cursor + +Cursor is an AI-powered code editor built on VS Code. + +### Prerequisites + +- Cursor installed ([download here](https://cursor.sh)) +- Node.js installed (v16 or later) + +### Setup Steps + +1. **Locate Configuration File** + + **macOS:** + ```bash + mkdir -p ~/.cursor + touch ~/.cursor/mcp.json + ``` + + **Windows:** + ```powershell + New-Item -Path "$env:USERPROFILE\.cursor" -ItemType Directory -Force + New-Item -Path "$env:USERPROFILE\.cursor\mcp.json" -ItemType File + ``` + + **Linux:** + ```bash + mkdir -p ~/.cursor + touch ~/.cursor/mcp.json + ``` + +2. **Edit mcp.json** + + Add CodeRAG MCP configuration: + + ```json + { + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"] + } + } + } + ``` + +3. **Restart Cursor** + + Close and reopen Cursor. + +4. **Verify Setup** + + Ask Cursor's AI: "Search the codebase for authentication" + + The AI should invoke the `codebase_search` tool. 
+ +### Per-Workspace Configuration + +Create `.cursor/mcp.json` in your project root: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=${workspaceFolder}"] + } + } +} +``` + +**Benefits:** +- Configuration travels with project +- Team members get same setup +- No hardcoded paths + +**Note:** `${workspaceFolder}` support depends on Cursor's MCP implementation. If unsupported, use relative paths or absolute paths. + +### Troubleshooting + +**MCP servers not loading:** +- Verify JSON syntax (use a JSON validator) +- Check Cursor's developer console for errors +- Ensure Node.js is in PATH + +**Slow startup:** +- First run indexes the codebase (may take time) +- Subsequent startups are fast (<100ms) + +## VS Code with Continue + +VS Code supports MCP through the Continue extension. + +### Prerequisites + +- VS Code installed +- [Continue extension](https://marketplace.visualstudio.com/items?itemName=Continue.continue) installed +- Node.js installed (v16 or later) + +### Setup Steps + +1. **Install Continue Extension** + + In VS Code: + - Open Extensions (Cmd/Ctrl+Shift+X) + - Search for "Continue" + - Click Install + +2. **Locate Configuration File** + + **macOS/Linux:** + ```bash + mkdir -p ~/.continue + ``` + + **Windows:** + ```powershell + New-Item -Path "$env:USERPROFILE\.continue" -ItemType Directory -Force + ``` + +3. **Edit Continue Config** + + Open `~/.continue/config.json` (create if doesn't exist): + + ```json + { + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=${workspaceFolder}"] + } + } + } + ``` + +4. **Reload VS Code** + + Press Cmd/Ctrl+R to reload the window. + +5. 
**Verify Setup** + + Open Continue chat panel and ask: "Search the codebase for authentication" + +### Workspace Configuration + +Create `.vscode/mcp.json` in your project: + +```json +{ + "mcp": { + "servers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=${workspaceFolder}"] + } + } + } +} +``` + +This allows project-specific MCP configuration. + +### Troubleshooting + +**Continue doesn't see MCP servers:** +- Verify extension is installed and enabled +- Check `~/.continue/config.json` syntax +- Reload VS Code window + +**Server fails to start:** +- Test command manually in terminal +- Check Continue output panel for errors +- Ensure `--root` path exists + +## Windsurf + +Windsurf is an AI-powered development environment by Codeium. + +### Prerequisites + +- Windsurf installed ([download here](https://codeium.com/windsurf)) +- Node.js installed (v16 or later) + +### Setup Steps + +1. **Locate Configuration File** + + **macOS:** + ```bash + mkdir -p ~/.codeium/windsurf + touch ~/.codeium/windsurf/mcp_config.json + ``` + + **Windows:** + ```powershell + New-Item -Path "$env:USERPROFILE\.codeium\windsurf" -ItemType Directory -Force + New-Item -Path "$env:USERPROFILE\.codeium\windsurf\mcp_config.json" -ItemType File + ``` + + **Linux:** + ```bash + mkdir -p ~/.codeium/windsurf + touch ~/.codeium/windsurf/mcp_config.json + ``` + +2. **Edit mcp_config.json** + + Add CodeRAG configuration: + + ```json + { + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"] + } + } + } + ``` + +3. **Restart Windsurf** + + Close and reopen Windsurf. + +4. 
**Verify Setup** + + Ask Windsurf AI: "Search the codebase for authentication" + +### Workspace-Specific Configuration + +For project-specific setup, adjust the `--root` path: + +```json +{ + "mcpServers": { + "coderag-project1": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project1"] + }, + "coderag-project2": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project2"] + } + } +} +``` + +### Troubleshooting + +**Server not visible:** +- Check `mcp_config.json` syntax +- Verify directory structure +- Restart Windsurf + +**Indexing takes too long:** +- Reduce `--max-size`: `"--max-size=524288"` (512KB) +- Check for large generated files +- Monitor logs for progress + +## Claude Code + +Claude Code is Anthropic's CLI tool for Claude AI. + +### Prerequisites + +- Claude Code CLI installed +- Node.js installed (v16 or later) + +### Setup Steps + +1. **Add MCP Server** + + ```bash + claude mcp add coderag -- npx -y @sylphx/coderag-mcp --root=/path/to/project + ``` + +2. **Verify Configuration** + + ```bash + claude mcp list + ``` + + Should show `coderag` server. + +3. **Test Search** + + ```bash + claude chat "Search the codebase for authentication" + ``` + +### Update Server + +```bash +claude mcp remove coderag +claude mcp add coderag -- npx -y @sylphx/coderag-mcp --root=/path/to/project +``` + +### Troubleshooting + +**Server not found:** +- Run `claude mcp list` to verify +- Check `~/.config/claude/mcp.json` (macOS/Linux) or `%APPDATA%\Claude\mcp.json` (Windows) + +## Other MCP Clients + +CodeRAG MCP works with any MCP-compatible client. 
General setup pattern: + +### Generic MCP Configuration + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"] + } + } +} +``` + +### Known Compatible Clients + +- **Claude Desktop** - Official Anthropic client +- **Cursor** - AI code editor +- **Continue** - VS Code extension +- **Windsurf** - Codeium IDE +- **Claude Code** - CLI tool +- **Zed** - Modern code editor (MCP support coming) +- **Custom MCP Clients** - Any client implementing MCP protocol + +### Integration Steps + +1. Locate client's MCP configuration file (usually `~/.client-name/mcp.json`) +2. Add CodeRAG server entry +3. Restart client +4. Test with codebase search query + +## Advanced Integration Patterns + +### Dynamic Project Selection + +Configure multiple projects and let AI choose: + +```json +{ + "mcpServers": { + "web-frontend": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/projects/web-frontend"] + }, + "web-backend": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/projects/web-backend"] + }, + "mobile-app": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/projects/mobile"] + } + } +} +``` + +Usage: +``` +"Search the web-backend codebase for API authentication" +``` + +### Environment-Specific Configuration + +Development and production environments: + +```json +{ + "mcpServers": { + "coderag-dev": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/dev/project"], + "env": { + "OPENAI_API_KEY": "sk-dev-..." + } + }, + "coderag-prod": { + "command": "/usr/local/bin/coderag-mcp", + "args": ["--root=/var/www/project", "--max-size=5242880"], + "env": { + "OPENAI_API_KEY": "sk-prod-..." 
+ } + } + } +} +``` + +### Monorepo Setup + +Index entire monorepo with path filtering: + +```json +{ + "mcpServers": { + "monorepo": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/monorepo"] + } + } +} +``` + +Use path filters in queries: +```json +{ + "query": "authentication", + "path_filter": "packages/backend" +} +``` + +## Performance Optimization + +### Large Codebases + +For projects with 10,000+ files: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": [ + "-y", + "@sylphx/coderag-mcp", + "--root=/path/to/large-project", + "--max-size=262144" + ] + } + } +} +``` + +**Settings:** +- `--max-size=262144` (256 KB) skips large files +- First indexing: 10-60 seconds +- Subsequent startups: <100ms +- Search: <50ms + +### Resource-Constrained Environments + +For limited memory/CPU: + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": [ + "-y", + "@sylphx/coderag-mcp", + "--root=/path/to/project", + "--max-size=131072" + ] + } + } +} +``` + +**Settings:** +- `--max-size=131072` (128 KB) +- No `OPENAI_API_KEY` (keyword search only) +- SQLite-based storage (low memory) + +## Debugging Integration Issues + +### Enable Detailed Logging + +Most MCP clients log server output. Check these locations: + +**Claude Desktop:** +- macOS: `~/Library/Logs/Claude/` +- Windows: `%APPDATA%\Claude\logs\` +- Linux: `~/.config/Claude/logs/` + +**Cursor:** +- Developer Tools β†’ Console +- Look for MCP-related messages + +**VS Code (Continue):** +- Output panel β†’ Continue +- Check for server startup errors + +### Test Server Manually + +Run CodeRAG MCP outside the client: + +```bash +npx @sylphx/coderag-mcp --root=/path/to/project +``` + +Expected output: +``` +[INFO] Starting MCP Codebase Search Server... +[INFO] Codebase root: /path/to/project +[SUCCESS] Indexed 1234 files +``` + +If this works but integration doesn't, the issue is with client configuration. 
+ +### Common Issues + +**Server starts but no results:** +- Wait for indexing to complete +- Check logs for indexing progress +- Verify files exist in `--root` path + +**JSON syntax errors:** +- Use JSON validator +- Check for trailing commas +- Ensure proper quote escaping + +**Command not found:** +- Verify Node.js is in PATH +- Try absolute path: `/usr/local/bin/node` +- Use global install: `npm install -g @sylphx/coderag-mcp` + +## Best Practices + +### Configuration Management + +**Use Version Control:** +- Check in `.vscode/mcp.json` or `.cursor/mcp.json` +- Team members get automatic setup +- Use `${workspaceFolder}` for portability + +**Document Setup:** +- Add MCP setup to project README +- Include example queries +- Note required environment variables + +### Query Patterns + +**Proactive Search:** +``` +"Before implementing authentication, search for existing auth patterns" +``` + +**Exploratory Search:** +``` +"Show me how error handling is done in this codebase" +``` + +**Targeted Search:** +``` +"Find all API routes that handle user creation" +``` + +### Security Considerations + +**API Keys:** +- Never commit API keys to version control +- Use environment variables +- Rotate keys regularly + +**Sensitive Code:** +- Use `exclude_paths` for sensitive directories +- Consider separate indexes for public/private code +- Review search results before sharing + +## Next Steps + +- [Tools Reference](./tools.md) - Learn all codebase_search parameters +- [Configuration Guide](./configuration.md) - Advanced configuration options +- [Installation Guide](./installation.md) - CLI arguments and environment variables diff --git a/docs/mcp/installation.md b/docs/mcp/installation.md new file mode 100644 index 0000000..8f371ef --- /dev/null +++ b/docs/mcp/installation.md @@ -0,0 +1,285 @@ +# Installation Guide + +## Quick Start with npx + +The fastest way to run CodeRAG MCP is using `npx`: + +```bash +npx @sylphx/coderag-mcp --root=/path/to/your/project +``` + +This 
command: +- Downloads and runs the latest version +- Indexes the specified codebase +- Starts the MCP server +- Watches for file changes + +**When to use npx:** +- Quick testing or one-time use +- Always want the latest version +- Don't want to install globally + +## Global Installation + +Install CodeRAG MCP globally for faster startup: + +```bash +npm install -g @sylphx/coderag-mcp +``` + +Then run: + +```bash +coderag-mcp --root=/path/to/your/project +``` + +**When to use global install:** +- Faster startup (no download on each run) +- Stable version for production use +- Multiple projects on the same machine + +## CLI Arguments + +### `--root=` + +Specify the codebase root directory to index. + +```bash +npx @sylphx/coderag-mcp --root=/Users/you/projects/my-app +``` + +**Default:** Current working directory + +**Notes:** +- Path can be absolute or relative +- Directory must exist and be readable +- Creates `.coderag/` folder inside this directory + +### `--max-size=` + +Maximum file size to index (in bytes). + +```bash +npx @sylphx/coderag-mcp --root=/path/to/project --max-size=2097152 +``` + +**Default:** `1048576` (1 MB) + +**Common Values:** +- 512 KB: `--max-size=524288` +- 1 MB: `--max-size=1048576` (default) +- 2 MB: `--max-size=2097152` +- 5 MB: `--max-size=5242880` + +**When to adjust:** +- Increase for codebases with large auto-generated files +- Decrease for faster indexing on resource-constrained machines +- Files larger than this limit are skipped during indexing + +### --no-auto-index + +Disable automatic indexing on startup. + +```bash +npx @sylphx/coderag-mcp --root=/path/to/project --no-auto-index +``` + +**Default:** Auto-indexing enabled + +**When to use:** +- Manual control over indexing timing +- Testing server functionality without indexing +- Very large codebases where indexing takes time + +**Note:** You must manually trigger indexing by calling the `codebase_search` tool, which will index on first search. 
+ +## Environment Variables + +CodeRAG MCP supports environment variables for configuration. + +### OPENAI_API_KEY + +Enable semantic search with OpenAI embeddings. + +```bash +export OPENAI_API_KEY=sk-... +npx @sylphx/coderag-mcp --root=/path/to/project +``` + +**Effect:** +- Switches from keyword search to semantic search mode +- Uses `text-embedding-3-small` by default +- Enables natural language queries + +**Without this variable:** +- Uses keyword-only search (TF-IDF) +- Still very effective for code search +- No external API calls + +### OPENAI_BASE_URL + +Use OpenAI-compatible embedding endpoints (OpenRouter, Together AI, etc.). + +```bash +export OPENAI_API_KEY=your-api-key +export OPENAI_BASE_URL=https://openrouter.ai/api/v1 +npx @sylphx/coderag-mcp --root=/path/to/project +``` + +**Use Cases:** +- OpenRouter for multi-provider access +- Together AI for faster/cheaper embeddings +- Local embedding servers (e.g., text-embeddings-inference) +- Azure OpenAI endpoints + +### EMBEDDING_MODEL + +Specify custom embedding model. + +```bash +export OPENAI_API_KEY=sk-... +export EMBEDDING_MODEL=text-embedding-3-large +npx @sylphx/coderag-mcp --root=/path/to/project +``` + +**Default:** `text-embedding-3-small` + +**Supported OpenAI Models:** +- `text-embedding-3-small` (1536 dims, default) +- `text-embedding-3-large` (3072 dims, higher quality) +- `text-embedding-ada-002` (1536 dims, legacy) + +**Custom Models:** +- Specify any model name for OpenAI-compatible endpoints +- Must also set `EMBEDDING_DIMENSIONS` for custom models + +### EMBEDDING_DIMENSIONS + +Override embedding dimensions for custom models. 
+ +```bash +export OPENAI_API_KEY=your-key +export OPENAI_BASE_URL=https://api.together.xyz/v1 +export EMBEDDING_MODEL=togethercomputer/m2-bert-80M-8k-retrieval +export EMBEDDING_DIMENSIONS=768 +npx @sylphx/coderag-mcp --root=/path/to/project +``` + +**When to use:** +- Custom embedding models with non-standard dimensions +- Automatically detected for standard OpenAI models + +## MCP Configuration Examples + +### Basic Setup (Keyword Search) + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"] + } + } +} +``` + +### With Semantic Search + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/project"], + "env": { + "OPENAI_API_KEY": "sk-..." + } + } + } +} +``` + +### Multiple Projects + +```json +{ + "mcpServers": { + "coderag-frontend": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/frontend"] + }, + "coderag-backend": { + "command": "npx", + "args": ["-y", "@sylphx/coderag-mcp", "--root=/path/to/backend"] + } + } +} +``` + +### Custom Settings + +```json +{ + "mcpServers": { + "coderag": { + "command": "npx", + "args": [ + "-y", + "@sylphx/coderag-mcp", + "--root=/path/to/project", + "--max-size=2097152", + "--no-auto-index" + ], + "env": { + "OPENAI_API_KEY": "sk-...", + "EMBEDDING_MODEL": "text-embedding-3-large" + } + } + } +} +``` + +## Verifying Installation + +After adding CodeRAG MCP to your configuration: + +1. **Restart your AI assistant** (Claude Desktop, Cursor, etc.) + +2. **Check server logs:** + - Claude Desktop: Check `~/Library/Logs/Claude/` (macOS) or `%APPDATA%\Claude\logs\` (Windows) + - Cursor: Check console output in Developer Tools + - Look for "Starting MCP Codebase Search Server" message + +3. 
**Test search:** + - Ask your AI: "Search the codebase for authentication" + - AI should use the `codebase_search` tool + - Results should appear in markdown format + +## Troubleshooting + +**Server doesn't start:** +- Check that Node.js is installed (`node --version`) +- Verify the `--root` path exists and is readable +- Check MCP config file syntax (valid JSON) + +**Search returns no results:** +- Wait for initial indexing to complete (check logs) +- Verify files exist in the specified `--root` directory +- Check that file extensions are supported + +**Semantic search not working:** +- Verify `OPENAI_API_KEY` is set correctly +- Check OpenAI API quota and permissions +- Look for "Semantic search enabled" in server logs + +**Indexing is slow:** +- Reduce `--max-size` to skip large files +- Check for large auto-generated files (e.g., `dist/`, `build/`) +- Consider adding `.coderagignore` file (future feature) + +## Next Steps + +- [Configuration Guide](./configuration.md) - Configure for your AI assistant +- [Tools Reference](./tools.md) - Learn about the codebase_search tool +- [IDE Integration](./ide-integration.md) - Setup for specific IDEs diff --git a/docs/mcp/overview.md b/docs/mcp/overview.md new file mode 100644 index 0000000..6d3110f --- /dev/null +++ b/docs/mcp/overview.md @@ -0,0 +1,141 @@ +# MCP Server Overview + +## What is the Model Context Protocol? + +The Model Context Protocol (MCP) is an open standard for connecting AI assistants to external tools and data sources. It enables AI applications like Claude Desktop, Cursor, and VS Code to access custom functionality through a standardized interface. + +MCP uses a client-server architecture where: +- **MCP Clients** are AI applications (Claude Desktop, Cursor, etc.) +- **MCP Servers** provide tools and data to the client +- **Tools** are functions the AI can invoke to accomplish tasks + +## What is CodeRAG MCP? 
+ +CodeRAG MCP (`@sylphx/coderag-mcp`) is an MCP server that provides intelligent codebase search capabilities to AI assistants. It enables AI to search and understand your codebase using hybrid TF-IDF and optional vector embeddings. + +**Key Benefits:** + +- **Zero Dependencies**: No Docker, no databases, no external services required +- **Fast**: <50ms search latency, instant startup with cached index +- **Offline**: Works entirely offline (except optional vector search) +- **Smart**: Hybrid TF-IDF + optional OpenAI embeddings for semantic understanding +- **Automatic**: Auto-indexes on startup, watches for file changes + +## How CodeRAG MCP Works + +CodeRAG MCP runs as a background process that your AI assistant communicates with via standard input/output. When you ask the AI to search your codebase, it calls the MCP server, which performs the search and returns relevant code snippets. + +**Architecture:** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ AI Assistant β”‚ (Claude Desktop, Cursor, etc.) +β”‚ (MCP Client) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ MCP Protocol (stdio) + β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ CodeRAG MCP β”‚ Provides codebase_search tool +β”‚ MCP Server β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ .coderag/ β”‚ SQLite index cache +β”‚ index.db β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +**Workflow:** + +1. **Startup**: Server indexes your codebase on first run (1000-2000 files/sec) +2. **Caching**: Index stored in `.coderag/` folder for instant subsequent startups +3. **Watching**: Automatically detects file changes and updates index incrementally +4. **Search**: AI calls `codebase_search` tool with natural language or keyword queries +5. 
**Results**: Server returns ranked code snippets in LLM-optimized markdown format + +## Available Tool: codebase_search + +CodeRAG MCP provides a single tool: `codebase_search` + +**Search Modes:** + +1. **Keyword Search** (default): TF-IDF ranking with code-aware tokenization + - Use specific terms, function names, error messages + - Example: "getUserById authentication" + +2. **Semantic Search** (with OPENAI_API_KEY): AI embeddings + TF-IDF fusion + - Use natural language descriptions + - Example: "code that handles user login with JWT tokens" + +**Key Features:** + +- **Fast Ranking**: Hybrid TF-IDF with StarCoder2 tokenizer (4.7MB, trained on code) +- **Smart Filtering**: Filter by file extension, path pattern, or exclude paths +- **Context-Aware**: Returns code snippets with line numbers and syntax highlighting +- **AST Chunking**: Splits code at semantic boundaries (functions, classes, etc.) +- **LLM-Optimized Output**: Minimal token usage, maximum content density + +## Use Cases with AI Assistants + +**Before Implementation:** + +``` +Human: "Add JWT authentication to the API" +AI: Uses codebase_search("authentication JWT") to find existing auth patterns +AI: Implements new feature following existing patterns +``` + +**Code Understanding:** + +``` +Human: "How does error handling work in this project?" +AI: Uses codebase_search("error handling try catch") to find examples +AI: Explains the error handling patterns used +``` + +**Debugging:** + +``` +Human: "Why is the database connection failing?" 
+AI: Uses codebase_search("database connection retry") to find relevant code +AI: Identifies issue and suggests fix +``` + +**Refactoring:** + +``` +Human: "Extract common validation logic into a utility" +AI: Uses codebase_search("validation schema") to find all validation code +AI: Creates utility and updates references +``` + +## Performance Characteristics + +| Metric | Value | +|--------|-------| +| Initial indexing | 1000-2000 files/sec | +| Startup with cache | <100ms | +| Search latency | <50ms | +| Memory per 1000 files | ~1-2 MB | +| Tokenizer size | 4.7MB (StarCoder2) | + +## Supported Languages + +AST-based chunking with semantic boundary detection: + +| Category | Languages | +|----------|-----------| +| **JavaScript** | JavaScript, TypeScript, JSX, TSX | +| **Systems** | Python, Go, Java, C, Rust | +| **Markup** | Markdown, HTML, XML | +| **Data/Config** | JSON, YAML, TOML, INI | +| **Other** | Protobuf | + +**Embedded Code Support**: Automatically parses code blocks in Markdown files and `