From 6c26fa709e2a2b486e8ff81f09fadf58f146b0dd Mon Sep 17 00:00:00 2001 From: perf3ct Date: Wed, 27 Aug 2025 21:11:44 +0000 Subject: [PATCH 1/3] feat(quick_search): just fuzzy match note titles for larger notes, while still matching on exact strings --- .../expressions/note_content_fulltext.ts | 31 +++++++++++++++++++ .../src/services/search/utils/text_utils.ts | 2 ++ 2 files changed, 33 insertions(+) diff --git a/apps/server/src/services/search/expressions/note_content_fulltext.ts b/apps/server/src/services/search/expressions/note_content_fulltext.ts index f1e1bf95ff..ac2c882027 100644 --- a/apps/server/src/services/search/expressions/note_content_fulltext.ts +++ b/apps/server/src/services/search/expressions/note_content_fulltext.ts @@ -119,7 +119,38 @@ class NoteContentFulltextExp extends Expression { return; // Content too large or invalid } content = processedContent; + + // Check if this is a large note that needs optimized search strategy + const wordCount = content.split(/\s+/).length; + const isLargeNote = wordCount > FUZZY_SEARCH_CONFIG.LARGE_NOTE_THRESHOLD; + const isFuzzyOperator = this.operator === "~=" || this.operator === "~*"; + + // For large notes with fuzzy operators, switch to optimized strategy + if (isLargeNote && isFuzzyOperator) { + const note = becca.notes[noteId]; + const title = note.title || ""; + + log.info(`Note ${noteId} has ${wordCount} words - using optimized search (fuzzy on title, exact on content)`); + + // Perform fuzzy search on title + const titleMatches = this.fuzzyMatchToken(normalizeSearchText(this.tokens[0]), normalizeSearchText(title)); + + // Perform exact match on content for all tokens + const contentMatches = this.tokens.every(token => { + const normalizedToken = normalizeSearchText(token); + const normalizedContent = normalizeSearchText(content); + return normalizedContent.includes(normalizedToken); + }); + + // Add to results if either title matches with fuzzy or content matches exactly + if (titleMatches || contentMatches) { + resultNoteSet.add(becca.notes[noteId]); + } + + return content; + } + // Standard search logic for non-large notes or non-fuzzy operators if (this.tokens.length === 1) { const [token] = this.tokens; diff --git a/apps/server/src/services/search/utils/text_utils.ts b/apps/server/src/services/search/utils/text_utils.ts index 9274241cbc..9bba242213 100644 --- a/apps/server/src/services/search/utils/text_utils.ts +++ b/apps/server/src/services/search/utils/text_utils.ts @@ -14,6 +14,8 @@ export const FUZZY_SEARCH_CONFIG = { MAX_EDIT_DISTANCE: 2, // Maximum proximity distance for phrase matching (in words) MAX_PHRASE_PROXIMITY: 10, + // Large note threshold - above this, use optimized search strategy + LARGE_NOTE_THRESHOLD: 50000, // 50K words - switch to title-only fuzzy for performance // Absolute hard limits for extreme cases - only to prevent system crashes ABSOLUTE_MAX_CONTENT_SIZE: 100 * 1024 * 1024, // 100MB - extreme upper limit to prevent OOM ABSOLUTE_MAX_WORD_COUNT: 2000000, // 2M words - extreme upper limit for word processing From 93e8459d4bf2b0d982a05ca78bb8edeef6bbccb0 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Wed, 27 Aug 2025 22:33:38 +0000 Subject: [PATCH 2/3] feat(quick_search): remove some old variables that are no longer used now --- .../expressions/note_content_fulltext.ts | 5 ---- .../src/services/search/utils/text_utils.ts | 29 ++++--------------- 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/apps/server/src/services/search/expressions/note_content_fulltext.ts b/apps/server/src/services/search/expressions/note_content_fulltext.ts index ac2c882027..38206d1c8d 100644 --- a/apps/server/src/services/search/expressions/note_content_fulltext.ts +++ b/apps/server/src/services/search/expressions/note_content_fulltext.ts @@ -281,11 +281,6 @@ class NoteContentFulltextExp extends Expression { return false; } - // Warn about large word counts but still attempt matching - if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) { - console.info(`Large word count for phrase matching: ${words.length} words - may take longer but will attempt full matching`); - } - // Find positions of each token const tokenPositions: number[][] = this.tokens.map(token => { const normalizedToken = normalizeSearchText(token); diff --git a/apps/server/src/services/search/utils/text_utils.ts b/apps/server/src/services/search/utils/text_utils.ts index 9bba242213..026ad79243 100644 --- a/apps/server/src/services/search/utils/text_utils.ts +++ b/apps/server/src/services/search/utils/text_utils.ts @@ -14,17 +14,11 @@ export const FUZZY_SEARCH_CONFIG = { MAX_EDIT_DISTANCE: 2, // Maximum proximity distance for phrase matching (in words) MAX_PHRASE_PROXIMITY: 10, - // Large note threshold - above this, use optimized search strategy + // Large note threshold - above this, use optimized search strategy (fuzzy on title only) LARGE_NOTE_THRESHOLD: 50000, // 50K words - switch to title-only fuzzy for performance // Absolute hard limits for extreme cases - only to prevent system crashes ABSOLUTE_MAX_CONTENT_SIZE: 100 * 1024 * 1024, // 100MB - extreme upper limit to prevent OOM ABSOLUTE_MAX_WORD_COUNT: 2000000, // 2M words - extreme upper limit for word processing - // Performance warning thresholds - inform user but still attempt search - PERFORMANCE_WARNING_SIZE: 5 * 1024 * 1024, // 5MB - warn about potential performance impact - PERFORMANCE_WARNING_WORDS: 100000, // 100K words - warn about word count impact - // Progressive processing thresholds for very large content - PROGRESSIVE_PROCESSING_SIZE: 10 * 1024 * 1024, // 10MB - use progressive processing - PROGRESSIVE_PROCESSING_WORDS: 500000, // 500K words - use progressive processing // Performance thresholds EARLY_TERMINATION_THRESHOLD: 3, } as const; @@ -206,7 +200,8 @@ export function validateFuzzySearchTokens(tokens: string[], operator: string): { /** * Validates and preprocesses content for search operations. - * Philosophy: Try to search everything! Only block truly extreme cases that could crash the system. + * Only blocks truly extreme cases that could crash the system. + * Large notes (>50K words) are handled with optimized search strategy instead. * * @param content The content to validate and preprocess * @param noteId The note ID (for logging purposes) @@ -224,12 +219,7 @@ export function validateAndPreprocessContent(content: string, noteId?: string): return content.substring(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE); } - // Warn about very large content but still process it - if (content.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_SIZE) { - console.info(`Large content for note ${noteId || 'unknown'}: ${content.length} bytes - processing may take time but will attempt full search`); - } - - // For word count, be even more permissive - only block truly extreme cases + // For word count, only block truly extreme cases const wordCount = content.split(/\s+/).length; if (wordCount > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) { console.error(`Word count exceeds absolute system limit for note ${noteId || 'unknown'}: ${wordCount} words - this could cause system instability`); @@ -237,15 +227,8 @@ export function validateAndPreprocessContent(content: string, noteId?: string): return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT).join(' '); } - // Warn about high word counts but still process them - if (wordCount > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) { - console.info(`High word count for note ${noteId || 'unknown'}: ${wordCount} words - phrase matching may take time but will attempt full search`); - } - - // Progressive processing warning for very large content - if (content.length > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_SIZE || wordCount > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_WORDS) { - console.info(`Very large content for note ${noteId || 'unknown'} - using progressive processing to maintain responsiveness`); - } + // Notes above LARGE_NOTE_THRESHOLD will use optimized search strategy + // (handled in note_content_fulltext.ts) return content; } From 912bc61730c773b5ec62f760c3aa548ccccd31e9 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Thu, 28 Aug 2025 18:56:06 +0000 Subject: [PATCH 3/3] feat(search): also limit note content that can be searched, but keep searchability of titles --- .../expressions/note_content_fulltext.ts | 47 +++++++++++++++++-- .../src/services/search/utils/text_utils.ts | 6 ++- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/apps/server/src/services/search/expressions/note_content_fulltext.ts b/apps/server/src/services/search/expressions/note_content_fulltext.ts index 38206d1c8d..0f0bbac859 100644 --- a/apps/server/src/services/search/expressions/note_content_fulltext.ts +++ b/apps/server/src/services/search/expressions/note_content_fulltext.ts @@ -120,17 +120,54 @@ class NoteContentFulltextExp extends Expression { } content = processedContent; - // Check if this is a large note that needs optimized search strategy - const wordCount = content.split(/\s+/).length; - const isLargeNote = wordCount > FUZZY_SEARCH_CONFIG.LARGE_NOTE_THRESHOLD; + // Check note size and determine search strategy + const contentSize = content.length; + const isExtremeNote = contentSize > FUZZY_SEARCH_CONFIG.EXTREME_NOTE_SIZE_THRESHOLD; + const isLargeNote = contentSize > FUZZY_SEARCH_CONFIG.LARGE_NOTE_SIZE_THRESHOLD; const isFuzzyOperator = this.operator === "~=" || this.operator === "~*"; - // For large notes with fuzzy operators, switch to optimized strategy + // For extremely large notes (>5MB), only search title regardless of operator + if (isExtremeNote) { + const note = becca.notes[noteId]; + const title = note.title || ""; + + log.info(`Note ${noteId} is ${(contentSize / (1024 * 1024)).toFixed(1)}MB - searching title only due to extreme size`); + + // For fuzzy operators, use fuzzy matching on title + // For other operators, use exact/wildcard matching on title + const normalizedTitle = normalizeSearchText(title); + let titleMatches = false; + + if (isFuzzyOperator) { + titleMatches = this.tokens.some(token => + this.fuzzyMatchToken(normalizeSearchText(token), normalizedTitle) + ); + } else { + // Apply the operator to title matching + titleMatches = this.tokens.every(token => { + const normalizedToken = normalizeSearchText(token); + if (this.operator === "*=*") return normalizedTitle.includes(normalizedToken); + if (this.operator === "=") return normalizedTitle === normalizedToken; + if (this.operator === "!=") return normalizedTitle !== normalizedToken; + if (this.operator === "*=") return normalizedTitle.endsWith(normalizedToken); + if (this.operator === "=*") return normalizedTitle.startsWith(normalizedToken); + return false; + }); + } + + if (titleMatches) { + resultNoteSet.add(becca.notes[noteId]); + } + + return content; + } + + // For large notes (250KB-5MB) with fuzzy operators, use optimized strategy if (isLargeNote && isFuzzyOperator) { const note = becca.notes[noteId]; const title = note.title || ""; - log.info(`Note ${noteId} has ${wordCount} words - using optimized search (fuzzy on title, exact on content)`); + log.info(`Note ${noteId} is ${(contentSize / 1024).toFixed(1)}KB - using optimized search (fuzzy on title, exact on content)`); // Perform fuzzy search on title const titleMatches = this.fuzzyMatchToken(normalizeSearchText(this.tokens[0]), normalizeSearchText(title)); diff --git a/apps/server/src/services/search/utils/text_utils.ts b/apps/server/src/services/search/utils/text_utils.ts index 026ad79243..c828d39833 100644 --- a/apps/server/src/services/search/utils/text_utils.ts +++ b/apps/server/src/services/search/utils/text_utils.ts @@ -15,7 +15,9 @@ export const FUZZY_SEARCH_CONFIG = { // Maximum proximity distance for phrase matching (in words) MAX_PHRASE_PROXIMITY: 10, // Large note threshold - above this, use optimized search strategy (fuzzy on title only) - LARGE_NOTE_THRESHOLD: 50000, // 50K words - switch to title-only fuzzy for performance + LARGE_NOTE_SIZE_THRESHOLD: 250000, // 250KB - switch to title-only fuzzy for performance + // Extreme note threshold - above this, skip content search entirely + EXTREME_NOTE_SIZE_THRESHOLD: 5 * 1024 * 1024, // 5MB - title search only // Absolute hard limits for extreme cases - only to prevent system crashes ABSOLUTE_MAX_CONTENT_SIZE: 100 * 1024 * 1024, // 100MB - extreme upper limit to prevent OOM ABSOLUTE_MAX_WORD_COUNT: 2000000, // 2M words - extreme upper limit for word processing @@ -227,7 +229,7 @@ export function validateAndPreprocessContent(content: string, noteId?: string): return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT).join(' '); } - // Notes above LARGE_NOTE_THRESHOLD will use optimized search strategy + // Notes above LARGE_NOTE_SIZE_THRESHOLD (250KB) will use optimized search strategy // (handled in note_content_fulltext.ts) return content;