diff --git a/.jules/bolt.md b/.jules/bolt.md new file mode 100644 index 0000000..3f785ee --- /dev/null +++ b/.jules/bolt.md @@ -0,0 +1,4 @@ + +## 2024-05-24 - [Optimize LaTeX Section Parsing] +**Learning:** For heavy text parsing in this codebase (like LaTeX section extraction), single-pass global regular expressions (e.g., using `matchAll` or `regex.exec`) combined with `substring` extraction are significantly faster and use much less memory compared to line-by-line splitting (`split('\n')`) and character-by-character string building. Using `indexOf('\n', lastNewlineIndex)` allows lazy computation of line numbers without splitting the whole document. +**Action:** Prefer single-pass regexes and substring methods over splitting whole documents into arrays for textual analysis or extraction where performance is critical. diff --git a/src/utils/parseSections.ts b/src/utils/parseSections.ts index 6b34d65..4da3b48 100644 --- a/src/utils/parseSections.ts +++ b/src/utils/parseSections.ts @@ -17,28 +17,22 @@ function extractBraceContent(content: string, startIndex: number): { content: st let depth = 1; let i = startIndex + 1; - let result = ''; while (i < content.length && depth > 0) { if (content[i] === '\\' && i + 1 < content.length) { - // Handle escaped character (e.g., \{, \}, \\) - result += content[i]; - i++; - result += content[i]; - i++; + // Skip escaped character (e.g., \{, \}, \\) + i += 2; } else if (content[i] === '{') { depth++; - result += content[i]; i++; } else if (content[i] === '}') { depth--; if (depth === 0) { - return { content: result, endIndex: i }; + // ⚡ Bolt: Use `substring` to avoid the overhead of building the string character-by-character + return { content: content.substring(startIndex + 1, i), endIndex: i }; } - result += content[i]; i++; } else { - result += content[i]; i++; } } @@ -56,51 +50,42 @@ function extractBraceContent(content: string, startIndex: number): { content: st */ export function parseSections(content: string): Section[] { const sections: Section[] = []; - const lines = content.split('\n'); - lines.forEach((line, lineNumber) => { - // Check for \section or \section* commands - let match = line.match(/\\section\*?\{/); - if (match) { - const braceIndex = match.index! + match[0].length - 1; // Index of the opening brace - const braceContent = extractBraceContent(line, braceIndex); - if (braceContent) { - sections.push({ - level: 1, - title: braceContent.content, - line: lineNumber + 1 - }); - } - } + // ⚡ Bolt: Single-pass global regex search instead of memory-heavy content.split('\n') + // This avoids allocating a large array of lines and running regexes on every single line. + const regex = /\\(section|subsection|subsubsection)\*?\{/g; + let match; + + let currentLine = 1; + let lastNewlineIndex = 0; + + while ((match = regex.exec(content)) !== null) { + // Determine level from the captured group + const levelStr = match[1]; + const level = levelStr === 'section' ? 1 : levelStr === 'subsection' ? 2 : 3; - // Check for \subsection or \subsection* commands - match = line.match(/\\subsection\*?\{/); - if (match) { - const braceIndex = match.index! + match[0].length - 1; // Index of the opening brace - const braceContent = extractBraceContent(line, braceIndex); - if (braceContent) { - sections.push({ - level: 2, - title: braceContent.content, - line: lineNumber + 1 - }); - } + const braceIndex = match.index + match[0].length - 1; // Index of the opening brace + + // ⚡ Bolt: Lazily count newlines up to the current match index + // This is much faster than splitting the entire document upfront. + let newlineIndex = content.indexOf('\n', lastNewlineIndex); + while (newlineIndex !== -1 && newlineIndex < match.index) { + currentLine++; + lastNewlineIndex = newlineIndex + 1; + newlineIndex = content.indexOf('\n', lastNewlineIndex); } - // Check for \subsubsection or \subsubsection* commands - match = line.match(/\\subsubsection\*?\{/); - if (match) { - const braceIndex = match.index! + match[0].length - 1; // Index of the opening brace - const braceContent = extractBraceContent(line, braceIndex); - if (braceContent) { - sections.push({ - level: 3, - title: braceContent.content, - line: lineNumber + 1 - }); - } + const braceContent = extractBraceContent(content, braceIndex); + if (braceContent) { + sections.push({ + level, + title: braceContent.content, + line: currentLine + }); + // Skip ahead to avoid finding sections inside other section titles + regex.lastIndex = braceContent.endIndex; } - }); + } return sections; } \ No newline at end of file