diff --git a/CHANGELOG.md b/CHANGELOG.md index d530dfaa..f1e3cce1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Fixes - GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529 +- Chunking: detect leading frontmatter blocks and keep them together as their own chunk instead of splitting metadata across semantic chunks. Markdown title extraction now prefers frontmatter `title` before falling back to headings or filenames. - Fix: preserve original filename case in `handelize()`. The previous `.toLowerCase()` call made indexed paths unreachable on case-sensitive filesystems (Linux). `qmd update` automatically migrates legacy diff --git a/README.md b/README.md index 6f318446..ede27fad 100644 --- a/README.md +++ b/README.md @@ -715,7 +715,7 @@ Template placeholders: - **Path**: Collection-relative path (e.g., `docs/guide.md`) - **Docid**: Short hash identifier (e.g., `#a1b2c3`) - use with `qmd get #a1b2c3` -- **Title**: Extracted from document (first heading or filename) +- **Title**: Extracted from document (frontmatter `title`, first heading, or filename) - **Context**: Path context if configured via `qmd context add` - **Score**: Color-coded (green >70%, yellow >40%, dim otherwise) - **Snippet**: Context around match with query terms highlighted @@ -860,6 +860,8 @@ The squared distance decay means a heading 200 tokens back (score ~30) still bea **Code Fence Protection:** Break points inside code blocks are ignored—code stays together. If a code block exceeds the chunk size, it's kept whole when possible. +**Frontmatter Protection:** Documents that start with frontmatter (`--- ... ---` or `+++ ... +++`) keep that metadata together as a dedicated first chunk instead of mixing it into later content chunks. + **AST-Aware Chunking (Code Files):** For supported code files, QMD also parses the source with [tree-sitter](https://tree-sitter.github.io/) and adds AST-derived break points that are merged with the regex scores above: diff --git a/bun.lock b/bun.lock index a96f0964..6b02ddd0 100644 --- a/bun.lock +++ b/bun.lock @@ -8,6 +8,7 @@ "@modelcontextprotocol/sdk": "1.29.0", "better-sqlite3": "12.8.0", "fast-glob": "3.3.3", + "gray-matter": "^4.0.3", "node-llama-cpp": "3.18.1", "picomatch": "4.0.4", "sqlite-vec": "0.1.9", @@ -241,6 +242,8 @@ "ansi-styles": ["ansi-styles@6.2.3", "", {}, "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg=="], + "argparse": ["argparse@1.0.10", "", { "dependencies": { "sprintf-js": "~1.0.2" } }, "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg=="], + "assertion-error": ["assertion-error@2.0.1", "", {}, "sha512-Izi8RQcffqCeNVgFigKli1ssklIbpHnCYc6AknXGYoB6grJqyeby7jv12JUQgmTAnIDnbck1uxksT4dzN3PWBA=="], "async-retry": ["async-retry@1.3.3", "", { "dependencies": { "retry": "0.13.1" } }, "sha512-wfr/jstw9xNi/0teMHrRW7dsz3Lt5ARhYNZ2ewpadnhaIp5mbALhOAP+EAdsC7t4Z6wqsDVv9+W6gm1Dk9mEyw=="], @@ -343,6 +346,8 @@ "escape-html": ["escape-html@1.0.3", "", {}, "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow=="], + "esprima": ["esprima@4.0.1", "", { "bin": { "esparse": "./bin/esparse.js", "esvalidate": "./bin/esvalidate.js" } }, "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A=="], + "estree-walker": ["estree-walker@3.0.3", "", { "dependencies": { "@types/estree": "^1.0.0" } }, "sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g=="], "etag": ["etag@1.8.1", "", {}, "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg=="], @@ -361,6 +366,8 @@ "express-rate-limit": ["express-rate-limit@8.3.2", "", { "dependencies": { "ip-address": "10.1.0" }, "peerDependencies": { "express": ">= 4.11" } }, "sha512-77VmFeJkO0/rvimEDuUC5H30oqUC4EyOhyGccfqoLebB0oiEYfM7nwPrsDsBL1gsTpwfzX8SFy2MT3TDyRq+bg=="], + "extend-shallow": ["extend-shallow@2.0.1", "", { "dependencies": { "is-extendable": "^0.1.0" } }, "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug=="], + "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="], "fast-glob": ["fast-glob@3.3.3", "", { "dependencies": { "@nodelib/fs.stat": "^2.0.2", "@nodelib/fs.walk": "^1.2.3", "glob-parent": "^5.1.2", "merge2": "^1.3.0", "micromatch": "^4.0.8" } }, "sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg=="], @@ -411,6 +418,8 @@ "graceful-fs": ["graceful-fs@4.2.11", "", {}, "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ=="], + "gray-matter": ["gray-matter@4.0.3", "", { "dependencies": { "js-yaml": "^3.13.1", "kind-of": "^6.0.2", "section-matter": "^1.0.0", "strip-bom-string": "^1.0.0" } }, "sha512-5v6yZd4JK3eMI3FqqCouswVqwugaA9r4dNZB1wwcmrD02QkV5H0y7XBQW8QwQqEaZY1pM9aqORSORhJRdNK44Q=="], + "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="], "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="], @@ -435,6 +444,8 @@ "ipull": ["ipull@3.9.5", "", { "dependencies": { "@tinyhttp/content-disposition": "^2.2.0", "async-retry": "^1.3.3", "chalk": "^5.3.0", "ci-info": "^4.0.0", "cli-spinners": "^2.9.2", "commander": "^10.0.0", "eventemitter3": "^5.0.1", "filenamify": "^6.0.0", "fs-extra": "^11.1.1", "is-unicode-supported": "^2.0.0", "lifecycle-utils": "^2.0.1", "lodash.debounce": "^4.0.8", "lowdb": "^7.0.1", "pretty-bytes": "^6.1.0", "pretty-ms": "^8.0.0", "sleep-promise": "^9.1.0", "slice-ansi": "^7.1.0", "stdout-update": "^4.0.1", "strip-ansi": "^7.1.0" }, "optionalDependencies": { "@reflink/reflink": "^0.1.16" }, "bin": { "ipull": "dist/cli/cli.js" } }, "sha512-5w/yZB5lXmTfsvNawmvkCjYo4SJNuKQz/av8TC1UiOyfOHyaM+DReqbpU2XpWYfmY+NIUbRRH8PUAWsxaS+IfA=="], + "is-extendable": ["is-extendable@0.1.1", "", {}, "sha512-5BMULNob1vgFX6EjQw5izWDxrecWK9AM72rugNr0TFldMOi0fj6Jk+zeKIt0xGj4cEfQIJth4w3OKWOJ4f+AFw=="], + "is-extglob": ["is-extglob@2.1.1", "", {}, "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ=="], "is-fullwidth-code-point": ["is-fullwidth-code-point@5.1.0", "", { "dependencies": { "get-east-asian-width": "^1.3.1" } }, "sha512-5XHYaSyiqADb4RnZ1Bdad6cPp8Toise4TzEjcOYDHZkTCbKgiUl7WTUCpNWHuxmDt91wnsZBc9xinNzopv3JMQ=="], @@ -455,12 +466,16 @@ "js-tokens": ["js-tokens@9.0.1", "", {}, "sha512-mxa9E9ITFOt0ban3j6L5MpjwegGz6lBQmM1IJkWeBZGcMxto50+eWdjC/52xDbS2vy0k7vIMK0Fe2wfL9OQSpQ=="], + "js-yaml": ["js-yaml@3.14.2", "", { "dependencies": { "argparse": "^1.0.7", "esprima": "^4.0.0" }, "bin": { "js-yaml": "bin/js-yaml.js" } }, "sha512-PMSmkqxr106Xa156c2M265Z+FTrPl+oxd/rgOQy2tijQeK5TxQ43psO1ZCwhVOSdnn+RzkzlRz/eY4BgJBYVpg=="], + "json-schema-traverse": ["json-schema-traverse@1.0.0", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="], "json-schema-typed": ["json-schema-typed@8.0.2", "", {}, "sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA=="], "jsonfile": ["jsonfile@6.2.0", "", { "dependencies": { "universalify": "^2.0.0" }, "optionalDependencies": { "graceful-fs": "^4.1.6" } }, "sha512-FGuPw30AdOIUTRMC2OMRtQV+jkVj2cfPqSeWXv1NEAJ1qZ5zb1X6z1mFhbfOB/iy3ssJCD+3KuZ8r8C3uVFlAg=="], + "kind-of": ["kind-of@6.0.3", "", {}, "sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw=="], + "lifecycle-utils": ["lifecycle-utils@3.1.1", "", {}, "sha512-gNd3OvhFNjHykJE3uGntz7UuPzWlK9phrIdXxU9Adis0+ExkwnZibfxCJWiWWZ+a6VbKiZrb+9D9hCQWd4vjTg=="], "lodash.debounce": ["lodash.debounce@4.0.8", "", {}, "sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow=="], @@ -595,6 +610,8 @@ "safer-buffer": ["safer-buffer@2.1.2", "", {}, "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="], + "section-matter": ["section-matter@1.0.0", "", { "dependencies": { "extend-shallow": "^2.0.1", "kind-of": "^6.0.0" } }, "sha512-vfD3pmTzGpufjScBh50YHKzEu2lxBWhVEHsNGoEXmCmn2hKGfeNLYMzCJpe8cD7gqX7TJluOVpBkAequ6dgMmA=="], + "semver": ["semver@7.7.3", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q=="], "send": ["send@1.2.0", "", { "dependencies": { "debug": "^4.3.5", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "fresh": "^2.0.0", "http-errors": "^2.0.0", "mime-types": "^3.0.1", "ms": "^2.1.3", "on-finished": "^2.4.1", "range-parser": "^1.2.1", "statuses": "^2.0.1" } }, "sha512-uaW0WwXKpL9blXE2o0bRhoL2EGXIrZxQ2ZQ4mgcfoBxdFmQold+qWsD2jLrfZ0trjKL6vOw0j//eAwcALFjKSw=="], @@ -631,6 +648,8 @@ "source-map-js": ["source-map-js@1.2.1", "", {}, "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA=="], + "sprintf-js": ["sprintf-js@1.0.3", "", {}, "sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g=="], + "sqlite-vec": ["sqlite-vec@0.1.9", "", { "optionalDependencies": { "sqlite-vec-darwin-arm64": "0.1.9", "sqlite-vec-darwin-x64": "0.1.9", "sqlite-vec-linux-arm64": "0.1.9", "sqlite-vec-linux-x64": "0.1.9", "sqlite-vec-windows-x64": "0.1.9" } }, "sha512-L7XJWRIBNvR9O5+vh1FQ+IGkh/3D2AzVksW5gdtk28m78Hy8skFD0pqReKH1Yp0/BUKRGcffgKvyO/EON5JXpA=="], "sqlite-vec-darwin-arm64": ["sqlite-vec-darwin-arm64@0.1.9", "", { "os": "darwin", "cpu": "arm64" }, "sha512-jSsZpE42OfBkGL/ItyJTVCUwl6o6Ka3U5rc4j+UBDIQzC1ulSSKMEhQLthsOnF/MdAf1MuAkYhkdKmmcjaIZQg=="], @@ -661,6 +680,8 @@ "strip-ansi": ["strip-ansi@7.2.0", "", { "dependencies": { "ansi-regex": "^6.2.2" } }, "sha512-yDPMNjp4WyfYBkHnjIRLfca1i6KMyGCtsVgoKe/z1+6vukgaENdgGBZt+ZmKPc4gavvEZ5OgHfHdrazhgNyG7w=="], + "strip-bom-string": ["strip-bom-string@1.0.0", "", {}, "sha512-uCC2VHvQRYu+lMh4My/sFNmF2klFymLX1wHJeXnbEJERpV/ZsVuonzerjfrGpIGF7LBVa1O7i9kjiWvJiFck8g=="], + "strip-json-comments": ["strip-json-comments@2.0.1", "", {}, "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ=="], "strip-literal": ["strip-literal@3.1.0", "", { "dependencies": { "js-tokens": "^9.0.1" } }, "sha512-8r3mkIM/2+PpjHoOtiAW8Rg3jJLHaV7xPwG+YRGrv6FP0wwk/toTpATxWYOW0BKdWwl82VT2tFYi5DlROa0Mxg=="], diff --git a/flake.nix b/flake.nix index 36450130..76dc287e 100644 --- a/flake.nix +++ b/flake.nix @@ -44,8 +44,8 @@ }); nodeModulesHashes = { - x86_64-linux = "sha256-D0ezO4vqq4iswcAMU2DCql9ZAQvh3me6N9aDB5roq4w="; - aarch64-darwin = "sha256-qU+9KdR/nTocelyANS09I/4yaQ+7s1LvJNqB27IOK/c="; + x86_64-linux = "sha256-utVpJ8sPILpNwYXjvUruXud8VNYnPEwOYJQ1mPEEZX8="; + aarch64-darwin = "sha256-qB6unHoVs+nTS38wDshsbbjsKppHo1CUmeHQK8Zfk4E="; # Populate these on first build for additional hosts if/when needed. aarch64-linux = pkgs.lib.fakeHash; diff --git a/package.json b/package.json index 0ec04c9c..de2d432c 100644 --- a/package.json +++ b/package.json @@ -48,6 +48,7 @@ "@modelcontextprotocol/sdk": "1.29.0", "better-sqlite3": "12.8.0", "fast-glob": "3.3.3", + "gray-matter": "^4.0.3", "node-llama-cpp": "3.18.1", "picomatch": "4.0.4", "sqlite-vec": "0.1.9", diff --git a/src/store.ts b/src/store.ts index 16a55b7d..eaffee5a 100644 --- a/src/store.ts +++ b/src/store.ts @@ -16,8 +16,10 @@ import type { Database } from "./db.js"; import picomatch from "picomatch"; import { createHash } from "crypto"; import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs"; +import { createRequire } from "node:module"; // Note: node:path resolve is not imported — we export our own cross-platform resolve() import fastGlob from "fast-glob"; +import YAML from "yaml"; import { LlamaCpp, getDefaultLlamaCpp, @@ -254,7 +256,7 @@ export function mergeBreakPoints(a: BreakPoint[], b: BreakPoint[]): BreakPoint[] * Core chunk algorithm that operates on precomputed break points and code fences. * This is the shared implementation used by both regex-only and AST-aware chunking. */ -export function chunkDocumentWithBreakPoints( +function chunkDocumentCore( content: string, breakPoints: BreakPoint[], codeFences: CodeFenceRegion[], @@ -306,6 +308,46 @@ export function chunkDocumentWithBreakPoints( return chunks; } +export function chunkDocumentWithBreakPoints( + content: string, + breakPoints: BreakPoint[], + codeFences: CodeFenceRegion[], + maxChars: number = CHUNK_SIZE_CHARS, + overlapChars: number = CHUNK_OVERLAP_CHARS, + windowChars: number = CHUNK_WINDOW_CHARS +): { text: string; pos: number }[] { + const frontmatter = extractFrontmatter(content); + if (!frontmatter) { + return chunkDocumentCore(content, breakPoints, codeFences, maxChars, overlapChars, windowChars); + } + + if (frontmatter.raw.length >= content.length) { + return [{ text: content, pos: 0 }]; + } + + const offset = frontmatter.raw.length; + const bodyBreakPoints = breakPoints + .filter(bp => bp.pos >= offset) + .map(bp => ({ ...bp, pos: bp.pos - offset })); + const bodyCodeFences = codeFences + .filter(fence => fence.end > offset) + .map(fence => ({ + start: Math.max(0, fence.start - offset), + end: Math.max(0, fence.end - offset), + })) + .filter(fence => fence.end > 0); + const bodyChunks = chunkDocumentCore( + frontmatter.body, + bodyBreakPoints, + bodyCodeFences, + maxChars, + overlapChars, + windowChars, + ).map(chunk => ({ text: chunk.text, pos: chunk.pos + offset })); + + return [{ text: frontmatter.raw, pos: 0 }, ...bodyChunks]; +} + // Hybrid query: strong BM25 signal detection thresholds // Skip expensive LLM expansion when top result is strong AND clearly separated from runner-up export const STRONG_SIGNAL_MIN_SCORE = 0.85; @@ -1686,7 +1728,7 @@ export function createStore(dbPath?: string): Store { export type DocumentResult = { filepath: string; // Full filesystem path displayPath: string; // Short display path (e.g., "docs/readme.md") - title: string; // Document title (from first heading or filename) + title: string; // Document title (from frontmatter, first heading, or filename) context: string | null; // Folder context description if configured hash: string; // Content hash for caching/change detection docid: string; // Short docid (first 6 chars of hash) for quick reference @@ -2032,6 +2074,84 @@ export async function hashContent(content: string): Promise { return hash.digest("hex"); } +const require = createRequire(import.meta.url); +const grayMatter = require("gray-matter") as typeof import("gray-matter"); +const PLUS_FRONTMATTER_OPTIONS = { delimiters: "+++" } as const; + +interface FrontmatterInfo { + raw: string; + body: string; + matter: string; + data: Record | null; + language: string | null; +} + +function isRecord(value: unknown): value is Record { + return value !== null && typeof value === "object" && !Array.isArray(value); +} + +function parseFrontmatterData(raw: string): Record | null { + for (const parser of [() => YAML.parse(raw) as unknown, () => JSON.parse(raw) as unknown]) { + try { + const parsed = parser(); + if (isRecord(parsed)) return parsed; + } catch { + // Try the next parser. + } + } + return null; +} + +function extractFrontmatterWithOptions(content: string, options?: { delimiters: string }): FrontmatterInfo | null { + if (!grayMatter.test(content, options)) return null; + + try { + const parsed = grayMatter(content, options); + const rawLength = content.length - parsed.content.length; + if (rawLength <= 0) return null; + + return { + raw: content.slice(0, rawLength), + body: parsed.content, + matter: parsed.matter, + data: isRecord(parsed.data) ? parsed.data : parseFrontmatterData(parsed.matter), + language: parsed.language || null, + }; + } catch { + return null; + } +} + +function extractFrontmatterByDelimiter(content: string): FrontmatterInfo | null { + const patterns = [ + /^(?:\uFEFF)?---[^\r\n]*\r?\n([\s\S]*?)\r?\n---[^\S\r\n]*(?:\r?\n|$)/, + /^(?:\uFEFF)?\+\+\+[^\r\n]*\r?\n([\s\S]*?)\r?\n\+\+\+[^\S\r\n]*(?:\r?\n|$)/, + ]; + + for (const pattern of patterns) { + const match = content.match(pattern); + if (!match) continue; + + const raw = match[0]; + const matter = match[1] ?? ""; + return { + raw, + body: content.slice(raw.length), + matter, + data: parseFrontmatterData(matter), + language: null, + }; + } + + return null; +} + +function extractFrontmatter(content: string): FrontmatterInfo | null { + return extractFrontmatterWithOptions(content) + ?? extractFrontmatterWithOptions(content, PLUS_FRONTMATTER_OPTIONS) + ?? extractFrontmatterByDelimiter(content); +} + const titleExtractors: Record string | null> = { '.md': (content) => { const match = content.match(/^##?\s+(.+)$/m); @@ -2055,10 +2175,17 @@ const titleExtractors: Record string | null> = { }; export function extractTitle(content: string, filename: string): string { + const frontmatter = extractFrontmatter(content); + const frontmatterTitle = frontmatter?.data?.title; + if (typeof frontmatterTitle === "string" && frontmatterTitle.trim().length > 0) { + return frontmatterTitle.trim(); + } + const ext = filename.slice(filename.lastIndexOf('.')).toLowerCase(); const extractor = titleExtractors[ext]; + const bodyContent = frontmatter?.body ?? content; if (extractor) { - const title = extractor(content); + const title = extractor(bodyContent); if (title) return title; } return filename.replace(/\.[^.]+$/, "").split("/").pop() || filename; diff --git a/test/store.test.ts b/test/store.test.ts index 93bfb7db..97d2ec72 100644 --- a/test/store.test.ts +++ b/test/store.test.ts @@ -49,6 +49,7 @@ import { STRONG_SIGNAL_MIN_SCORE, STRONG_SIGNAL_MIN_GAP, generateEmbeddings, + _resetProductionModeForTesting, type Store, type DocumentResult, type SearchResult, @@ -281,6 +282,10 @@ describe("Store Creation", () => { const originalIndexPath = process.env.INDEX_PATH; delete process.env.INDEX_PATH; + // Bun runs test files in a shared process, so top-level CLI imports in other + // test files can flip store.ts into production mode. + _resetProductionModeForTesting(); + expect(() => createStore()).toThrow("Database path not set"); // Restore @@ -388,6 +393,26 @@ describe("Document Helpers", () => { expect(extractTitle(content, "file.md")).toBe("My Title"); }); + test("extractTitle prefers YAML frontmatter title", () => { + const content = `--- +title: Frontmatter Title +--- + +# Heading Title +`; + expect(extractTitle(content, "file.md")).toBe("Frontmatter Title"); + }); + + test("extractTitle parses JSON frontmatter title", () => { + const content = `---json +{"title":"JSON Frontmatter Title"} +--- + +# Heading Title +`; + expect(extractTitle(content, "file.md")).toBe("JSON Frontmatter Title"); + }); + test("extractTitle extracts H2 heading if no H1", () => { const content = "## My Subtitle\n\nSome content here."; expect(extractTitle(content, "file.md")).toBe("My Subtitle"); @@ -407,6 +432,17 @@ describe("Document Helpers", () => { const content = "# 📝 Notes\n\n## Meeting Summary\n\nContent"; expect(extractTitle(content, "file.md")).toBe("Meeting Summary"); }); + + test("extractTitle ignores frontmatter when looking for markdown headings", () => { + const content = `--- +summary: | + # Not a title +--- + +# Actual Title +`; + expect(extractTitle(content, "file.md")).toBe("Actual Title"); + }); }); // ============================================================================= @@ -869,6 +905,41 @@ Final section content. expect(chunk.pos).toBeGreaterThanOrEqual(0); } }); + + test("chunkDocument keeps frontmatter in its own chunk", () => { + const content = `--- +title: Frontmatter Title +author: Example +--- + +# Heading + +${"Body paragraph. ".repeat(20)}`; + const bodyStart = content.indexOf("\n# Heading"); + const chunks = chunkDocument(content, 40, 0, 10); + + expect(bodyStart).toBeGreaterThan(0); + expect(chunks.length).toBeGreaterThan(1); + expect(chunks[0]).toEqual({ text: content.slice(0, bodyStart), pos: 0 }); + expect(chunks[1]!.pos).toBe(bodyStart); + expect(chunks[1]!.text.startsWith("\n# Heading")).toBe(true); + }); + + test("chunkDocument also separates TOML-style frontmatter", () => { + const content = `+++ +title = "Frontmatter Title" ++++ + +# Heading + +${"Body paragraph. ".repeat(20)}`; + const bodyStart = content.indexOf("\n# Heading"); + const chunks = chunkDocument(content, 40, 0, 10); + + expect(bodyStart).toBeGreaterThan(0); + expect(chunks[0]).toEqual({ text: content.slice(0, bodyStart), pos: 0 }); + expect(chunks[1]!.pos).toBe(bodyStart); + }); }); // =============================================================================