Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
### Fixes

- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
- Chunking: detect leading frontmatter blocks and keep them together as their own chunk instead of splitting metadata across semantic chunks. Markdown title extraction now prefers frontmatter `title` before falling back to headings or filenames.
- Fix: preserve original filename case in `handelize()`. The previous
`.toLowerCase()` call made indexed paths unreachable on case-sensitive
filesystems (Linux). `qmd update` automatically migrates legacy
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ Template placeholders:

- **Path**: Collection-relative path (e.g., `docs/guide.md`)
- **Docid**: Short hash identifier (e.g., `#a1b2c3`) - use with `qmd get #a1b2c3`
- **Title**: Extracted from document (first heading or filename)
- **Title**: Extracted from document (frontmatter `title`, first heading, or filename)
- **Context**: Path context if configured via `qmd context add`
- **Score**: Color-coded (green >70%, yellow >40%, dim otherwise)
- **Snippet**: Context around match with query terms highlighted
Expand Down Expand Up @@ -860,6 +860,8 @@ The squared distance decay means a heading 200 tokens back (score ~30) still bea

**Code Fence Protection:** Break points inside code blocks are ignored—code stays together. If a code block exceeds the chunk size, it's kept whole when possible.

**Frontmatter Protection:** Documents that start with frontmatter (`--- ... ---` or `+++ ... +++`) keep that metadata together as a dedicated first chunk instead of mixing it into later content chunks.

**AST-Aware Chunking (Code Files):**

For supported code files, QMD also parses the source with [tree-sitter](https://tree-sitter.github.io/) and adds AST-derived break points that are merged with the regex scores above:
Expand Down
21 changes: 21 additions & 0 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
});

nodeModulesHashes = {
x86_64-linux = "sha256-D0ezO4vqq4iswcAMU2DCql9ZAQvh3me6N9aDB5roq4w=";
aarch64-darwin = "sha256-qU+9KdR/nTocelyANS09I/4yaQ+7s1LvJNqB27IOK/c=";
x86_64-linux = "sha256-utVpJ8sPILpNwYXjvUruXud8VNYnPEwOYJQ1mPEEZX8=";
aarch64-darwin = "sha256-qB6unHoVs+nTS38wDshsbbjsKppHo1CUmeHQK8Zfk4E=";

# Populate these on first build for additional hosts if/when needed.
aarch64-linux = pkgs.lib.fakeHash;
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"@modelcontextprotocol/sdk": "1.29.0",
"better-sqlite3": "12.8.0",
"fast-glob": "3.3.3",
"gray-matter": "^4.0.3",
"node-llama-cpp": "3.18.1",
"picomatch": "4.0.4",
"sqlite-vec": "0.1.9",
Expand Down
133 changes: 130 additions & 3 deletions src/store.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ import type { Database } from "./db.js";
import picomatch from "picomatch";
import { createHash } from "crypto";
import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs";
import { createRequire } from "node:module";
// Note: node:path resolve is not imported — we export our own cross-platform resolve()
import fastGlob from "fast-glob";
import YAML from "yaml";
import {
LlamaCpp,
getDefaultLlamaCpp,
Expand Down Expand Up @@ -254,7 +256,7 @@ export function mergeBreakPoints(a: BreakPoint[], b: BreakPoint[]): BreakPoint[]
* Core chunk algorithm that operates on precomputed break points and code fences.
* This is the shared implementation used by both regex-only and AST-aware chunking.
*/
export function chunkDocumentWithBreakPoints(
function chunkDocumentCore(
content: string,
breakPoints: BreakPoint[],
codeFences: CodeFenceRegion[],
Expand Down Expand Up @@ -306,6 +308,46 @@ export function chunkDocumentWithBreakPoints(
return chunks;
}

/**
 * Chunk a document, giving any leading frontmatter block its own dedicated
 * first chunk so metadata never gets mixed into semantic content chunks.
 * Documents without frontmatter fall straight through to the core algorithm.
 */
export function chunkDocumentWithBreakPoints(
  content: string,
  breakPoints: BreakPoint[],
  codeFences: CodeFenceRegion[],
  maxChars: number = CHUNK_SIZE_CHARS,
  overlapChars: number = CHUNK_OVERLAP_CHARS,
  windowChars: number = CHUNK_WINDOW_CHARS
): { text: string; pos: number }[] {
  const frontmatter = extractFrontmatter(content);

  // No frontmatter detected: chunk the whole document as-is.
  if (!frontmatter) {
    return chunkDocumentCore(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
  }

  // The entire document is frontmatter: emit it as one chunk.
  if (frontmatter.raw.length >= content.length) {
    return [{ text: content, pos: 0 }];
  }

  // Re-base break points and code fences from document coordinates onto
  // body coordinates (everything after the frontmatter block).
  const offset = frontmatter.raw.length;

  const rebasedBreaks: BreakPoint[] = [];
  for (const bp of breakPoints) {
    if (bp.pos >= offset) {
      rebasedBreaks.push({ ...bp, pos: bp.pos - offset });
    }
  }

  const rebasedFences: { start: number; end: number }[] = [];
  for (const fence of codeFences) {
    if (fence.end <= offset) continue; // fence lies entirely inside the frontmatter
    const start = Math.max(0, fence.start - offset);
    const end = Math.max(0, fence.end - offset);
    if (end > 0) {
      rebasedFences.push({ start, end });
    }
  }

  const bodyChunks = chunkDocumentCore(
    frontmatter.body,
    rebasedBreaks,
    rebasedFences,
    maxChars,
    overlapChars,
    windowChars,
  );

  // Frontmatter leads as its own chunk; body chunk positions are shifted
  // back into whole-document coordinates.
  const result: { text: string; pos: number }[] = [{ text: frontmatter.raw, pos: 0 }];
  for (const chunk of bodyChunks) {
    result.push({ text: chunk.text, pos: chunk.pos + offset });
  }
  return result;
}

// Hybrid query: strong BM25 signal detection thresholds
// Skip expensive LLM expansion when top result is strong AND clearly separated from runner-up
export const STRONG_SIGNAL_MIN_SCORE = 0.85;
Expand Down Expand Up @@ -1686,7 +1728,7 @@ export function createStore(dbPath?: string): Store {
export type DocumentResult = {
filepath: string; // Full filesystem path
displayPath: string; // Short display path (e.g., "docs/readme.md")
title: string; // Document title (from first heading or filename)
title: string; // Document title (from frontmatter, first heading, or filename)
context: string | null; // Folder context description if configured
hash: string; // Content hash for caching/change detection
docid: string; // Short docid (first 6 chars of hash) for quick reference
Expand Down Expand Up @@ -2032,6 +2074,84 @@ export async function hashContent(content: string): Promise<string> {
return hash.digest("hex");
}

// Load gray-matter through createRequire so this ES module can consume the
// CommonJS package directly, without relying on bundler interop behavior.
const require = createRequire(import.meta.url);
const grayMatter = require("gray-matter") as typeof import("gray-matter");
// gray-matter option set for TOML-style `+++` fences (its default is YAML `---`).
const PLUS_FRONTMATTER_OPTIONS = { delimiters: "+++" } as const;

/** Parsed leading frontmatter block of a document. */
interface FrontmatterInfo {
  /** The full frontmatter block exactly as it appears in the source, fences included. */
  raw: string;
  /** Document content that follows the frontmatter block. */
  body: string;
  /** Inner frontmatter text between the fences (delimiters excluded). */
  matter: string;
  /** Parsed metadata object, or null when the matter could not be parsed into a plain object. */
  data: Record<string, unknown> | null;
  /** Language tag reported by gray-matter when available; null for the regex fallback path. */
  language: string | null;
}

/** Narrow an unknown value to a plain object (non-null, non-array). */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) return false;
  return typeof value === "object";
}

/**
 * Best-effort parse of raw frontmatter text into a metadata object.
 * Tries YAML first, then JSON; returns null when neither yields a plain object.
 */
function parseFrontmatterData(raw: string): Record<string, unknown> | null {
  const attempts: Array<() => unknown> = [
    () => YAML.parse(raw) as unknown,
    () => JSON.parse(raw) as unknown,
  ];

  for (const attempt of attempts) {
    try {
      const value = attempt();
      if (isRecord(value)) return value;
    } catch {
      // Parse failure: fall through to the next format.
    }
  }

  return null;
}

/**
 * Detect and parse leading frontmatter using gray-matter with the given
 * delimiter options. Returns null when no frontmatter is present or the
 * block cannot be parsed.
 */
function extractFrontmatterWithOptions(content: string, options?: { delimiters: string }): FrontmatterInfo | null {
  // Cheap structural probe before paying for a full parse.
  if (!grayMatter.test(content, options)) return null;

  let result: ReturnType<typeof grayMatter>;
  try {
    result = grayMatter(content, options);
  } catch {
    // Malformed frontmatter: treat the document as having none.
    return null;
  }

  // Length of the consumed frontmatter prefix, fences included.
  const prefixLength = content.length - result.content.length;
  if (prefixLength <= 0) return null;

  // If gray-matter produced a non-object payload, retry on the raw matter text.
  const data = isRecord(result.data) ? result.data : parseFrontmatterData(result.matter);

  return {
    raw: content.slice(0, prefixLength),
    body: result.content,
    matter: result.matter,
    data,
    language: result.language || null,
  };
}

/**
 * Regex fallback for frontmatter detection, covering YAML (`---`) and TOML
 * (`+++`) fences. Tolerates an optional BOM, trailing text on the opening
 * fence line, and CRLF line endings.
 */
function extractFrontmatterByDelimiter(content: string): FrontmatterInfo | null {
  const fencePatterns: RegExp[] = [
    /^(?:\uFEFF)?---[^\r\n]*\r?\n([\s\S]*?)\r?\n---[^\S\r\n]*(?:\r?\n|$)/,
    /^(?:\uFEFF)?\+\+\+[^\r\n]*\r?\n([\s\S]*?)\r?\n\+\+\+[^\S\r\n]*(?:\r?\n|$)/,
  ];

  for (const fencePattern of fencePatterns) {
    const hit = fencePattern.exec(content);
    if (hit === null) continue;

    const raw = hit[0];
    const matter = hit[1] ?? "";
    return {
      raw,
      body: content.slice(raw.length),
      matter,
      data: parseFrontmatterData(matter),
      language: null, // delimiter match alone carries no language information
    };
  }

  return null;
}

/**
 * Extract leading frontmatter, trying gray-matter's YAML defaults first,
 * then TOML `+++` fences, then the hand-rolled regex fallback.
 */
function extractFrontmatter(content: string): FrontmatterInfo | null {
  const yamlStyle = extractFrontmatterWithOptions(content);
  if (yamlStyle) return yamlStyle;

  const tomlStyle = extractFrontmatterWithOptions(content, PLUS_FRONTMATTER_OPTIONS);
  if (tomlStyle) return tomlStyle;

  return extractFrontmatterByDelimiter(content);
}

const titleExtractors: Record<string, (content: string) => string | null> = {
'.md': (content) => {
const match = content.match(/^##?\s+(.+)$/m);
Expand All @@ -2055,10 +2175,17 @@ const titleExtractors: Record<string, (content: string) => string | null> = {
};

export function extractTitle(content: string, filename: string): string {
const frontmatter = extractFrontmatter(content);
const frontmatterTitle = frontmatter?.data?.title;
if (typeof frontmatterTitle === "string" && frontmatterTitle.trim().length > 0) {
return frontmatterTitle.trim();
}

const ext = filename.slice(filename.lastIndexOf('.')).toLowerCase();
const extractor = titleExtractors[ext];
const bodyContent = frontmatter?.body ?? content;
if (extractor) {
const title = extractor(content);
const title = extractor(bodyContent);
if (title) return title;
}
return filename.replace(/\.[^.]+$/, "").split("/").pop() || filename;
Expand Down
Loading
Loading