Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 186 additions & 0 deletions packages/fern-docs/bundle/src/server/llm-txt-md.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
import { convertToLlmTxtMarkdown } from "./llm-txt-md";

// Behavioural tests for convertToLlmTxtMarkdown: every case feeds a markdown/MDX
// string through the converter and checks the rendered output with substring
// assertions, so the tests stay independent of exact whitespace in the output.
describe("llm-txt-md", () => {
  describe("convertToLlmTxtMarkdown", () => {
    // Plain markdown: the page title passed as the second argument becomes the heading.
    it("should handle regular markdown without MDX components", () => {
      const markdown = "# Test\n\nThis is regular markdown.";
      const result = convertToLlmTxtMarkdown(markdown, "Test Page", "md");

      expect(result).toContain("# Test Page");
      expect(result).toContain("This is regular markdown.");
    });

    // Inline body of <TSFetchCodeBlock> should surface as a typescript fence.
    it("should expand TSFetchCodeBlock components to code blocks", () => {
      const markdown = `# Test

<TSFetchCodeBlock>
console.log("Hello, world!");
</TSFetchCodeBlock>`;

      const result = convertToLlmTxtMarkdown(markdown, "Test Page", "mdx");

      expect(result).toContain("# Test Page");
      expect(result).toContain("```typescript");
      expect(result).toContain('console.log("Hello, world!");');
      expect(result).not.toContain("<TSFetchCodeBlock>");
    });

    // Each <Code language="..."> child of a <CodeGroup> becomes its own fence.
    it("should expand CodeGroup components to multiple code blocks", () => {
      const markdown = `# Test

<CodeGroup>
<Code language="javascript">
console.log("JS code");
</Code>
<Code language="python">
print("Python code")
</Code>
</CodeGroup>`;

      const result = convertToLlmTxtMarkdown(markdown, "Test Page", "mdx");

      expect(result).toContain("# Test Page");
      expect(result).toContain("```javascript");
      expect(result).toContain('console.log("JS code");');
      expect(result).toContain("```python");
      expect(result).toContain('print("Python code")');
      expect(result).not.toContain("<CodeGroup>");
      // Prefix match: the opening tags carry attributes, so "<Code>" would never appear.
      expect(result).not.toContain("<Code");
    });

    // Variables supplied through the Template's data attribute replace {{NAME}} placeholders.
    it("should expand Template components with variable replacement", () => {
      const markdown = `# Test

<Template data={{"API_KEY": "test-key-123", "BASE_URL": "https://api.example.com"}}>
Use your API key: {{API_KEY}}
Base URL: {{BASE_URL}}
</Template>`;

      const result = convertToLlmTxtMarkdown(markdown, "Test Page", "mdx");

      expect(result).toContain("# Test Page");
      expect(result).toContain("Use your API key: test-key-123");
      expect(result).toContain("Base URL: https://api.example.com");
      expect(result).not.toContain("<Template");
      expect(result).not.toContain("{{API_KEY}}");
      expect(result).not.toContain("{{BASE_URL}}");
    });

    // Placeholders outside any <Template> are filled from the built-in global variables.
    it("should apply global template variables", () => {
      const markdown = `# Test

Your free credits threshold is {{FREE_MODEL_CREDITS_THRESHOLD}}.
Use API key: {{API_KEY_REF}}`;

      const result = convertToLlmTxtMarkdown(markdown, "Test Page", "mdx");

      expect(result).toContain("# Test Page");
      expect(result).toContain("Your free credits threshold is 10");
      expect(result).toContain("Use API key: your-api-key");
      expect(result).not.toContain("{{FREE_MODEL_CREDITS_THRESHOLD}}");
      expect(result).not.toContain("{{API_KEY_REF}}");
    });

    // All three component kinds plus global placeholders in a single document.
    it("should handle mixed content with multiple component types", () => {
      const markdown = `# Mixed Content Test

Regular markdown paragraph.

<TSFetchCodeBlock>
const apiKey = "{{API_KEY_REF}}";
</TSFetchCodeBlock>

<Template data={{"USER_NAME": "Alice"}}>
Hello {{USER_NAME}}!
</Template>

<CodeGroup>
<Code language="bash">
curl -H "Authorization: Bearer {{API_KEY_REF}}"
</Code>
</CodeGroup>

More regular content with {{FREE_MODEL_CREDITS_THRESHOLD}} credits.`;

      const result = convertToLlmTxtMarkdown(markdown, "Mixed Test", "mdx");

      expect(result).toContain("# Mixed Test");
      expect(result).toContain("Regular markdown paragraph.");
      expect(result).toContain("```typescript");
      expect(result).toContain('const apiKey = "your-api-key";');
      expect(result).toContain("Hello Alice!");
      expect(result).toContain("```bash");
      expect(result).toContain('curl -H "Authorization: Bearer your-api-key"');
      expect(result).toContain("More regular content with 10 credits.");

      expect(result).not.toContain("<TSFetchCodeBlock>");
      expect(result).not.toContain("<Template");
      expect(result).not.toContain("<CodeGroup>");
      // No placeholder of any kind may survive.
      expect(result).not.toContain("{{");
    });

    // Components with no body and no attributes must not leak into the output.
    it("should handle empty or malformed components gracefully", () => {
      const markdown = `# Edge Cases

<TSFetchCodeBlock></TSFetchCodeBlock>

<CodeGroup></CodeGroup>

<Template></Template>

Regular content continues.`;

      const result = convertToLlmTxtMarkdown(markdown, "Edge Cases", "mdx");

      expect(result).toContain("# Edge Cases");
      expect(result).toContain("Regular content continues.");
      expect(result).not.toContain("<TSFetchCodeBlock>");
      expect(result).not.toContain("<CodeGroup>");
      expect(result).not.toContain("<Template>");
    });

    // With format "md" the component markup is expected to pass through untouched.
    it("should preserve content when format is 'md' instead of 'mdx'", () => {
      const markdown = `# Test

<TSFetchCodeBlock>
console.log("test");
</TSFetchCodeBlock>`;

      const result = convertToLlmTxtMarkdown(markdown, "Test Page", "md");

      expect(result).toContain("# Test Page");
      expect(result).toContain("<TSFetchCodeBlock>");
    });

    // A src-only component yields a placeholder comment naming the source URL.
    it("should handle TSFetchCodeBlock with src attribute", () => {
      const markdown = `# Test

<TSFetchCodeBlock src="https://example.com/code.ts">
</TSFetchCodeBlock>`;

      const result = convertToLlmTxtMarkdown(markdown, "Test Page", "mdx");

      expect(result).toContain("# Test Page");
      expect(result).toContain("```typescript");
      expect(result).toContain("// Code from: https://example.com/code.ts");
      // Prefix match on purpose: the input tag carries a src attribute, so the
      // literal "<TSFetchCodeBlock>" never occurs in it and asserting its absence
      // would pass even if the component were left unexpanded.
      expect(result).not.toContain("<TSFetchCodeBlock");
    });

    // Frontmatter title/description take precedence over the default title argument.
    it("should extract title and description from frontmatter", () => {
      const markdown = `---
title: "Custom Title"
description: "Custom description"
---

# Heading

Content here.`;

      const result = convertToLlmTxtMarkdown(markdown, "Default Title", "md");

      expect(result).toContain("# Custom Title");
      expect(result).toContain("> Custom description");
      expect(result).toContain("Content here.");
    });
  });
});
159 changes: 158 additions & 1 deletion packages/fern-docs/bundle/src/server/llm-txt-md.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
toTree,
visit,
} from "@fern-docs/mdx";
import { applyTemplates } from "../mdx/components/code/Template";

export function convertToLlmTxtMarkdown(
markdown: string,
Expand All @@ -22,12 +23,168 @@ export function convertToLlmTxtMarkdown(
return [
`# ${title}`,
description != null ? `> ${description}` : undefined,
stripMdxFeatures(content, format),
stripMdxFeatures(expandMdxComponents(content, format), format),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there anywhere we should be adding/modifying tests for this?

]
.filter(isNonNullish)
.join("\n\n");
}

/**
 * Expands custom MDX components to their semantic markdown equivalents
 * before stripping MDX features. This ensures components like TSFetchCodeBlock
 * and Template are converted to readable content for LLM consumption.
 *
 * Handled components:
 * - `TSFetchCodeBlock` is replaced by a single `typescript` code node.
 * - `CodeGroup` is replaced by one code node per code child.
 * - `Template` is replaced by its children re-parsed after variable substitution.
 *
 * After the tree pass, the serialized markdown is run through `applyTemplates`
 * once more with the global template variables.
 *
 * @param markdown - raw page content
 * @param format - only "mdx" content is transformed; anything else is returned unchanged
 * @returns markdown with the supported components expanded
 */
function expandMdxComponents(markdown: string, format: "mdx" | "md"): string {
  if (format !== "mdx") {
    return markdown;
  }

  const { mdast } = toTree(markdown, {
    format,
    sanitize: true,
  });

  visit(mdast, (node, idx, parent) => {
    if (parent == null || idx == null) {
      return;
    }

    if (!isMdxJsxElementHast(node)) {
      return;
    }

    if (node.name === "TSFetchCodeBlock") {
      const codeContent = extractCodeFromTSFetchCodeBlock(node);
      if (codeContent) {
        const codeBlock = {
          type: "code",
          lang: "typescript",
          value: codeContent,
        };
        // One-for-one replacement: sibling indices are unaffected.
        parent.children[idx] = codeBlock;
      }
      return;
    }

    if (node.name === "CodeGroup") {
      const codeBlocks = extractCodeFromCodeGroup(node);
      if (codeBlocks.length > 0) {
        parent.children.splice(idx, 1, ...codeBlocks);
        // A numeric return value is the index of the next sibling to visit.
        // Resume right after the inserted nodes; `idx + length - 1` would
        // land on the last inserted node and visit it a second time.
        return idx + codeBlocks.length;
      }
      return;
    }

    if (node.name === "Template") {
      const templateData = extractTemplateData(node);
      if (templateData && node.children) {
        const childrenMarkdown = mdastToMarkdown({ type: "root", children: node.children });
        const expandedContent = applyTemplates(childrenMarkdown, templateData);

        const { mdast: expandedMdast } = toTree(expandedContent, { format: "md", sanitize: true });
        if (expandedMdast.children) {
          parent.children.splice(idx, 1, ...expandedMdast.children);
          // When the template expands to nothing this evaluates to `idx`, i.e.
          // the sibling that slid into the removed slot. The previous
          // `idx + length - 1` produced -1 for an empty template at index 0,
          // which ends traversal of the parent and leaves every following
          // component unexpanded.
          return idx + expandedMdast.children.length;
        }
      }
      return;
    }

    return;
  });

  let expandedMarkdown = mdastToMarkdown(mdast);

  // Placeholders outside <Template> blocks are resolved from the global variables.
  const templateData = extractGlobalTemplateData(markdown);
  if (templateData && Object.keys(templateData).length > 0) {
    expandedMarkdown = applyTemplates(expandedMarkdown, templateData);
  }

  return expandedMarkdown;
}

/**
 * Extract code content from TSFetchCodeBlock component.
 *
 * Resolution order:
 * 1. Inline body: the text of the element's `text`, `code` and `paragraph`
 *    children. Paragraphs are included because an MDX flow element holds
 *    block content, so a body written on its own lines arrives wrapped in a
 *    paragraph rather than as a bare text node.
 * 2. A string-valued `content` attribute.
 * 3. A string-valued `src` attribute, rendered as a `// Code from: <src>` comment.
 *
 * Whitespace-only bodies and non-string (expression) attribute values are
 * ignored so they cannot mask a later fallback or render as "[object Object]".
 *
 * @param node - the TSFetchCodeBlock JSX element node
 * @returns the code text, or null when nothing usable is found
 */
function extractCodeFromTSFetchCodeBlock(node: any): string | null {
  // Flattens a node into the text it carries. MDX expression nodes store their
  // source without the surrounding braces, so those are restored to keep
  // `{{PLACEHOLDER}}` style markers intact for later template substitution.
  const textOf = (current: any): string => {
    if (current == null) {
      return "";
    }
    if (typeof current.value === "string") {
      const isExpression = current.type === "mdxTextExpression" || current.type === "mdxFlowExpression";
      return isExpression ? `{${current.value}}` : current.value;
    }
    return Array.isArray(current.children) ? current.children.map(textOf).join("") : "";
  };

  const children: any[] = Array.isArray(node.children) ? node.children : [];
  const codeBearing = children.filter(
    (child) => child != null && (child.type === "text" || child.type === "code" || child.type === "paragraph")
  );
  // Block-level children were separated by blank lines in the source; inline
  // text children are simply adjacent.
  const separator = codeBearing.some((child) => child.type !== "text") ? "\n\n" : "";
  const inlineCode = codeBearing.map(textOf).join(separator);
  if (inlineCode.trim().length > 0) {
    return inlineCode;
  }

  const stringAttribute = (name: string): string | null => {
    const value = node.attributes?.find((attr: any) => attr?.name === name)?.value;
    return typeof value === "string" && value.length > 0 ? value : null;
  };

  const content = stringAttribute("content");
  if (content != null) {
    return content;
  }

  const src = stringAttribute("src");
  if (src != null) {
    return `// Code from: ${src}`;
  }

  return null;
}

/**
 * Extract code blocks from CodeGroup component.
 *
 * Two kinds of children are recognised:
 * - code nodes (`type: "code"`, or an element whose `tagName` is "code"),
 *   which are copied with their own `lang` (default "text");
 * - `<Code language="...">` JSX elements, whose language comes from the
 *   string-valued `language` attribute (default "text") and whose content is
 *   the flattened text of all their children. Flattening is required because
 *   a body written on its own lines is wrapped in a paragraph node, so reading
 *   `children[0].value` alone yields nothing.
 *
 * Any other child is ignored.
 *
 * @param node - the CodeGroup JSX element node
 * @returns code nodes in document order; empty when the group has no code children
 */
function extractCodeFromCodeGroup(node: any): any[] {
  // Flattens a node into the text it carries, restoring the braces that MDX
  // expression nodes omit from their stored source.
  const textOf = (current: any): string => {
    if (current == null) {
      return "";
    }
    if (typeof current.value === "string") {
      const isExpression = current.type === "mdxTextExpression" || current.type === "mdxFlowExpression";
      return isExpression ? `{${current.value}}` : current.value;
    }
    return Array.isArray(current.children) ? current.children.map(textOf).join("") : "";
  };

  const codeBlocks: any[] = [];
  const children: any[] = Array.isArray(node.children) ? node.children : [];

  for (const child of children) {
    if (child == null) {
      continue;
    }

    if (child.type === "code" || (child.type === "element" && child.tagName === "code")) {
      codeBlocks.push({
        type: "code",
        lang: child.lang || "text",
        value: child.value || child.children?.[0]?.value || "",
      });
      continue;
    }

    if (isMdxJsxElementHast(child) && child.name === "Code") {
      const languageAttr = child.attributes?.find((attr: any) => attr?.name === "language")?.value;
      // An expression-valued attribute is an AST node, not a usable language tag.
      const lang = typeof languageAttr === "string" && languageAttr.length > 0 ? languageAttr : "text";

      const bodyNodes: any[] = Array.isArray(child.children) ? child.children : [];
      // Block-level children were separated by blank lines in the source.
      const separator = bodyNodes.some((body) => body?.type === "paragraph" || body?.type === "code") ? "\n\n" : "";
      codeBlocks.push({
        type: "code",
        lang,
        value: bodyNodes.map(textOf).join(separator),
      });
    }
  }

  return codeBlocks;
}

/**
 * Extract template data from Template component attributes.
 *
 * The `data` attribute can arrive in two shapes:
 * - a plain object that already is the variable record, returned as-is;
 * - an `mdxJsxAttributeValueExpression` node, which is how `data={{...}}` is
 *   represented in the syntax tree. That node only carries the expression
 *   source (and, when present, its estree program), so it is evaluated here:
 *   first as JSON, then by reading literal properties from the estree
 *   ObjectExpression for object literals that are not valid JSON.
 *   NOTE(review): whether the estree survives `sanitize: true` is not visible
 *   from this function; the JSON path does not depend on it.
 *
 * Only string, number and boolean values are kept, converted to strings.
 *
 * @param node - the Template JSX element node
 * @returns the variable record, or null when no usable data attribute exists
 */
function extractTemplateData(node: any): Record<string, string> | null {
  const raw = node.attributes?.find((attr: any) => attr?.name === "data")?.value;
  if (raw == null || typeof raw !== "object") {
    return null;
  }

  if (raw.type !== "mdxJsxAttributeValueExpression") {
    // Already a plain record.
    return raw;
  }

  // Keeps primitive values only so the result honours Record<string, string>.
  const toRecord = (candidate: unknown): Record<string, string> | null => {
    if (candidate == null || typeof candidate !== "object" || Array.isArray(candidate)) {
      return null;
    }
    const record: Record<string, string> = {};
    for (const [key, value] of Object.entries(candidate)) {
      if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
        record[key] = String(value);
      }
    }
    return record;
  };

  if (typeof raw.value === "string") {
    try {
      const parsed = toRecord(JSON.parse(raw.value));
      if (parsed != null) {
        return parsed;
      }
    } catch {
      // Not JSON (e.g. unquoted keys); fall through to the estree reading below.
    }
  }

  const expression = raw.data?.estree?.body?.[0]?.expression;
  if (expression?.type !== "ObjectExpression" || !Array.isArray(expression.properties)) {
    return null;
  }

  const literals: Record<string, unknown> = {};
  for (const property of expression.properties) {
    if (property?.type !== "Property" || property.computed) {
      continue;
    }
    const key = property.key?.type === "Identifier" ? property.key.name : property.key?.value;
    if (key != null && property.value?.type === "Literal") {
      literals[String(key)] = property.value.value;
    }
  }
  return toRecord(literals);
}

/**
 * Builds the page-wide template variables.
 *
 * Starts from a fixed set of defaults (common OpenRouter template variables)
 * and, when the page frontmatter has a truthy `templateData` entry, copies
 * that entry's properties over the defaults.
 *
 * @param markdown - raw page content whose frontmatter is inspected
 * @returns the merged variable record
 */
function extractGlobalTemplateData(markdown: string): Record<string, string> {
  const variables: Record<string, string> = {
    // Placeholder threshold value.
    FREE_MODEL_CREDITS_THRESHOLD: "10",
    API_KEY_REF: "your-api-key",
    BASE_URL: "https://openrouter.ai/api/v1",
  };

  const overrides = getFrontmatter(markdown).data.templateData;
  if (overrides) {
    // Frontmatter entries win over the defaults on key collisions.
    Object.assign(variables, overrides);
  }

  return variables;
}

/**
* This is a living list of mdx features that we don't want to include in the LLM TXT format:
* - esm imports
Expand Down