Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ packages/**/dist

/data

.trunk
!.trunk/trunk.yaml
!.trunk/.gitignore

Expand Down
5 changes: 5 additions & 0 deletions .trunk/configs/ruff.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Generic, formatter-friendly config.
select = ["B", "D3", "E", "F"]

# Never enforce `E501` (line length violations). This should be handled by formatters.
ignore = ["E501"]
188 changes: 120 additions & 68 deletions ingesters/src/ingesters/CoreLibDocsIngester.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import * as fs from 'fs/promises';
import * as path from 'path';
import { exec as execCallback } from 'child_process';
import { promisify } from 'util';
import { type BookConfig } from '../utils/types';
import { MarkdownIngester } from './MarkdownIngester';
import { type BookChunk, DocumentSource } from '../types';
Expand All @@ -8,17 +10,15 @@ import { VectorStore } from '../db/postgresVectorStore';
import { type VectorStoreUpdateOptions } from '../utils/vectorStoreUtils';
import { logger } from '../utils/logger';
import { calculateHash } from '../utils/contentUtils';
import {
RecursiveMarkdownSplitter,
type SplitOptions,
} from '../utils/RecursiveMarkdownSplitter';
import { getPythonPath } from '../utils/paths';
import { parseMdxFile, type ParsedMdxDoc } from '../utils/MdxParser';
import { formatAsApiIndex } from '../utils/ApiIndexFormatter';

/**
* Ingester for the Cairo Core Library documentation
*
* This ingester processes the pre-summarized Cairo Core Library documentation
* from a local markdown file and creates chunks for the vector store.
* This ingester pulls structured corelib MDX docs from starknet-docs,
* formats them into a compact API index, and creates module-level chunks.
*/
export class CoreLibDocsIngester extends MarkdownIngester {
/**
Expand All @@ -27,23 +27,88 @@ export class CoreLibDocsIngester extends MarkdownIngester {
constructor() {
// Define the configuration for the Cairo Core Library
const config: BookConfig = {
repoOwner: 'enitrat',
repoName: 'cairo-docs',
fileExtensions: ['.md'],
repoOwner: 'starknet-io',
repoName: 'starknet-docs',
fileExtensions: ['.mdx'],
chunkSize: 4096,
chunkOverlap: 512,
baseUrl: 'https://docs.starknet.io/build/corelib/intro',
baseUrl: 'https://docs.starknet.io/build/corelib',
urlSuffix: '',
useUrlMapping: true,
sourceDir: 'build/corelib',
};

super(config, DocumentSource.CORELIB_DOCS);
}

/**
* Read the pre-summarized core library documentation file
* Clone the corelib documentation repository
*/
async readCorelibSummaryFile(): Promise<string> {
private async cloneRepo(): Promise<string> {
const extractDir = this.getExtractDir();
const repoUrl = `https://github.com/${this.config.repoOwner}/${this.config.repoName}.git`;
const exec = promisify(execCallback);

logger.info(`Cloning repository from ${repoUrl}`);

await fs.rm(extractDir, { recursive: true, force: true }).catch(() => {});
await exec(`git clone --depth 1 ${repoUrl} ${extractDir}`);

logger.info('Repository cloned successfully.');
return extractDir;
}

/**
* Read and parse all corelib MDX files into structured docs
*/
/**
 * Read and parse all corelib MDX files into structured docs.
 *
 * Files are read and parsed in parallel (they are independent of each
 * other); the result order matches the order returned by collectMdxFiles.
 *
 * @param repoPath - local path of the cloned docs repository
 * @returns one ParsedMdxDoc per discovered MDX file
 */
private async parseCorelibMdx(repoPath: string): Promise<ParsedMdxDoc[]> {
  const sourceDir = this.config.sourceDir ?? 'build/corelib';
  const corelibDir = path.join(repoPath, sourceDir);
  const mdxFiles = await this.collectMdxFiles(corelibDir);
  logger.info(`Found ${mdxFiles.length} corelib MDX files.`);

  // Parallelize the independent file reads instead of awaiting sequentially
  // in a loop; Promise.all preserves the input ordering.
  return Promise.all(
    mdxFiles.map(async (filePath) => {
      const content = await fs.readFile(filePath, 'utf8');
      // Normalize path separators to '/' so derived ids/URLs are
      // OS-independent.
      const relativePath = path
        .relative(corelibDir, filePath)
        .split(path.sep)
        .join('/');
      return parseMdxFile(content, relativePath);
    }),
  );
}

/**
* Collect MDX files from a directory (recursively).
*/
/**
 * Recursively collect all files under `directory` whose extension matches
 * one of the configured fileExtensions (compared case-insensitively).
 *
 * @param directory - root directory to scan
 * @returns absolute paths of every matching file, depth-first
 */
private async collectMdxFiles(directory: string): Promise<string[]> {
  const collected: string[] = [];
  const entries = await fs.readdir(directory, { withFileTypes: true });

  for (const entry of entries) {
    const entryPath = path.join(directory, entry.name);

    if (entry.isDirectory()) {
      // Depth-first descent into subdirectories.
      collected.push(...(await this.collectMdxFiles(entryPath)));
      continue;
    }

    const extension = path.extname(entry.name).toLowerCase();
    if (entry.isFile() && this.config.fileExtensions.includes(extension)) {
      collected.push(entryPath);
    }
  }

  return collected;
}

/**
* Save formatted API index to the generated corelib summary file
*/
private async saveApiIndex(apiIndex: string): Promise<void> {
const summaryPath = getPythonPath(
'src',
'cairo_coder_tools',
Expand All @@ -52,88 +117,74 @@ export class CoreLibDocsIngester extends MarkdownIngester {
'corelib_summary.md',
);

logger.info(`Reading core library summary from ${summaryPath}`);
const text = await fs.readFile(summaryPath, 'utf-8');
return text;
await fs.mkdir(path.dirname(summaryPath), { recursive: true });
await fs.writeFile(summaryPath, apiIndex, 'utf8');
logger.info(`Saved API index to ${summaryPath}`);
}

/**
* Chunk the core library summary file using RecursiveMarkdownSplitter
*
* This function takes the markdown content and splits it using a recursive
* strategy that respects headers, code blocks, and maintains overlap between chunks.
*
* @param text - The markdown content to chunk
* @returns Promise<Document<BookChunk>[]> - Array of document chunks
* Chunk the API index by module blocks
*/
async chunkCorelibSummaryFile(text: string): Promise<Document<BookChunk>[]> {
logger.info(
'Using RecursiveMarkdownSplitter to chunk Core Library documentation',
);

// Configure the splitter with appropriate settings
const splitOptions: SplitOptions = {
maxChars: 2048,
minChars: 500,
overlap: 256,
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
preserveCodeBlocks: true,
idPrefix: 'corelib',
trim: true,
};

// Create the splitter and split the content
const splitter = new RecursiveMarkdownSplitter(splitOptions);
const chunks = splitter.splitMarkdownToChunks(text);

logger.info(
`Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
);

// Convert chunks to Document<BookChunk> format
const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
const contentHash = calculateHash(chunk.content);
private chunkApiIndex(apiIndex: string): Document<BookChunk>[] {
const blocks = apiIndex
.split(/\n{2,}(?=\[module\]\s+)/)
.map((block) => block.trim())
.filter(Boolean);

return blocks.map((block) => {
const moduleMatch = block.match(/^\[module\]\s+(.+)$/m);
const modulePath = moduleMatch ? moduleMatch[1].trim() : 'corelib';
const urlMatch = block.match(/^\[url\]\s+(.+)$/m);
const sourceLink = urlMatch ? urlMatch[1].trim() : this.config.baseUrl;
const contentHash = calculateHash(block);

return new Document<BookChunk>({
pageContent: chunk.content,
pageContent: block,
metadata: {
name: chunk.meta.title,
title: chunk.meta.title,
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
contentHash: contentHash,
uniqueId: chunk.meta.uniqueId,
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
name: modulePath,
title: modulePath,
chunkNumber: 0,
contentHash,
uniqueId: `${modulePath}-0`,
sourceLink,
source: this.source,
},
});
});

return localChunks;
}

/**
* Core Library specific processing based on the pre-summarized markdown file
* Core Library specific processing based on the structured MDX docs
* @param vectorStore
*/
public override async process(
vectorStore: VectorStore,
options?: VectorStoreUpdateOptions,
): Promise<void> {
try {
// 1. Read the pre-summarized documentation
const text = await this.readCorelibSummaryFile();
// 1. Clone the repository
const repoPath = await this.cloneRepo();

// 2. Parse corelib MDX files
const docs = await this.parseCorelibMdx(repoPath);

// 3. Format as compact API index
const apiIndex = formatAsApiIndex(docs);

// 4. Save the API index to disk
await this.saveApiIndex(apiIndex);

// 2. Create chunks from the documentation
const chunks = await this.chunkCorelibSummaryFile(text);
// 5. Create chunks from the API index
const chunks = this.chunkApiIndex(apiIndex);

logger.info(
`Created ${chunks.length} chunks from core library documentation`,
);

// 3. Update the vector store with the chunks
// 6. Update the vector store with the chunks
await this.updateVectorStore(vectorStore, chunks, options);

// 4. Clean up any temporary files (no temp files in this case)
// 7. Clean up cloned repo
await this.cleanupDownloadedFiles();
} catch (error) {
this.handleError(error);
Expand All @@ -151,10 +202,11 @@ export class CoreLibDocsIngester extends MarkdownIngester {
}

/**
* Override cleanupDownloadedFiles since we don't download anything
* Clean up cloned repository files
*/
protected override async cleanupDownloadedFiles(): Promise<void> {
// No cleanup needed as we're reading from a local file
logger.info('No cleanup needed - using local summary file');
const extractDir = this.getExtractDir();
await fs.rm(extractDir, { recursive: true, force: true });
logger.info(`Deleted downloaded corelib docs from ${extractDir}`);
}
}
Loading