Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ packages/**/dist

/data

.trunk
!.trunk/trunk.yaml
!.trunk/.gitignore

Expand Down
5 changes: 5 additions & 0 deletions .trunk/configs/ruff.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Generic, formatter-friendly config.
select = ["B", "D3", "E", "F"]

# Never enforce `E501` (line length violations). This should be handled by formatters.
ignore = ["E501"]
188 changes: 120 additions & 68 deletions ingesters/src/ingesters/CoreLibDocsIngester.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import * as fs from 'fs/promises';
import * as path from 'path';
import { exec as execCallback } from 'child_process';
import { promisify } from 'util';
import { type BookConfig } from '../utils/types';
import { MarkdownIngester } from './MarkdownIngester';
import { type BookChunk, DocumentSource } from '../types';
Expand All @@ -8,17 +10,15 @@ import { VectorStore } from '../db/postgresVectorStore';
import { type VectorStoreUpdateOptions } from '../utils/vectorStoreUtils';
import { logger } from '../utils/logger';
import { calculateHash } from '../utils/contentUtils';
import {
RecursiveMarkdownSplitter,
type SplitOptions,
} from '../utils/RecursiveMarkdownSplitter';
import { getPythonPath } from '../utils/paths';
import { parseMdxFile, type ParsedMdxDoc } from '../utils/MdxParser';
import { formatAsApiIndex } from '../utils/ApiIndexFormatter';

/**
* Ingester for the Cairo Core Library documentation
*
* This ingester processes the pre-summarized Cairo Core Library documentation
* from a local markdown file and creates chunks for the vector store.
* This ingester pulls structured corelib MDX docs from starknet-docs,
* formats them into a compact API index, and creates module-level chunks.
*/
export class CoreLibDocsIngester extends MarkdownIngester {
/**
Expand All @@ -27,23 +27,88 @@ export class CoreLibDocsIngester extends MarkdownIngester {
constructor() {
// Define the configuration for the Cairo Core Library
const config: BookConfig = {
repoOwner: 'enitrat',
repoName: 'cairo-docs',
fileExtensions: ['.md'],
repoOwner: 'starknet-io',
repoName: 'starknet-docs',
fileExtensions: ['.mdx'],
chunkSize: 4096,
chunkOverlap: 512,
baseUrl: 'https://docs.starknet.io/build/corelib/intro',
baseUrl: 'https://docs.starknet.io/build/corelib',
urlSuffix: '',
useUrlMapping: true,
sourceDir: 'build/corelib',
};

super(config, DocumentSource.CORELIB_DOCS);
}

/**
* Read the pre-summarized core library documentation file
* Clone the corelib documentation repository
*/
async readCorelibSummaryFile(): Promise<string> {
private async cloneRepo(): Promise<string> {
const extractDir = this.getExtractDir();
const repoUrl = `https://github.com/${this.config.repoOwner}/${this.config.repoName}.git`;
const exec = promisify(execCallback);

logger.info(`Cloning repository from ${repoUrl}`);

await fs.rm(extractDir, { recursive: true, force: true }).catch(() => {});
await exec(`git clone --depth 1 ${repoUrl} ${extractDir}`);

logger.info('Repository cloned successfully.');
return extractDir;
}

/**
* Read and parse all corelib MDX files into structured docs
*/
/**
 * Read and parse all corelib MDX files into structured docs.
 *
 * Files are read and parsed in parallel (they are independent of each
 * other); the result order matches the order returned by collectMdxFiles.
 *
 * @param repoPath - local path of the cloned docs repository
 * @returns one ParsedMdxDoc per discovered MDX file
 */
private async parseCorelibMdx(repoPath: string): Promise<ParsedMdxDoc[]> {
  const sourceDir = this.config.sourceDir ?? 'build/corelib';
  const corelibDir = path.join(repoPath, sourceDir);
  const mdxFiles = await this.collectMdxFiles(corelibDir);
  logger.info(`Found ${mdxFiles.length} corelib MDX files.`);

  // Parallelize the independent file reads instead of awaiting sequentially
  // in a loop; Promise.all preserves the input ordering.
  return Promise.all(
    mdxFiles.map(async (filePath) => {
      const content = await fs.readFile(filePath, 'utf8');
      // Normalize path separators to '/' so derived ids/URLs are
      // OS-independent.
      const relativePath = path
        .relative(corelibDir, filePath)
        .split(path.sep)
        .join('/');
      return parseMdxFile(content, relativePath);
    }),
  );
}

/**
* Collect MDX files from a directory (recursively).
*/
/**
 * Recursively collect all files under `directory` whose extension matches
 * one of the configured fileExtensions (compared case-insensitively).
 *
 * @param directory - root directory to scan
 * @returns absolute paths of every matching file, depth-first
 */
private async collectMdxFiles(directory: string): Promise<string[]> {
  const collected: string[] = [];
  const entries = await fs.readdir(directory, { withFileTypes: true });

  for (const entry of entries) {
    const entryPath = path.join(directory, entry.name);

    if (entry.isDirectory()) {
      // Depth-first descent into subdirectories.
      collected.push(...(await this.collectMdxFiles(entryPath)));
      continue;
    }

    const extension = path.extname(entry.name).toLowerCase();
    if (entry.isFile() && this.config.fileExtensions.includes(extension)) {
      collected.push(entryPath);
    }
  }

  return collected;
}

/**
* Save formatted API index to the generated corelib summary file
*/
private async saveApiIndex(apiIndex: string): Promise<void> {
const summaryPath = getPythonPath(
'src',
'cairo_coder_tools',
Expand All @@ -52,88 +117,74 @@ export class CoreLibDocsIngester extends MarkdownIngester {
'corelib_summary.md',
);

logger.info(`Reading core library summary from ${summaryPath}`);
const text = await fs.readFile(summaryPath, 'utf-8');
return text;
await fs.mkdir(path.dirname(summaryPath), { recursive: true });
await fs.writeFile(summaryPath, apiIndex, 'utf8');
logger.info(`Saved API index to ${summaryPath}`);
}

/**
* Chunk the core library summary file using RecursiveMarkdownSplitter
*
* This function takes the markdown content and splits it using a recursive
* strategy that respects headers, code blocks, and maintains overlap between chunks.
*
* @param text - The markdown content to chunk
* @returns Promise<Document<BookChunk>[]> - Array of document chunks
* Chunk the API index by module blocks
*/
async chunkCorelibSummaryFile(text: string): Promise<Document<BookChunk>[]> {
logger.info(
'Using RecursiveMarkdownSplitter to chunk Core Library documentation',
);

// Configure the splitter with appropriate settings
const splitOptions: SplitOptions = {
maxChars: 2048,
minChars: 500,
overlap: 256,
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
preserveCodeBlocks: true,
idPrefix: 'corelib',
trim: true,
};

// Create the splitter and split the content
const splitter = new RecursiveMarkdownSplitter(splitOptions);
const chunks = splitter.splitMarkdownToChunks(text);

logger.info(
`Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
);

// Convert chunks to Document<BookChunk> format
const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
const contentHash = calculateHash(chunk.content);
private chunkApiIndex(apiIndex: string): Document<BookChunk>[] {
const blocks = apiIndex
.split(/\n{2,}(?=\[module\]\s+)/)
.map((block) => block.trim())
.filter(Boolean);

return blocks.map((block) => {
const moduleMatch = block.match(/^\[module\]\s+(.+)$/m);
const modulePath = moduleMatch ? moduleMatch[1].trim() : 'corelib';
const urlMatch = block.match(/^\[url\]\s+(.+)$/m);
const sourceLink = urlMatch ? urlMatch[1].trim() : this.config.baseUrl;
const contentHash = calculateHash(block);

return new Document<BookChunk>({
pageContent: chunk.content,
pageContent: block,
metadata: {
name: chunk.meta.title,
title: chunk.meta.title,
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
contentHash: contentHash,
uniqueId: chunk.meta.uniqueId,
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
name: modulePath,
title: modulePath,
chunkNumber: 0,
contentHash,
uniqueId: `${modulePath}-0`,
sourceLink,
source: this.source,
},
});
});

return localChunks;
}

/**
* Core Library specific processing based on the pre-summarized markdown file
* Core Library specific processing based on the structured MDX docs
* @param vectorStore
*/
public override async process(
vectorStore: VectorStore,
options?: VectorStoreUpdateOptions,
): Promise<void> {
try {
// 1. Read the pre-summarized documentation
const text = await this.readCorelibSummaryFile();
// 1. Clone the repository
const repoPath = await this.cloneRepo();

// 2. Parse corelib MDX files
const docs = await this.parseCorelibMdx(repoPath);

// 3. Format as compact API index
const apiIndex = formatAsApiIndex(docs);

// 4. Save the API index to disk
await this.saveApiIndex(apiIndex);

// 2. Create chunks from the documentation
const chunks = await this.chunkCorelibSummaryFile(text);
// 5. Create chunks from the API index
const chunks = this.chunkApiIndex(apiIndex);

logger.info(
`Created ${chunks.length} chunks from core library documentation`,
);

// 3. Update the vector store with the chunks
// 6. Update the vector store with the chunks
await this.updateVectorStore(vectorStore, chunks, options);

// 4. Clean up any temporary files (no temp files in this case)
// 7. Clean up cloned repo
await this.cleanupDownloadedFiles();
} catch (error) {
this.handleError(error);
Expand All @@ -151,10 +202,11 @@ export class CoreLibDocsIngester extends MarkdownIngester {
}

/**
* Override cleanupDownloadedFiles since we don't download anything
* Clean up cloned repository files
*/
protected override async cleanupDownloadedFiles(): Promise<void> {
// No cleanup needed as we're reading from a local file
logger.info('No cleanup needed - using local summary file');
const extractDir = this.getExtractDir();
await fs.rm(extractDir, { recursive: true, force: true });
logger.info(`Deleted downloaded corelib docs from ${extractDir}`);
}
}
Loading