Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,12 @@ docker compose up postgres ingester # Run ingestion

Key fixtures: `client`, `mock_agent`, `mock_vector_db`, `mock_lm`, `sample_documents`

Always run tests before committing.

### Lint and formatting

Lints are managed by the Trunk tool, which can be run with `trunk check --fix`. Always lint and format before committing.

## Adding a New Documentation Source

1. **Ingester** (`ingesters/src/ingesters/`):
Expand Down
209 changes: 108 additions & 101 deletions ingesters/src/ingesters/CairoBookIngester.ts
Original file line number Diff line number Diff line change
@@ -1,24 +1,20 @@
import { type BookConfig } from '../utils/types';
import { MarkdownIngester } from './MarkdownIngester';
import { type BookChunk, DocumentSource } from '../types';
import { Document } from '@langchain/core/documents';
import { VectorStore } from '../db/postgresVectorStore';
import { type VectorStoreUpdateOptions } from '../utils/vectorStoreUtils';
import { logger } from '../utils/logger';
import * as fs from 'fs/promises';
import * as path from 'path';
import { calculateHash } from '../utils/contentUtils';
import {
RecursiveMarkdownSplitter,
type SplitOptions,
} from '../utils/RecursiveMarkdownSplitter';
import { getPythonPath } from '../utils/paths';
import { type BookChunk, DocumentSource } from '../types';
import { type BookConfig, type BookPageDto } from '../utils/types';
import { processDocFiles } from '../utils/fileUtils';
import { logger } from '../utils/logger';
import { exec as execCallback } from 'child_process';
import { promisify } from 'util';
import { MarkdownIngester } from './MarkdownIngester';

const exec = promisify(execCallback);

/**
* Ingester for the Cairo Book documentation
*
* This ingester downloads the Cairo Book documentation from GitHub releases,
* processes the markdown files, and creates chunks for the vector store.
* This ingester clones the Cairo Book documentation from the main branch,
* builds the mdbook, processes the markdown files, and creates chunks for the vector store.
*/
export class CairoBookIngester extends MarkdownIngester {
/**
Expand All @@ -41,116 +37,127 @@ export class CairoBookIngester extends MarkdownIngester {
}

/**
* Read the pre-summarized core library documentation file
* Get the directory path for extracting files
*
* @returns string - Path to the extract directory
*/
async readSummaryFile(): Promise<string> {
const summaryPath = getPythonPath(
'src',
'cairo_coder_tools',
'ingestion',
'generated',
'cairo_book_summary.md',
);

logger.info(`Reading core library summary from ${summaryPath}`);
const text = await fs.readFile(summaryPath, 'utf-8');
return text;
protected getExtractDir(): string {
const { getTempDir } = require('../utils/paths');
return getTempDir('cairo-book');
}

/**
* Chunk the core library summary file using RecursiveMarkdownSplitter
*
* This function takes the markdown content and splits it using a recursive
* strategy that respects headers, code blocks, and maintains overlap between chunks.
* Clone and process the Cairo Book documentation
*
* @param text - The markdown content to chunk
* @returns Promise<Document<BookChunk>[]> - Array of document chunks
* @returns Promise<BookPageDto[]> - Array of book pages
*/
async chunkSummaryFile(text: string): Promise<Document<BookChunk>[]> {
// Configure the splitter with appropriate settings
const splitOptions: SplitOptions = {
maxChars: 2048,
minChars: 500,
overlap: 256,
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
preserveCodeBlocks: true,
idPrefix: 'cairo-book',
trim: true,
};
protected override async downloadAndExtractDocs(): Promise<BookPageDto[]> {
logger.info('Cloning and processing Cairo Book docs');
const extractDir = this.getExtractDir();
// clear extract dir
await fs.rm(extractDir, { recursive: true, force: true });

// Clone the repository
await this.downloadAndExtractRepo(extractDir);

// Update book.toml configuration
await this.updateBookConfig(extractDir);

// Build the mdbook
await this.buildMdBook(extractDir);

// Create the splitter and split the content
const splitter = new RecursiveMarkdownSplitter(splitOptions);
const chunks = splitter.splitMarkdownToChunks(text);

logger.info(
`Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
);

// Convert chunks to Document<BookChunk> format
const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
const contentHash = calculateHash(chunk.content);

return new Document<BookChunk>({
pageContent: chunk.content,
metadata: {
name: chunk.meta.title,
title: chunk.meta.title,
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
contentHash: contentHash,
uniqueId: chunk.meta.uniqueId,
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
source: this.source,
},
});
});

return localChunks;
// Process the markdown files
const srcDir = path.join(extractDir, 'book', 'markdown');
const pages = await processDocFiles(this.config, srcDir);

return pages;
}

/**
* Core Library specific processing based on the pre-summarized markdown file
* @param vectorStore
* Clone the repository from the main branch
*
* @param extractDir - The directory to clone into
*/
public override async process(
vectorStore: VectorStore,
options?: VectorStoreUpdateOptions,
): Promise<void> {
try {
// 1. Read the pre-summarized documentation
const text = await this.readSummaryFile();
private async downloadAndExtractRepo(extractDir: string): Promise<void> {
const repoUrl = `https://github.com/${this.config.repoOwner}/${this.config.repoName}.git`;

// 2. Create chunks from the documentation
const chunks = await this.chunkSummaryFile(text);
logger.info(`Cloning repository from ${repoUrl}`);

logger.info(
`Created ${chunks.length} chunks from Cairo Book documentation`,
try {
await exec(
`git clone --depth 1 --branch main "${repoUrl}" "${extractDir}"`,
);

// 3. Update the vector store with the chunks
await this.updateVectorStore(vectorStore, chunks, options);

// 4. Clean up any temporary files (no temp files in this case)
await this.cleanupDownloadedFiles();
logger.info('Repository cloned successfully.');
} catch (error) {
this.handleError(error);
logger.error('Error cloning repository:', error);
throw new Error('Failed to clone repository');
}
}

/**
* Get the directory path for extracting files
* Update the book.toml configuration
*
* @returns string - Path to the extract directory
* @param extractDir - The directory containing the book.toml file
*/
protected getExtractDir(): string {
const { getTempDir } = require('../utils/paths');
return getTempDir('corelib-docs');
private async updateBookConfig(extractDir: string): Promise<void> {
const bookTomlPath = path.join(extractDir, 'book.toml');

try {
let bookToml = await fs.readFile(bookTomlPath, 'utf8');

// Remove the quiz-cairo preprocessor section if it exists
bookToml = bookToml.replace(
/\[preprocessor\.quiz-cairo\][\s\S]*?(?=\n\[|$)/g,
'',
);

// Remove the cairo preprocessor section if it exists
bookToml = bookToml.replace(
/\[preprocessor\.cairo\][\s\S]*?(?=\n\[|$)/g,
'',
);

// Remove the gettext preprocessor section if it exists
bookToml = bookToml.replace(
/\[preprocessor\.gettext\][\s\S]*?(?=\n\[|$)/g,
'',
);

// Add [output.markdown] if it doesn't exist
if (!bookToml.includes('[output.markdown]')) {
bookToml += '\n[output.markdown]\n';
}

await fs.writeFile(bookTomlPath, bookToml);
logger.info('Updated book.toml configuration');
} catch (error) {
logger.error('Error updating book.toml:', error);
throw new Error('Failed to update book.toml configuration');
}
}

/**
* Override cleanupDownloadedFiles since we don't download anything
* Build the mdbook
*
* @param extractDir - The directory containing the mdbook
*/
protected override async cleanupDownloadedFiles(): Promise<void> {
// No cleanup needed as we're reading from a local file
logger.info('No cleanup needed - using local summary file');
private async buildMdBook(extractDir: string): Promise<void> {
try {
logger.info('Building mdbook...');
try {
await exec('mdbook --version');
} catch (error) {
logger.error('mdbook is not installed on this system');
throw new Error(
'mdbook is not installed. Please install mdbook to continue: https://rust-lang.github.io/mdBook/guide/installation.html',
);
}

await exec('mdbook build', { cwd: extractDir });
logger.info('mdbook build completed successfully');
} catch (error) {
logger.error('Error building mdbook:', error);
throw error;
}
}
}
Loading