Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,12 @@ docker compose up postgres ingester # Run ingestion

Key fixtures: `client`, `mock_agent`, `mock_vector_db`, `mock_lm`, `sample_documents`

Always run tests before committing.

### Lint and formatting

Lints are managed by the Trunk tool, which can be run with `trunk check --fix`. Always lint and format before committing.

## Adding a New Documentation Source

1. **Ingester** (`ingesters/src/ingesters/`):
Expand Down
209 changes: 108 additions & 101 deletions ingesters/src/ingesters/CairoBookIngester.ts
Original file line number Diff line number Diff line change
@@ -1,24 +1,20 @@
import { type BookConfig } from '../utils/types';
import { MarkdownIngester } from './MarkdownIngester';
import { type BookChunk, DocumentSource } from '../types';
import { Document } from '@langchain/core/documents';
import { VectorStore } from '../db/postgresVectorStore';
import { type VectorStoreUpdateOptions } from '../utils/vectorStoreUtils';
import { logger } from '../utils/logger';
import * as fs from 'fs/promises';
import * as path from 'path';
import { calculateHash } from '../utils/contentUtils';
import {
RecursiveMarkdownSplitter,
type SplitOptions,
} from '../utils/RecursiveMarkdownSplitter';
import { getPythonPath } from '../utils/paths';
import { type BookChunk, DocumentSource } from '../types';
import { type BookConfig, type BookPageDto } from '../utils/types';
import { processDocFiles } from '../utils/fileUtils';
import { logger } from '../utils/logger';
import { exec as execCallback } from 'child_process';
import { promisify } from 'util';
import { MarkdownIngester } from './MarkdownIngester';

const exec = promisify(execCallback);

/**
* Ingester for the Cairo Book documentation
*
* This ingester downloads the Cairo Book documentation from GitHub releases,
* processes the markdown files, and creates chunks for the vector store.
* This ingester clones the Cairo Book documentation from the main branch,
* builds the mdbook, processes the markdown files, and creates chunks for the vector store.
*/
export class CairoBookIngester extends MarkdownIngester {
/**
Expand All @@ -41,116 +37,127 @@ export class CairoBookIngester extends MarkdownIngester {
}

/**
* Read the pre-summarized core library documentation file
* Get the directory path for extracting files
*
* @returns string - Path to the extract directory
*/
async readSummaryFile(): Promise<string> {
const summaryPath = getPythonPath(
'src',
'cairo_coder_tools',
'ingestion',
'generated',
'cairo_book_summary.md',
);

logger.info(`Reading core library summary from ${summaryPath}`);
const text = await fs.readFile(summaryPath, 'utf-8');
return text;
protected getExtractDir(): string {
const { getTempDir } = require('../utils/paths');
return getTempDir('cairo-book');
}

/**
* Chunk the core library summary file using RecursiveMarkdownSplitter
*
* This function takes the markdown content and splits it using a recursive
* strategy that respects headers, code blocks, and maintains overlap between chunks.
* Clone and process the Cairo Book documentation
*
* @param text - The markdown content to chunk
* @returns Promise<Document<BookChunk>[]> - Array of document chunks
* @returns Promise<BookPageDto[]> - Array of book pages
*/
async chunkSummaryFile(text: string): Promise<Document<BookChunk>[]> {
// Configure the splitter with appropriate settings
const splitOptions: SplitOptions = {
maxChars: 2048,
minChars: 500,
overlap: 256,
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
preserveCodeBlocks: true,
idPrefix: 'cairo-book',
trim: true,
};
protected override async downloadAndExtractDocs(): Promise<BookPageDto[]> {
logger.info('Cloning and processing Cairo Book docs');
const extractDir = this.getExtractDir();
// clear extract dir
await fs.rm(extractDir, { recursive: true, force: true });

// Clone the repository
await this.downloadAndExtractRepo(extractDir);

// Update book.toml configuration
await this.updateBookConfig(extractDir);

// Build the mdbook
await this.buildMdBook(extractDir);

// Create the splitter and split the content
const splitter = new RecursiveMarkdownSplitter(splitOptions);
const chunks = splitter.splitMarkdownToChunks(text);

logger.info(
`Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
);

// Convert chunks to Document<BookChunk> format
const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
const contentHash = calculateHash(chunk.content);

return new Document<BookChunk>({
pageContent: chunk.content,
metadata: {
name: chunk.meta.title,
title: chunk.meta.title,
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
contentHash: contentHash,
uniqueId: chunk.meta.uniqueId,
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
source: this.source,
},
});
});

return localChunks;
// Process the markdown files
const srcDir = path.join(extractDir, 'book', 'markdown');
const pages = await processDocFiles(this.config, srcDir);

return pages;
}

/**
* Core Library specific processing based on the pre-summarized markdown file
* @param vectorStore
* Clone the repository from the main branch
*
* @param extractDir - The directory to clone into
*/
public override async process(
vectorStore: VectorStore,
options?: VectorStoreUpdateOptions,
): Promise<void> {
try {
// 1. Read the pre-summarized documentation
const text = await this.readSummaryFile();
private async downloadAndExtractRepo(extractDir: string): Promise<void> {
const repoUrl = `https://github.com/${this.config.repoOwner}/${this.config.repoName}.git`;

// 2. Create chunks from the documentation
const chunks = await this.chunkSummaryFile(text);
logger.info(`Cloning repository from ${repoUrl}`);

logger.info(
`Created ${chunks.length} chunks from Cairo Book documentation`,
try {
await exec(
`git clone --depth 1 --branch main "${repoUrl}" "${extractDir}"`,
);

// 3. Update the vector store with the chunks
await this.updateVectorStore(vectorStore, chunks, options);

// 4. Clean up any temporary files (no temp files in this case)
await this.cleanupDownloadedFiles();
logger.info('Repository cloned successfully.');
} catch (error) {
this.handleError(error);
logger.error('Error cloning repository:', error);
throw new Error('Failed to clone repository');
}
}

/**
* Get the directory path for extracting files
* Update the book.toml configuration
*
* @returns string - Path to the extract directory
* @param extractDir - The directory containing the book.toml file
*/
protected getExtractDir(): string {
const { getTempDir } = require('../utils/paths');
return getTempDir('corelib-docs');
private async updateBookConfig(extractDir: string): Promise<void> {
const bookTomlPath = path.join(extractDir, 'book.toml');

try {
let bookToml = await fs.readFile(bookTomlPath, 'utf8');

// Remove the quiz-cairo preprocessor section if it exists
bookToml = bookToml.replace(
/\[preprocessor\.quiz-cairo\][\s\S]*?(?=\n\[|$)/g,
'',
);

// Remove the cairo preprocessor section if it exists
bookToml = bookToml.replace(
/\[preprocessor\.cairo\][\s\S]*?(?=\n\[|$)/g,
'',
);

// Remove the gettext preprocessor section if it exists
bookToml = bookToml.replace(
/\[preprocessor\.gettext\][\s\S]*?(?=\n\[|$)/g,
'',
);

// Add [output.markdown] if it doesn't exist
if (!bookToml.includes('[output.markdown]')) {
bookToml += '\n[output.markdown]\n';
}

await fs.writeFile(bookTomlPath, bookToml);
logger.info('Updated book.toml configuration');
} catch (error) {
logger.error('Error updating book.toml:', error);
throw new Error('Failed to update book.toml configuration');
}
}

/**
* Override cleanupDownloadedFiles since we don't download anything
* Build the mdbook
*
* @param extractDir - The directory containing the mdbook
*/
protected override async cleanupDownloadedFiles(): Promise<void> {
// No cleanup needed as we're reading from a local file
logger.info('No cleanup needed - using local summary file');
private async buildMdBook(extractDir: string): Promise<void> {
try {
logger.info('Building mdbook...');
try {
await exec('mdbook --version');
} catch (error) {
logger.error('mdbook is not installed on this system');
throw new Error(
'mdbook is not installed. Please install mdbook to continue: https://rust-lang.github.io/mdBook/guide/installation.html',
);
}

await exec('mdbook build', { cwd: extractDir });
logger.info('mdbook build completed successfully');
} catch (error) {
logger.error('Error building mdbook:', error);
throw error;
}
}
}
Loading