diff --git a/README.md b/README.md index f08a34f..e57b203 100644 --- a/README.md +++ b/README.md @@ -60,19 +60,41 @@ npx @agentgram/ax-score https://example.com -u, --upload Upload results to AgentGram hosted API --api-url API endpoint for uploading results --api-key API key for authentication (or set AGENTGRAM_API_KEY) +-r, --repeat Run the audit N times and report score stability (default: 1) ``` +### Repeat-run stability checks + +Use `--repeat` when you want to measure score drift across sequential runs of the same URL: + +```bash +npx @agentgram/ax-score https://example.com --repeat 3 +``` + +The CLI keeps the usual report shape and adds a `Stability` block with per-run scores plus aggregate mean, range, delta, and variance. + ### Programmatic Usage ```typescript -import { runAudit } from '@agentgram/ax-score'; +import { runAudit, runRepeatedAudit } from '@agentgram/ax-score'; -const report = await runAudit({ +const singleRun = await runAudit({ url: 'https://example.com', timeout: 30000, verbose: false, }); -console.log(`Score: ${report.score}`); + +const repeatedRun = await runRepeatedAudit( + { + url: 'https://example.com', + timeout: 30000, + verbose: false, + }, + 3 +); + +console.log(`Single-run score: ${singleRun.score}`); +console.log(repeatedRun.stability); ``` --- diff --git a/docs/json-output-contract.md b/docs/json-output-contract.md index 6f7bfde..cb03589 100644 --- a/docs/json-output-contract.md +++ b/docs/json-output-contract.md @@ -17,6 +17,7 @@ The output follows the `AXReport` TypeScript interface defined in `src/types.ts` | `categories` | `AXCategory[]` | Array of category scores | | `audits` | `Record` | Map of audit ID to audit result | | `recommendations` | `Recommendation[]` | Actionable recommendations sorted by impact | +| `stability` | `StabilityResult \| undefined` | Present when the audit is run with repeat mode | --- @@ -70,6 +71,18 @@ The output follows the `AXReport` TypeScript interface defined in `src/types.ts` | `message` | 
`string` | The audit description explaining the issue | | `impact` | `number` | Potential score improvement (higher = better) | +## `StabilityResult` + +| Field | Type | Description | +| ---------- | ---------- | -------------------------------------------------------- | +| `runs` | `number` | Number of sequential audit runs | +| `scores` | `number[]` | Overall score from each run, in execution order | +| `min` | `number` | Lowest overall score across the repeated runs | +| `max` | `number` | Highest overall score across the repeated runs | +| `mean` | `number` | Mean overall score across the repeated runs, rounded to 2 decimal places | +| `delta` | `number` | `max - min`, useful for quick drift checks | +| `variance` | `number` | Population variance across the repeated overall scores, rounded to 2 decimal places | + --- ## Audit IDs @@ -181,10 +194,21 @@ There are 18 audits organized into 6 categories: "message": "Rate limit headers inform AI agents about request quotas...", "impact": 3 } - ] + ], + "stability": { + "runs": 3, + "scores": [62, 58, 61], + "min": 58, + "max": 62, + "mean": 60.33, + "delta": 4, + "variance": 2.89 + } } ``` +When `runAudit()` is used directly, or only a single run is requested (no `--repeat`, `--repeat 1`, or `runRepeatedAudit(config, 1)`), the `stability` field is omitted. 
+ --- ## Score Interpretation diff --git a/src/bin/ax-score.ts b/src/bin/ax-score.ts index 3ceb451..a6d53a0 100644 --- a/src/bin/ax-score.ts +++ b/src/bin/ax-score.ts @@ -1,8 +1,8 @@ #!/usr/bin/env node -import { Command } from 'commander'; +import { Command, InvalidArgumentError } from 'commander'; import ora from 'ora'; -import { runAudit } from '../runner.js'; +import { runRepeatedAudit } from '../runner.js'; import { renderReport } from '../reporter/cli.js'; import { renderJSON } from '../reporter/json.js'; import { uploadReport } from '../upload.js'; @@ -15,6 +15,7 @@ interface CliOptions { upload: boolean; apiUrl: string; apiKey?: string; + repeat: number; } const DEFAULT_API_URL = 'https://agentgram.co/api/v1/ax-score/scan'; @@ -34,15 +35,25 @@ program .option('-u, --upload', 'Upload results to AgentGram hosted API', false) .option('--api-url ', 'API endpoint for uploading results', DEFAULT_API_URL) .option('--api-key ', 'API key for authentication (or set AGENTGRAM_API_KEY env var)') + .option( + '-r, --repeat ', + 'Run the audit N times and report score stability', + parsePositiveInteger, + 1 + ) .action(async (url: string, options: CliOptions) => { - const spinner = ora(`Auditing ${url}...`).start(); + const repeat = options.repeat; + const spinner = ora( + repeat > 1 ? `Auditing ${url} (${repeat} runs)...` : `Auditing ${url}...` + ).start(); try { - const report = await runAudit({ + const config = { url, timeout: parseInt(options.timeout, 10), verbose: options.verbose, - }); + }; + const report = await runRepeatedAudit(config, repeat); spinner.stop(); @@ -74,9 +85,7 @@ program uploadSpinner.succeed('Results uploaded successfully.'); } catch (uploadError) { uploadSpinner.fail('Failed to upload results.'); - console.error( - uploadError instanceof Error ? uploadError.message : String(uploadError) - ); + console.error(uploadError instanceof Error ? 
uploadError.message : String(uploadError)); // Upload failure is non-fatal: still exit based on score } } @@ -90,3 +99,24 @@ program }); program.parse(); + +/** + * Parse the `--repeat` option value as a strictly positive integer. + * + * Only plain decimal digits are accepted: `Number.parseInt` alone would silently + * truncate inputs such as `3abc` or `2.5` to a valid-looking count. + * + * @param value - Raw option value from the command line. + * @returns The parsed repeat count (always >= 1). + * @throws InvalidArgumentError when `value` is not a positive integer. + */ +function parsePositiveInteger(value: string): number { + const trimmed = value.trim(); + const parsed = Number.parseInt(trimmed, 10); + + if (!/^\d+$/.test(trimmed) || !Number.isSafeInteger(parsed) || parsed < 1) { + throw new InvalidArgumentError('repeat must be a positive integer'); + } + + return parsed; +} diff --git a/src/index.ts b/src/index.ts index 4ca7c0b..e85c19c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,5 +1,5 @@ // Main API -export { runAudit } from './runner.js'; +export { runAudit, runRepeatedAudit } from './runner.js'; // Types export type { @@ -11,6 +11,7 @@ export type { AuditDetails, AuditRef, SiteType, + StabilityResult, } from './types.js'; // Base classes (for extensibility) diff --git a/src/reporter/cli.ts b/src/reporter/cli.ts index 9dc2d54..2830ae2 100644 --- a/src/reporter/cli.ts +++ b/src/reporter/cli.ts @@ -27,6 +27,10 @@ function renderCategory(category: AXCategory): string { return ` ${label.padEnd(8)} ${category.title.padEnd(20)} ${score}`; } +function formatMetric(value: number): string { + return Number.isInteger(value) ? `${value}` : value.toFixed(2); +} + /** * Render an AX report as a rich CLI output. 
*/ @@ -48,6 +52,17 @@ export function renderReport(report: AXReport): string { lines.push(renderCategory(category)); } + if (report.stability) { + const s = report.stability; + lines.push(''); + lines.push(chalk.bold(' Stability:')); + lines.push(` Runs: ${s.runs} Scores: [${s.scores.join(', ')}]`); + lines.push( + ` Mean: ${formatMetric(s.mean)} Range: ${formatMetric(s.min)}-${formatMetric(s.max)} ` + + `(delta ${formatMetric(s.delta)}) Variance: ${formatMetric(s.variance)}` + ); + } + if (report.recommendations.length > 0) { lines.push(''); lines.push(chalk.bold(' Top Fixes:')); diff --git a/src/runner.test.ts b/src/runner.test.ts index 49fd396..0b18b4e 100644 --- a/src/runner.test.ts +++ b/src/runner.test.ts @@ -1,5 +1,92 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; -import { runAudit } from './runner.js'; +import { runAudit, runRepeatedAudit } from './runner.js'; +import { renderJSON } from './reporter/json.js'; + +const RICH_HTML = ` + + Example + + + + + + + +

Welcome

+ +`; + +const MINIMAL_HTML = ` + Example +
Plain page
+`; + +function createResponse( + body: string, + status = 200, + contentType = 'text/html; charset=utf-8' +): Response { + const headers = new Map(); + headers.set('content-type', contentType); + + return { + ok: status >= 200 && status < 300, + status, + text: () => Promise.resolve(body), + headers: { + forEach: (cb: (value: string, key: string) => void) => headers.forEach((value, key) => cb(value, key)), + }, + } as unknown as Response; +} + +function installFetchMock({ + mainPages = [RICH_HTML], + llmsStatuses = [200], +}: { + mainPages?: string[]; + llmsStatuses?: number[]; +} = {}): void { + let mainPageIndex = 0; + let llmsIndex = 0; + + vi.spyOn(globalThis, 'fetch').mockImplementation((input) => { + const url = + typeof input === 'string' + ? input + : input instanceof URL + ? input.href + : (input as Request).url; + + if (url.endsWith('/robots.txt')) { + return Promise.resolve(createResponse('User-agent: *\nAllow: /', 200, 'text/plain; charset=utf-8')); + } + + if (url.endsWith('/llms.txt')) { + const status = llmsStatuses[Math.min(llmsIndex, llmsStatuses.length - 1)] ?? 200; + llmsIndex += 1; + return Promise.resolve( + createResponse( + status === 200 ? '# Example site info' : '', + status, + 'text/plain; charset=utf-8' + ) + ); + } + + if ( + url.endsWith('/llms-full.txt') || + url.endsWith('/openapi.json') || + url.endsWith('/sitemap.xml') || + url.includes('security.txt') + ) { + return Promise.resolve(createResponse('', 404)); + } + + const page = mainPages[Math.min(mainPageIndex, mainPages.length - 1)] ?? RICH_HTML; + mainPageIndex += 1; + return Promise.resolve(createResponse(page)); + }); +} describe('runAudit', () => { beforeEach(() => { @@ -7,104 +94,96 @@ describe('runAudit', () => { }); it('should return a complete AXReport with all required fields', async () => { - // Mock all fetch calls to return predictable data - vi.spyOn(globalThis, 'fetch').mockImplementation((input) => { - const url = typeof input === 'string' ? 
input : input instanceof URL ? input.href : (input as Request).url; - - const headers = new Map(); - headers.set('content-type', 'text/html; charset=utf-8'); - - if (url.endsWith('/robots.txt')) { - return Promise.resolve({ - ok: true, - status: 200, - text: () => Promise.resolve('User-agent: *\nAllow: /'), - headers: { forEach: (cb: (v: string, k: string) => void) => headers.forEach((v, k) => cb(v, k)) }, - } as unknown as Response); - } - if (url.endsWith('/llms.txt')) { - return Promise.resolve({ - ok: true, - status: 200, - text: () => Promise.resolve('# Example site info'), - headers: { forEach: (cb: (v: string, k: string) => void) => headers.forEach((v, k) => cb(v, k)) }, - } as unknown as Response); - } - // All other probes return 404 - if (url.endsWith('/llms-full.txt') || - url.endsWith('/openapi.json') || - url.endsWith('/sitemap.xml') || - url.includes('security.txt') - ) { - return Promise.resolve({ - ok: false, - status: 404, - text: () => Promise.resolve(''), - headers: { forEach: (cb: (v: string, k: string) => void) => headers.forEach((v, k) => cb(v, k)) }, - } as unknown as Response); - } - - // Main page - const body = ` - - Example - - - - - -

Welcome

- - `; - - return Promise.resolve({ - ok: true, - status: 200, - text: () => Promise.resolve(body), - headers: { forEach: (cb: (v: string, k: string) => void) => headers.forEach((v, k) => cb(v, k)) }, - } as unknown as Response); - }); + installFetchMock(); const report = await runAudit({ url: 'https://example.com', timeout: 5000 }); - // Verify report structure expect(report.url).toBe('https://example.com'); expect(report.timestamp).toBeDefined(); expect(typeof report.version).toBe('string'); expect(typeof report.score).toBe('number'); expect(report.score).toBeGreaterThanOrEqual(0); expect(report.score).toBeLessThanOrEqual(100); + expect(report.stability).toBeUndefined(); - // Verify site type expect(['api', 'content', 'hybrid', 'unknown']).toContain(report.siteType); - // Verify categories expect(report.categories).toBeInstanceOf(Array); expect(report.categories.length).toBeGreaterThan(0); - for (const cat of report.categories) { - expect(cat.id).toBeDefined(); - expect(cat.title).toBeDefined(); - expect(typeof cat.score).toBe('number'); + for (const category of report.categories) { + expect(category.id).toBeDefined(); + expect(category.title).toBeDefined(); + expect(typeof category.score).toBe('number'); } - // Verify audits expect(typeof report.audits).toBe('object'); expect(Object.keys(report.audits).length).toBe(19); - - // Verify recommendations expect(report.recommendations).toBeInstanceOf(Array); }); it('should handle audits that throw errors gracefully', async () => { - // Mock fetch to reject on all calls vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('Simulated network failure')); const report = await runAudit({ url: 'https://unreachable.test', timeout: 1000 }); - // Even if all gathers fail, the report should still be produced expect(report.url).toBe('https://unreachable.test'); expect(typeof report.score).toBe('number'); expect(report.categories).toBeInstanceOf(Array); expect(typeof report.audits).toBe('object'); }); }); + 
+describe('runRepeatedAudit', () => { + beforeEach(() => { + vi.restoreAllMocks(); + }); + + it('preserves single-run behavior when repeat is 1', async () => { + installFetchMock(); + + const report = await runRepeatedAudit({ url: 'https://example.com', timeout: 5000 }, 1); + + expect(report.stability).toBeUndefined(); + expect(report.score).toBeGreaterThanOrEqual(0); + expect(report.score).toBeLessThanOrEqual(100); + }); + + it('adds stability metrics when repeat is greater than 1', async () => { + installFetchMock({ + mainPages: [RICH_HTML, MINIMAL_HTML, RICH_HTML], + llmsStatuses: [200, 404, 200], + }); + + const report = await runRepeatedAudit({ url: 'https://example.com', timeout: 5000 }, 3); + + expect(report.stability).toBeDefined(); + expect(report.stability?.runs).toBe(3); + expect(report.stability?.scores).toHaveLength(3); + expect(report.stability?.min).toBe(Math.min(...(report.stability?.scores ?? []))); + expect(report.stability?.max).toBe(Math.max(...(report.stability?.scores ?? []))); + expect(report.stability?.delta).toBe((report.stability?.max ?? 0) - (report.stability?.min ?? 
0)); + expect(report.stability?.delta).toBeGreaterThan(0); + expect(report.stability?.variance).toBeGreaterThan(0); + }); + + it('keeps the stability summary in JSON output for repeat runs', async () => { + installFetchMock({ + mainPages: [RICH_HTML, MINIMAL_HTML], + llmsStatuses: [200, 404], + }); + + const report = await runRepeatedAudit({ url: 'https://example.com', timeout: 5000 }, 2); + const parsed = JSON.parse(renderJSON(report)); + + expect(parsed.stability).toMatchObject({ + runs: 2, + scores: expect.any(Array), + min: expect.any(Number), + max: expect.any(Number), + mean: expect.any(Number), + delta: expect.any(Number), + variance: expect.any(Number), + }); + expect(parsed.stability.scores).toHaveLength(2); + }); +}); diff --git a/src/runner.ts b/src/runner.ts index dfdfebb..1039f30 100644 --- a/src/runner.ts +++ b/src/runner.ts @@ -1,4 +1,4 @@ -import type { AXConfig, AXReport, AXCategory, AuditResult } from './types.js'; +import type { AXConfig, AXReport, AXCategory, AuditResult, StabilityResult } from './types.js'; import type { GatherResult } from './gatherers/base-gatherer.js'; import { getCategoriesForSiteType, VERSION } from './config/default.js'; import { classifySiteType } from './classifiers/site-type.js'; @@ -148,3 +148,45 @@ export async function runAudit(config: AXConfig): Promise { recommendations, }; } + +/** + * Run the audit `repeat` times sequentially and return the last run's report with `stability` metrics (min, max, mean, delta, population variance of the per-run scores) attached. + * When `repeat` is exactly 1, returns the plain `runAudit` report without a `stability` field; throws an Error when `repeat` is not a positive integer. 
+ */ +export async function runRepeatedAudit(config: AXConfig, repeat: number): Promise { + if (!Number.isInteger(repeat) || repeat < 1) { + throw new Error('Repeat count must be a positive integer.'); + } + + if (repeat === 1) { + return runAudit(config); + } + + const reports: AXReport[] = []; + for (let i = 0; i < repeat; i++) { + reports.push(await runAudit(config)); + } + + const scores = reports.map((report) => report.score); + const min = Math.min(...scores); + const max = Math.max(...scores); + const rawMean = scores.reduce((sum, score) => sum + score, 0) / scores.length; + const rawVariance = scores.reduce((sum, score) => sum + (score - rawMean) ** 2, 0) / scores.length; + + const stability: StabilityResult = { + runs: repeat, + scores, + min, + max, + mean: roundMetric(rawMean), + delta: max - min, + variance: roundMetric(rawVariance), + }; + + const base = reports[reports.length - 1]!; + return { ...base, stability }; +} + +function roundMetric(value: number): number { + return Math.round(value * 100) / 100; +} diff --git a/src/types.ts b/src/types.ts index 808eaca..7798bda 100644 --- a/src/types.ts +++ b/src/types.ts @@ -46,6 +46,7 @@ export interface AXReport { categories: AXCategory[]; audits: Record; recommendations: Recommendation[]; + stability?: StabilityResult; } export interface Recommendation { @@ -53,3 +54,13 @@ export interface Recommendation { message: string; impact: number; } + +export interface StabilityResult { + runs: number; + scores: number[]; + min: number; + max: number; + mean: number; + delta: number; + variance: number; +}