From df992517fe14114c27ddfc988a655dd0352fa77b Mon Sep 17 00:00:00 2001
From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Thu, 6 Feb 2025 21:51:56 +0800
Subject: [PATCH] fix: update evaluator tests to match latest implementation (#34)

* fix: update evaluator tests to match latest implementation

Co-Authored-By: Han Xiao

* fix: update EvaluationResponse type and add comprehensive tests

Co-Authored-By: Han Xiao

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Han Xiao
---
 src/tools/__tests__/evaluator.test.ts | 64 ++++++++++++++++++++++++---
 src/types.ts                          | 13 ++++++
 2 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/src/tools/__tests__/evaluator.test.ts b/src/tools/__tests__/evaluator.test.ts
index 9372f74d..de36532a 100644
--- a/src/tools/__tests__/evaluator.test.ts
+++ b/src/tools/__tests__/evaluator.test.ts
@@ -26,22 +26,76 @@ describe('evaluateAnswer', () => {
         const { response } = await evaluateAnswer(
           'What is TypeScript?',
           'TypeScript is a strongly typed programming language that builds on JavaScript.',
+          ['definitive'],
           tokenTracker
         );
-        expect(response).toHaveProperty('is_definitive');
-        expect(response).toHaveProperty('reasoning');
+        expect(response).toHaveProperty('pass');
+        expect(response).toHaveProperty('think');
+        expect(response.type).toBe('definitive');
+        expect(response.pass).toBe(true);
+      });
+
+      it('should evaluate answer freshness', async () => {
+        const tokenTracker = new TokenTracker();
+        const { response } = await evaluateAnswer(
+          'What is the latest version of Node.js?',
+          'The latest version of Node.js is 14.0.0, released in April 2020.',
+          ['freshness'],
+          tokenTracker
+        );
+        expect(response).toHaveProperty('pass');
+        expect(response).toHaveProperty('think');
+        expect(response.type).toBe('freshness');
+        expect(response.freshness_analysis).toBeDefined();
+        expect(response.freshness_analysis?.likely_outdated).toBe(true);
+        expect(response.freshness_analysis?.dates_mentioned).toContain('2020-04');
+        expect(response.freshness_analysis?.current_time).toBeDefined();
+        expect(response.pass).toBe(false);
+      });
+
+      it('should evaluate answer plurality', async () => {
+        const tokenTracker = new TokenTracker();
+        const { response } = await evaluateAnswer(
+          'List three programming languages.',
+          'Python is a programming language.',
+          ['plurality'],
+          tokenTracker
+        );
+        expect(response).toHaveProperty('pass');
+        expect(response).toHaveProperty('think');
+        expect(response.type).toBe('plurality');
+        expect(response.plurality_analysis).toBeDefined();
+        expect(response.plurality_analysis?.expects_multiple).toBe(true);
+        expect(response.plurality_analysis?.provides_multiple).toBe(false);
+        expect(response.plurality_analysis?.count_expected).toBe(3);
+        expect(response.plurality_analysis?.count_provided).toBe(1);
+        expect(response.pass).toBe(false);
+      });
+
+      it('should evaluate in order and stop at first failure', async () => {
+        const tokenTracker = new TokenTracker();
+        const { response } = await evaluateAnswer(
+          'List the latest Node.js versions.',
+          'I am not sure about the Node.js versions.',
+          ['definitive', 'freshness', 'plurality'],
+          tokenTracker
+        );
+        expect(response.type).toBe('definitive');
+        expect(response.pass).toBe(false);
+        expect(response.freshness_analysis).toBeUndefined();
+        expect(response.plurality_analysis).toBeUndefined();
       });
 
       it('should track token usage', async () => {
         const tokenTracker = new TokenTracker();
         const spy = jest.spyOn(tokenTracker, 'trackUsage');
-        const { tokens } = await evaluateAnswer(
+        await evaluateAnswer(
           'What is TypeScript?',
           'TypeScript is a strongly typed programming language that builds on JavaScript.',
+          ['definitive', 'freshness', 'plurality'],
           tokenTracker
         );
-        expect(spy).toHaveBeenCalledWith('evaluator', tokens);
-        expect(tokens).toBeGreaterThan(0);
+        expect(spy).toHaveBeenCalledWith('evaluator', expect.any(Number));
       });
     });
   });
diff --git a/src/types.ts b/src/types.ts
index a93ab5af..fe3d8b1d 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -87,6 +87,19 @@ export interface ReadResponse {
 export type EvaluationResponse = {
   pass: boolean;
   think: string;
+  type?: 'definitive' | 'freshness' | 'plurality';
+  freshness_analysis?: {
+    likely_outdated: boolean;
+    dates_mentioned: string[];
+    current_time: string;
+    max_age_days?: number;
+  };
+  plurality_analysis?: {
+    expects_multiple: boolean;
+    provides_multiple: boolean;
+    count_expected?: number;
+    count_provided: number;
+  };
 };
 
 export type ErrorAnalysisResponse = {