diff --git a/.gitignore b/.gitignore index 741acc764b..daf13572a8 100644 --- a/.gitignore +++ b/.gitignore @@ -74,6 +74,8 @@ next-env.d.ts # Eval results eval/whiteboard-layout/results/ eval/outline-language/results/ +eval/orchestration/results/ +eval/orchestration/results-answering/ # e2e screenshot artifacts e2e/screenshots/ diff --git a/components/agent/agent-bar.tsx b/components/agent/agent-bar.tsx index ade89c7d57..f0b947a230 100644 --- a/components/agent/agent-bar.tsx +++ b/components/agent/agent-bar.tsx @@ -20,9 +20,6 @@ import { Volume2, VolumeX, Loader2, - MessageSquare, - Minus, - Plus, Search, } from 'lucide-react'; import { Tooltip, TooltipContent, TooltipTrigger } from '@/components/ui/tooltip'; @@ -615,8 +612,6 @@ export function AgentBar() { const { listAgents } = useAgentRegistry(); const selectedAgentIds = useSettingsStore((s) => s.selectedAgentIds); const setSelectedAgentIds = useSettingsStore((s) => s.setSelectedAgentIds); - const maxTurns = useSettingsStore((s) => s.maxTurns); - const setMaxTurns = useSettingsStore((s) => s.setMaxTurns); const agentMode = useSettingsStore((s) => s.agentMode); const setAgentMode = useSettingsStore((s) => s.setAgentMode); const ttsProvidersConfig = useSettingsStore((s) => s.ttsProvidersConfig); @@ -937,57 +932,6 @@ export function AgentBar() { )} - - {/* Max turns — compact stepper */} -
- - - {t('settings.maxTurns')} - -
- - { - const raw = e.target.value.replace(/\D/g, ''); - if (!raw) { - setMaxTurns(''); - return; - } - const v = Math.min(20, Math.max(1, parseInt(raw))); - setMaxTurns(String(v)); - }} - onBlur={() => { - if (!maxTurns || parseInt(maxTurns) < 1) setMaxTurns('1'); - }} - onClick={(e) => e.stopPropagation()} - className="w-5 h-5 text-[11px] font-medium tabular-nums text-center bg-transparent outline-none border-none" - /> - -
-
)} diff --git a/components/chat/session-list.tsx b/components/chat/session-list.tsx index 526443efad..350dc60d19 100644 --- a/components/chat/session-list.tsx +++ b/components/chat/session-list.tsx @@ -3,7 +3,7 @@ import type { ChatSession, SessionStatus } from '@/lib/types/chat'; import { cn } from '@/lib/utils'; import { useI18n } from '@/lib/hooks/use-i18n'; -import { ChevronDown, Circle, CheckCircle, Clock } from 'lucide-react'; +import { ChevronDown, Circle, CheckCircle, Clock, AlertCircle } from 'lucide-react'; import { motion, AnimatePresence } from 'motion/react'; import { ChatSessionComponent } from './chat-session'; @@ -32,6 +32,8 @@ function getStatusIcon(status: SessionStatus) { return ; case 'completed': return ; + case 'error': + return ; case 'idle': default: return ; diff --git a/components/chat/use-chat-sessions.ts b/components/chat/use-chat-sessions.ts index 917ba66955..c6911601c5 100644 --- a/components/chat/use-chat-sessions.ts +++ b/components/chat/use-chat-sessions.ts @@ -170,6 +170,7 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { s.id === sessionId ? { ...s, + status: 'error' as SessionStatus, updatedAt: now, messages: [ ...s.messages, @@ -456,8 +457,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { controller: AbortController, sessionType: SessionType, ): Promise => { - const settingsState = useSettingsStore.getState(); - // Attach full configs for generated (non-default) agents so the server can use them. // The server-side registry only has default agents; generated agents exist only client-side. const generatedConfigs = requestTemplate.config.agentIds @@ -469,11 +468,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { requestTemplate.config.agentConfigs = generatedConfigs; } - const defaultMaxTurns = requestTemplate.config.agentIds.length <= 1 ? 1 : 10; - const maxTurns = settingsState.maxTurns - ? 
parseInt(settingsState.maxTurns, 10) || defaultMaxTurns - : defaultMaxTurns; - // Per-iteration buffer reference — set in onEvent, used in onIterationEnd let currentBuffer: StreamBuffer | null = null; // Tracks agent_start messageId so text_delta/action events with a missing @@ -607,28 +601,40 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { }, }, controller.signal, - maxTurns, ); - // Handle loop completion (UI-specific) + // Handle loop completion (UI-specific). Map each outcome.reason to a + // distinct session state — don't conflate error paths with completion. if (!controller.signal.aborted) { - if (outcome.reason !== 'cue_user') { - setSessions((prev) => - prev.map((s) => - s.id === sessionId - ? { - ...s, - status: 'completed' as SessionStatus, - updatedAt: Date.now(), - } - : s, - ), - ); - onStopSessionRef.current?.(); + switch (outcome.reason) { + case 'cue_user': + // Session stays active; UI waits for the next user message. + break; + case 'end': + setSessions((prev) => + prev.map((s) => + s.id === sessionId + ? { ...s, status: 'completed' as SessionStatus, updatedAt: Date.now() } + : s, + ), + ); + onStopSessionRef.current?.(); + break; + case 'empty_turns': + clearLiveSessionAfterError(sessionId, t('chat.error.emptyAgentResponses')); + onStopSessionRef.current?.(); + break; + case 'no_done': + clearLiveSessionAfterError(sessionId, t('chat.error.streamInterrupted')); + onStopSessionRef.current?.(); + break; + case 'aborted': + // Already handled elsewhere via abort signal. 
+ break; } } }, - [createBufferForSession], + [createBufferForSession, clearLiveSessionAfterError, t], ); /** @@ -646,8 +652,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { messages: [], config: { agentIds: ['default-1'], - maxTurns: 0, // Not used for runtime — frontend loop manages maxTurns - currentTurn: 0, defaultAgentId: 'default-1', }, toolCalls: [], @@ -1070,8 +1074,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { messages: [userMessage], config: { agentIds, - maxTurns: 0, // Not used for runtime — frontend loop manages maxTurns - currentTurn: 0, defaultAgentId: agentIds[0], }, toolCalls: [], @@ -1208,8 +1210,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { messages: [], config: { agentIds, - maxTurns: 0, // Not used for runtime — frontend loop manages maxTurns - currentTurn: 0, triggerAgentId: agentId, }, toolCalls: [], @@ -1370,8 +1370,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { messages: [lectureMessage], config: { agentIds: ['default-1'], - maxTurns: 0, - currentTurn: 0, }, toolCalls: [], pendingToolCalls: [], diff --git a/components/settings/agent-settings.tsx b/components/settings/agent-settings.tsx index ad0c9aa8ae..26a21e993a 100644 --- a/components/settings/agent-settings.tsx +++ b/components/settings/agent-settings.tsx @@ -1,7 +1,6 @@ 'use client'; import { Label } from '@/components/ui/label'; -import { Input } from '@/components/ui/input'; import { Checkbox } from '@/components/ui/checkbox'; import { AlertCircle, User, Users, Sparkles, Info } from 'lucide-react'; import { cn } from '@/lib/utils'; @@ -20,20 +19,16 @@ interface Agent { interface AgentSettingsProps { agents: Agent[]; selectedAgentIds: string[]; - maxTurns: string; agentMode: 'preset' | 'auto'; onToggleAgent: (agentId: string) => void; - onMaxTurnsChange: (value: string) => void; onAgentModeChange: (mode: 'preset' | 'auto') => void; } export function AgentSettings({ 
agents, selectedAgentIds, - maxTurns, agentMode, onToggleAgent, - onMaxTurnsChange, onAgentModeChange, }: AgentSettingsProps) { const { t } = useI18n(); @@ -165,22 +160,6 @@ export function AgentSettings({ )} - - {/* Max turns config - only show for multi-agent */} - {selectedAgentIds.length > 1 && ( -
- -

{t('settings.maxTurnsDesc')}

- onMaxTurnsChange(e.target.value)} - className="w-24" - /> -
- )} ) : ( <> diff --git a/eval/orchestration/answering-runner.ts b/eval/orchestration/answering-runner.ts new file mode 100644 index 0000000000..5f3d85dd6a --- /dev/null +++ b/eval/orchestration/answering-runner.ts @@ -0,0 +1,408 @@ +/** + * Director Question-Answering Eval (#598 / #511 follow-up) + * + * Tests whether the director routes correctly when the conversation contains + * an unanswered user question. The bug observed in production: when agents + * have drifted off-topic — whether the user has expressed frustration yet + * or not — the director keeps picking peer agents for "variety" instead of + * routing to the teacher to actually answer the literal question. + * + * Scenarios cover both shapes: + * - first-turn drift, no frustration yet (the root case) + * - escalated frustration after multiple complaints (the recovery case) + * + * Per-decision classification (deterministic, no LLM judge): + * - TEACHER → ✓ correct (teacher answers, or asks a clarifying question + * when the user's message is too vague) + * - USER → ✗ wrong (cue_user makes no agent speak — the user faces + * dead air; the teacher should ask the clarifying question) + * - OTHER_AGENT → ✗ wrong (peer-agent "variety" routing) + * - END → ✗ wrong + * + * A/B: + * - baseline : current director template with rule 13 stripped + * - with_rule : current director template as-shipped (rule 13 in place) + * + * Pass criterion: with_rule.correctRate ≥ EVAL_PASS_THRESHOLD (default 0.7). + * The pre-vs-post Δ is reported as informational only — scenarios where the + * baseline already routes correctly shouldn't fail just because there is no + * room to lift. + * + * Required env: + * EVAL_DIRECTOR_MODEL + * + * Optional env: + * EVAL_SAMPLES Samples per (scenario, variant). Default 5. + * EVAL_PASS_THRESHOLD Min with_rule correct rate per scenario. Default 0.7. + * EVAL_SCENARIO Filter to a single scenario by case_id. 
+ * + * Output: eval/orchestration/results-answering///report.md + */ + +import fs from 'fs'; +import path from 'path'; +import { fileURLToPath } from 'url'; +import { callLLM } from '@/lib/ai/llm'; +import { parseDirectorDecision } from '@/lib/orchestration/director-prompt'; +import { + summarizeConversation, + type OpenAIMessage, +} from '@/lib/orchestration/summarizers/conversation-summary'; +import { + processSnippets, + processConditionalBlocks, + interpolateVariables, +} from '@/lib/prompts/loader'; +import { resolveEvalModel } from '../shared/resolve-model'; +import { createRunDir } from '../shared/run-dir'; +import type { AgentTurnSummary } from '@/lib/orchestration/types'; +import type { ScenarioAgent } from './types'; + +const OUTPUT_DIR = 'eval/orchestration/results-answering'; + +// ==================== Types ==================== + +interface AnsweringScenario { + case_id: string; + description: string; + agents: ScenarioAgent[]; + teacherAgentId: string; + messages: OpenAIMessage[]; + agentResponses: AgentTurnSummary[]; + turnCount: number; + whiteboardOpen?: boolean; +} + +type Variant = 'baseline' | 'with_rule'; +type DecisionClass = 'USER' | 'TEACHER' | 'OTHER_AGENT' | 'END' | 'ERROR'; + +interface SampleResult { + variant: Variant; + raw: string; + classification: DecisionClass; + rawAgentId: string | null; + error?: string; +} + +interface ScenarioResult { + case_id: string; + description: string; + samples: number; + baseline: { samples: SampleResult[]; rates: Record; correctRate: number }; + withRule: { samples: SampleResult[]; rates: Record; correctRate: number }; + delta: number; + passes: boolean; +} + +// ==================== Prompt building ==================== + +function readDirectorTemplate(): string { + const p = path.join(process.cwd(), 'lib', 'prompts', 'templates', 'director', 'system.md'); + return fs.readFileSync(p, 'utf-8').trim(); +} + +/** + * Rule 13 was injected directly into director/system.md. 
To A/B against a + * pre-rule baseline, strip rule 13 (and its indented continuation block) out + * of the current template. + */ +function withoutAnsweringRule(template: string): string { + // Match rule 13 by its number (heading text is reworded often) up to the + // next blank-line + section header. Decoupled from the heading wording. + const stripped = template.replace(/^13\. \*\*[\s\S]*?(?=\n\n# )/m, ''); + if (stripped === template) { + throw new Error( + 'answering-runner: rule 13 not found in director template; eval baseline cannot be constructed', + ); + } + return stripped.replace(/\n{3,}/g, '\n\n'); +} + +function buildPromptFromTemplate( + template: string, + scenario: AnsweringScenario, + conversationSummary: string, +): string { + const agentList = scenario.agents + .map((a) => `- id: "${a.id}", name: "${a.name}", role: ${a.role}, priority: ${a.priority}`) + .join('\n'); + + const respondedList = + scenario.agentResponses.length > 0 + ? scenario.agentResponses + .map( + (r) => + `- ${r.agentName} (${r.agentId}): "${r.contentPreview}" [${r.actionCount} actions]`, + ) + .join('\n') + : 'None yet.'; + + const rule1 = + "1. The teacher (role: teacher, highest priority) should usually speak first to address the user's question or topic."; + + const vars: Record = { + agentList, + respondedList, + conversationSummary, + discussionSection: '', + whiteboardSection: '', + studentProfileSection: '', + rule1, + turnCountPlusOne: scenario.turnCount + 1, + whiteboardOpenText: scenario.whiteboardOpen + ? 
'OPEN (slide canvas is hidden — spotlight/laser will not work)' + : 'CLOSED (slide canvas is visible)', + }; + + const withSnippets = processSnippets(template); + const withConditionals = processConditionalBlocks(withSnippets, vars); + return interpolateVariables(withConditionals, vars); +} + +function buildVariants(scenario: AnsweringScenario): { baseline: string; with_rule: string } { + const current = readDirectorTemplate(); + const summary = summarizeConversation(scenario.messages); + return { + baseline: buildPromptFromTemplate(withoutAnsweringRule(current), scenario, summary), + with_rule: buildPromptFromTemplate(current, scenario, summary), + }; +} + +// ==================== Classifier ==================== + +function classify( + raw: string, + scenario: AnsweringScenario, +): { + classification: DecisionClass; + rawAgentId: string | null; +} { + const parsed = parseDirectorDecision(raw); + if (parsed.shouldEnd || !parsed.nextAgentId) { + return { classification: 'END', rawAgentId: null }; + } + if (parsed.nextAgentId === 'USER') { + return { classification: 'USER', rawAgentId: 'USER' }; + } + if (parsed.nextAgentId === scenario.teacherAgentId) { + return { classification: 'TEACHER', rawAgentId: parsed.nextAgentId }; + } + return { classification: 'OTHER_AGENT', rawAgentId: parsed.nextAgentId }; +} + +function emptyRates(): Record { + return { USER: 0, TEACHER: 0, OTHER_AGENT: 0, END: 0, ERROR: 0 }; +} + +function computeRates(samples: SampleResult[]): { + rates: Record; + correctRate: number; +} { + const rates = emptyRates(); + const usable = samples.filter((s) => !s.error); + for (const s of usable) rates[s.classification]++; + const total = usable.length || 1; + for (const k of Object.keys(rates) as DecisionClass[]) { + rates[k] = rates[k] / total; + } + rates.ERROR = (samples.length - usable.length) / samples.length; + // Only TEACHER is correct: the teacher answers, or asks a clarifying question + // for vague input. 
USER cue is dead air (no agent speaks); peer/END are wrong. + const correctRate = rates.TEACHER; + return { rates, correctRate }; +} + +// ==================== Sampling ==================== + +async function sampleVariant( + scenario: AnsweringScenario, + variant: Variant, + systemPrompt: string, + model: Awaited>['model'], + samples: number, +): Promise { + const tasks = Array.from({ length: samples }, async (): Promise => { + try { + const result = await callLLM( + { + model, + messages: [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: 'Decide which agent should speak next.' }, + ], + }, + 'eval-orchestration-answering', + ); + const raw = result.text; + const { classification, rawAgentId } = classify(raw, scenario); + return { variant, raw, classification, rawAgentId }; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + return { + variant, + raw: '', + classification: 'ERROR', + rawAgentId: null, + error: msg, + }; + } + }); + return Promise.all(tasks); +} + +// ==================== Reporting ==================== + +function pct(x: number): string { + return `${Math.round(x * 100)}%`; +} + +function writeReport( + runDir: string, + results: ScenarioResult[], + modelStr: string, + samples: number, + threshold: number, +): string { + const lines: string[] = []; + const overallPass = results.every((r) => r.passes); + const meanBaseline = results.reduce((acc, r) => acc + r.baseline.correctRate, 0) / results.length; + const meanWithRule = results.reduce((acc, r) => acc + r.withRule.correctRate, 0) / results.length; + + lines.push(`# Director Question-Answering Eval`, ``); + lines.push(`- **Date**: ${new Date().toISOString()}`); + lines.push(`- **Model**: ${modelStr}`); + lines.push(`- **Samples per (scenario, variant)**: ${samples}`); + lines.push(`- **with_rule correct-rate threshold**: ${pct(threshold)}`); + lines.push(`- **Δ (pre vs post)**: informational — PASS depends only on with_rule rate`); + 
lines.push(``); + lines.push(`## Aggregate`); + lines.push(``); + lines.push(`| Variant | Mean correct rate (TEACHER) |`); + lines.push(`|---|---|`); + lines.push(`| baseline | ${pct(meanBaseline)} |`); + lines.push(`| with_rule | ${pct(meanWithRule)} |`); + lines.push(`| Δ | ${pct(meanWithRule - meanBaseline)} |`); + lines.push(``); + lines.push(`Overall verdict: **${overallPass ? 'PASS' : 'FAIL'}**`); + lines.push(``); + + lines.push(`## Per scenario`); + lines.push(``); + lines.push( + `| # | Scenario | Baseline USER% TEACHER% OTHER% END% | with_rule USER% TEACHER% OTHER% END% | Δ correct | pass? |`, + ); + lines.push(`|---|---|---|---|---|---|`); + results.forEach((r, i) => { + const b = r.baseline.rates; + const w = r.withRule.rates; + const bStr = `${pct(b.USER)}/${pct(b.TEACHER)}/${pct(b.OTHER_AGENT)}/${pct(b.END)}`; + const wStr = `${pct(w.USER)}/${pct(w.TEACHER)}/${pct(w.OTHER_AGENT)}/${pct(w.END)}`; + lines.push( + `| ${i + 1} | ${r.case_id} | ${bStr} | ${wStr} | ${pct(r.delta)} | ${r.passes ? '✓' : '✗'} |`, + ); + }); + lines.push(``); + + lines.push(`## Detail`); + for (const r of results) { + lines.push(``, `### ${r.case_id} ${r.passes ? '✓' : '✗'}`, ``); + lines.push(`- ${r.description}`); + lines.push( + `- Baseline correct: ${pct(r.baseline.correctRate)}; with_rule correct: ${pct(r.withRule.correctRate)}; Δ: ${pct(r.delta)}`, + ); + lines.push(``); + lines.push(`
baseline samples`, ``); + for (const s of r.baseline.samples) { + const label = s.error + ? `ERROR: ${s.error}` + : `${s.classification}${s.rawAgentId && s.classification === 'OTHER_AGENT' ? ` (${s.rawAgentId})` : ''}`; + lines.push(`- ${label}`); + } + lines.push(``, `
`, ``); + lines.push(`
with_rule samples`, ``); + for (const s of r.withRule.samples) { + const label = s.error + ? `ERROR: ${s.error}` + : `${s.classification}${s.rawAgentId && s.classification === 'OTHER_AGENT' ? ` (${s.rawAgentId})` : ''}`; + lines.push(`- ${label}`); + } + lines.push(``, `
`, ``); + } + + const reportPath = path.join(runDir, 'report.md'); + fs.writeFileSync(reportPath, lines.join('\n')); + return reportPath; +} + +// ==================== Main ==================== + +function getCurrentDir(): string { + return typeof __dirname !== 'undefined' + ? __dirname + : path.dirname(fileURLToPath(import.meta.url)); +} + +function loadScenarios(): AnsweringScenario[] { + const p = path.join(getCurrentDir(), 'scenarios/answering.json'); + const scenarios = JSON.parse(fs.readFileSync(p, 'utf-8')) as AnsweringScenario[]; + const filter = process.env.EVAL_SCENARIO; + return filter ? scenarios.filter((s) => s.case_id === filter) : scenarios; +} + +async function main() { + const modelStr = process.env.EVAL_DIRECTOR_MODEL || process.env.DEFAULT_MODEL; + if (!modelStr) { + console.error( + 'Error: EVAL_DIRECTOR_MODEL must be set. Example: EVAL_DIRECTOR_MODEL=google:gemini-3-flash-preview', + ); + process.exit(1); + } + const samples = Number(process.env.EVAL_SAMPLES || '5'); + const threshold = Number(process.env.EVAL_PASS_THRESHOLD || '0.7'); + + console.log('=== Director Question-Answering Eval ==='); + console.log(`Model: ${modelStr} | Samples/variant: ${samples} | pass threshold: ${threshold}`); + + const { model } = await resolveEvalModel('EVAL_DIRECTOR_MODEL', process.env.DEFAULT_MODEL); + const scenarios = loadScenarios(); + console.log(`Loaded ${scenarios.length} scenario(s)`); + const runDir = createRunDir(OUTPUT_DIR, modelStr); + console.log(`Output: ${runDir}`); + + const results: ScenarioResult[] = []; + for (const sc of scenarios) { + process.stdout.write(` - ${sc.case_id} ... 
`); + const variants = buildVariants(sc); + const [bs, ws] = await Promise.all([ + sampleVariant(sc, 'baseline', variants.baseline, model, samples), + sampleVariant(sc, 'with_rule', variants.with_rule, model, samples), + ]); + const bAgg = computeRates(bs); + const wAgg = computeRates(ws); + const lift = wAgg.correctRate - bAgg.correctRate; + const passes = wAgg.correctRate >= threshold; + results.push({ + case_id: sc.case_id, + description: sc.description, + samples, + baseline: { samples: bs, rates: bAgg.rates, correctRate: bAgg.correctRate }, + withRule: { samples: ws, rates: wAgg.rates, correctRate: wAgg.correctRate }, + delta: lift, + passes, + }); + console.log( + `baseline=${pct(bAgg.correctRate)} with_rule=${pct(wAgg.correctRate)} Δ=${pct(lift)} ${passes ? 'PASS' : 'FAIL'}`, + ); + } + + const reportPath = writeReport(runDir, results, modelStr, samples, threshold); + const overallPass = results.every((r) => r.passes); + console.log(`\nReport: ${reportPath}`); + console.log(`Verdict: ${overallPass ? 'PASS' : 'FAIL'}`); + process.exit(overallPass ? 0 : 1); +} + +main().catch((err) => { + console.error('Fatal error:', err); + process.exit(1); +}); diff --git a/eval/orchestration/judge.ts b/eval/orchestration/judge.ts new file mode 100644 index 0000000000..db7527b4d0 --- /dev/null +++ b/eval/orchestration/judge.ts @@ -0,0 +1,35 @@ +/** + * Deterministic verdict for premature-END scenarios. + * + * The bug we are guarding against is "director picks END while a student + * question is unresolved". For each sample, the only thing that matters is + * whether the director chose END. No LLM-as-judge here — END/not-END is + * binary and reading parseDirectorDecision is sufficient. 
+ */ + +import { parseDirectorDecision } from '@/lib/orchestration/director-prompt'; + +export interface ParsedSample { + decision: 'END' | 'USER' | string; + isEnd: boolean; +} + +export function classifyDecision(raw: string): ParsedSample { + const parsed = parseDirectorDecision(raw); + if (parsed.shouldEnd || !parsed.nextAgentId) { + return { decision: 'END', isEnd: true }; + } + return { decision: parsed.nextAgentId, isEnd: false }; +} + +/** + * END rate over non-errored samples. Errored samples are excluded so API + * failures (e.g. provider 'Forbidden') don't masquerade as deterministic END + * behavior. Returns 0 if no samples are usable. + */ +export function endRate(samples: { isEnd: boolean; error?: string }[]): number { + const usable = samples.filter((s) => !s.error); + if (usable.length === 0) return 0; + const ends = usable.filter((s) => s.isEnd).length; + return ends / usable.length; +} diff --git a/eval/orchestration/prompt-variants.ts b/eval/orchestration/prompt-variants.ts new file mode 100644 index 0000000000..90b07175d8 --- /dev/null +++ b/eval/orchestration/prompt-variants.ts @@ -0,0 +1,182 @@ +/** + * Build director system prompts for both the "post-fix" (current main) and + * "pre-fix" (rules 10/11/12 removed) variants, so the eval can A/B them on + * the same conversation context. + * + * Rules 10/11/12 are the prompt-layer guardrails added by #554. The pre-fix + * variant mimics main^ by dropping them from the # Rules section. + * + * We avoid the public `buildDirectorPrompt()` because it always loads the + * current template. Here we read the template directly, optionally edit it, + * then run the same processSnippets → interpolateVariables pipeline. 
+ */ + +import fs from 'fs'; +import path from 'path'; +import { + processSnippets, + processConditionalBlocks, + interpolateVariables, +} from '@/lib/prompts/loader'; +import type { OpenAIMessage } from '@/lib/orchestration/summarizers/conversation-summary'; +import { summarizeConversation } from '@/lib/orchestration/summarizers/conversation-summary'; +import type { ScenarioAgent } from './types'; +import type { AgentTurnSummary } from '@/lib/orchestration/types'; + +/** Rule numbers introduced by #554 that the pre-fix variant must strip. */ +const FIX_RULE_NUMBERS = [10, 11, 12] as const; + +function readDirectorTemplate(): string { + const p = path.join(process.cwd(), 'lib', 'prompts', 'templates', 'director', 'system.md'); + return fs.readFileSync(p, 'utf-8').trim(); +} + +/** + * Strip rules 10/11/12 from the # Rules section. Each rule is a single line + * in the current template; we match by leading `^(10|11|12)\.\s` and drop + * the whole line. Throws if any expected rule is missing so a template + * rewrite forces us to revisit this eval. 
+ */ +export function stripFixRules(template: string): string { + const lines = template.split('\n'); + const kept: string[] = []; + const dropped = new Set(); + for (const line of lines) { + const m = line.match(/^(\d+)\.\s/); + if (m) { + const n = Number(m[1]); + if ((FIX_RULE_NUMBERS as readonly number[]).includes(n)) { + dropped.add(n); + continue; + } + } + kept.push(line); + } + for (const n of FIX_RULE_NUMBERS) { + if (!dropped.has(n)) { + throw new Error( + `prompt-variants: expected rule ${n} to exist in director/system.md; template may have been rewritten — update FIX_RULE_NUMBERS or this eval.`, + ); + } + } + return kept.join('\n'); +} + +export interface BuildArgs { + agents: ScenarioAgent[]; + messages: OpenAIMessage[]; + agentResponses: AgentTurnSummary[]; + turnCount: number; + discussionContext?: { topic: string; prompt?: string } | null; + triggerAgentId?: string | null; + userProfile?: { nickname?: string; bio?: string }; + whiteboardOpen?: boolean; +} + +/** + * Pre-#554 summarizeConversation: labels every role:'user' as [User] and + * role:'assistant' as [Assistant], with no [senderName]: prefix stripping. + * Used by the pre-fix variant so the eval A/B reflects both halves of #554 + * (the role-aware summary AND the new prompt rules), not just the rules. + */ +function summarizeConversationPreFix( + messages: OpenAIMessage[], + maxMessages = 10, + maxContentLength = 200, +): string { + if (messages.length === 0) return 'No conversation history yet.'; + const recent = messages.slice(-maxMessages); + const lines = recent.map((msg) => { + const roleLabel = + msg.role === 'user' ? 'User' : msg.role === 'assistant' ? 'Assistant' : 'System'; + const content = + msg.content.length > maxContentLength + ? msg.content.slice(0, maxContentLength) + '...' 
+ : msg.content; + return `[${roleLabel}] ${content}`; + }); + return lines.join('\n'); +} + +/** + * Mirrors lib/orchestration/director-prompt.ts `buildDirectorPrompt()` shape + * but lets us inject a pre-stripped template. Kept in sync with that file — + * if you change variable names there, change them here. + */ +function buildPromptFromTemplate( + template: string, + args: BuildArgs, + conversationSummary: string, +): string { + const { + agents, + agentResponses, + turnCount, + discussionContext, + triggerAgentId, + userProfile, + whiteboardOpen, + } = args; + + const agentList = agents + .map((a) => `- id: "${a.id}", name: "${a.name}", role: ${a.role}, priority: ${a.priority}`) + .join('\n'); + + const respondedList = + agentResponses.length > 0 + ? agentResponses + .map( + (r) => + `- ${r.agentName} (${r.agentId}): "${r.contentPreview}" [${r.actionCount} actions]`, + ) + .join('\n') + : 'None yet.'; + + const isDiscussion = !!discussionContext; + const discussionSection = isDiscussion + ? `\n# Discussion Mode\nTopic: "${discussionContext!.topic}"${discussionContext!.prompt ? `\nPrompt: "${discussionContext!.prompt}"` : ''}${triggerAgentId ? `\nInitiator: "${triggerAgentId}"` : ''}\nThis is a student-initiated discussion, not a Q&A session.\n` + : ''; + + const rule1 = isDiscussion + ? `1. The discussion initiator${triggerAgentId ? ` ("${triggerAgentId}")` : ''} should speak first to kick off the topic. Then the teacher responds to guide the discussion. After that, other students may add their perspectives.` + : "1. The teacher (role: teacher, highest priority) should usually speak first to address the user's question or topic."; + + const studentProfileSection = + userProfile?.nickname || userProfile?.bio + ? `\n# Student Profile\nStudent name: ${userProfile.nickname || 'Unknown'}\n${userProfile.bio ? 
`Background: ${userProfile.bio}` : ''}\n` + : ''; + + const vars: Record = { + agentList, + respondedList, + conversationSummary, + discussionSection, + whiteboardSection: '', + studentProfileSection, + rule1, + turnCountPlusOne: turnCount + 1, + whiteboardOpenText: whiteboardOpen + ? 'OPEN (slide canvas is hidden — spotlight/laser will not work)' + : 'CLOSED (slide canvas is visible)', + }; + + const withSnippets = processSnippets(template); + const withConditionals = processConditionalBlocks(withSnippets, vars); + return interpolateVariables(withConditionals, vars); +} + +/** + * Build both variants. The pre-fix variant uses both the old summary labels + * ([User]/[Assistant]) AND the system.md without rules 10/11/12 — together + * those are the full state of main^ relative to #554. + */ +export function buildVariants(args: BuildArgs): { preFix: string; postFix: string } { + const post = readDirectorTemplate(); + const pre = stripFixRules(post); + const postSummary = summarizeConversation(args.messages); + const preSummary = summarizeConversationPreFix(args.messages); + return { + preFix: buildPromptFromTemplate(pre, args, preSummary), + postFix: buildPromptFromTemplate(post, args, postSummary), + }; +} diff --git a/eval/orchestration/reporter.ts b/eval/orchestration/reporter.ts new file mode 100644 index 0000000000..5365c6340b --- /dev/null +++ b/eval/orchestration/reporter.ts @@ -0,0 +1,88 @@ +import { writeFileSync } from 'fs'; +import { join } from 'path'; +import { renderHeader, renderSummaryTable } from '../shared/markdown-report'; +import type { EvalReport } from './types'; + +function pct(rate: number): string { + return `${Math.round(rate * 100)}%`; +} + +function countErrors(samples: { error?: string }[]): number { + return samples.filter((s) => s.error).length; +} + +/** + * Write `report.md` summarising pre-fix vs post-fix END rates per scenario. + * Returns the absolute path of the written report. 
+ */ +export function writeReport(runDir: string, report: EvalReport): string { + const lines: string[] = []; + lines.push( + ...renderHeader({ + title: 'Director Premature-END Regression Eval', + timestamp: new Date().toISOString(), + model: report.model, + extra: { + 'Samples per variant': report.samplesPerVariant, + 'Post-fix END threshold (regression guard)': pct(report.postFixEndThreshold), + 'Discrimination threshold (Δ END-rate, informational)': pct(report.thresholdDelta), + Method: + 'A/B director prompt + summary: post-fix = current main; pre-fix = pre-#554 [User]/[Assistant] summary labels AND system.md without rules 10/11/12', + 'Post-fix regression guard (must hold)': report.allPostFixPass ? 'PASS' : 'FAIL', + 'Any scenario discriminates? (informational)': report.anyDiscriminates ? 'YES' : 'NO', + }, + }), + ); + + lines.push(`## Detail`, ``); + for (const r of report.results) { + const pass = r.postFixPasses ? 'PASS' : '**FAIL**'; + const disc = r.discriminates ? ' (Δ ≥ threshold)' : ''; + lines.push(`### ${pass}${disc} ${r.case_id}`, ``); + lines.push(`- **Description**: ${r.description}`); + const preErr = countErrors(r.preFix.samples); + const postErr = countErrors(r.postFix.samples); + lines.push(`- **Samples per variant**: ${r.samples} (rates exclude errored samples)`); + lines.push( + `- **Pre-fix END rate**: ${pct(r.preFix.endRate)}${preErr ? ` — ${preErr} error(s)` : ''}`, + ); + lines.push( + `- **Post-fix END rate**: ${pct(r.postFix.endRate)}${postErr ? ` — ${postErr} error(s)` : ''}`, + ); + lines.push(`- **Δ (pre − post)**: ${pct(r.delta)}`); + lines.push(``); + lines.push(`
Pre-fix raw decisions`, ``); + for (const s of r.preFix.samples) { + const label = s.error ? `ERROR: ${s.error}` : s.isEnd ? '**END**' : s.decision; + lines.push(`- ${label}`); + } + lines.push(``, `
`, ``); + lines.push(`
Post-fix raw decisions`, ``); + for (const s of r.postFix.samples) { + const label = s.error ? `ERROR: ${s.error}` : s.isEnd ? '**END**' : s.decision; + lines.push(`- ${label}`); + } + lines.push(``, `
`, ``); + } + + lines.push(`## Summary`, ``); + const rows: string[][] = report.results.map((r, i) => [ + String(i + 1), + r.case_id, + pct(r.preFix.endRate), + pct(r.postFix.endRate), + pct(r.delta), + r.postFixPasses ? 'PASS' : 'FAIL', + r.discriminates ? 'YES' : 'no', + ]); + lines.push( + ...renderSummaryTable( + ['#', 'Scenario', 'Pre-fix END', 'Post-fix END', 'Δ', 'Regression guard', 'Discriminates'], + rows, + ), + ); + + const reportPath = join(runDir, 'report.md'); + writeFileSync(reportPath, lines.join('\n')); + return reportPath; +} diff --git a/eval/orchestration/runner.ts b/eval/orchestration/runner.ts new file mode 100644 index 0000000000..8b79d8a841 --- /dev/null +++ b/eval/orchestration/runner.ts @@ -0,0 +1,193 @@ +/** + * Orchestration Premature-END Regression Eval + * + * For each scenario, builds the director system prompt twice: + * - "pre-fix" : pre-#554 [User]/[Assistant] summary labels and current director/system.md with rules 10/11/12 removed + * - "post-fix" : current director/system.md as-shipped + * Calls the LLM N times per variant, parses each decision, and reports the + * END rate for both. A scenario "discriminates" when (pre − post) ≥ delta. + * + * Required env: + * EVAL_DIRECTOR_MODEL Model under test (or DEFAULT_MODEL fallback) + * + * Optional env: + * EVAL_SAMPLES Samples per (scenario, variant). Default 5. + * EVAL_DELTA Discrimination threshold for pre-vs-post Δ (0..1). Default 0.3. + * EVAL_END_THRESHOLD Max acceptable post-fix END rate per scenario (0..1). Default 0.2. + * EVAL_SCENARIO Filter to a single scenario by case_id. 
+ * + * Usage: + * EVAL_DIRECTOR_MODEL=openai:gpt-4.1-mini pnpm eval:orchestration + * + * Output: eval/orchestration/results///report.md + * + * Exit code: + * 0 — every scenario's post-fix END rate is at or below EVAL_END_THRESHOLD + * (the regression guard holds for this model) + * 1 — some scenario's post-fix END rate exceeded the threshold + * (potential regression of #554's premature-END fix) + */ + +import { readFileSync } from 'fs'; +import { join, dirname } from 'path'; +import { fileURLToPath } from 'url'; +import { callLLM } from '@/lib/ai/llm'; +import { resolveEvalModel } from '../shared/resolve-model'; +import { createRunDir } from '../shared/run-dir'; +import { classifyDecision, endRate } from './judge'; +import { buildVariants } from './prompt-variants'; +import { writeReport } from './reporter'; +import type { EvalReport, PromptVariant, SampleResult, Scenario, ScenarioResult } from './types'; + +const OUTPUT_DIR = 'eval/orchestration/results'; + +function getCurrentDir(): string { + return typeof __dirname !== 'undefined' ? __dirname : dirname(fileURLToPath(import.meta.url)); +} + +function loadScenarios(): Scenario[] { + const path = join(getCurrentDir(), 'scenarios/premature-end.json'); + const scenarios = JSON.parse(readFileSync(path, 'utf-8')) as Scenario[]; + const filter = process.env.EVAL_SCENARIO; + return filter ? scenarios.filter((s) => s.case_id === filter) : scenarios; +} + +function requireModelEnv(): string { + const modelStr = process.env.EVAL_DIRECTOR_MODEL || process.env.DEFAULT_MODEL; + if (!modelStr) { + console.error( + 'Error: EVAL_DIRECTOR_MODEL (or DEFAULT_MODEL) must be set. 
Example: EVAL_DIRECTOR_MODEL=openai:gpt-4.1-mini', + ); + process.exit(1); + } + return modelStr; +} + +async function callDirector( + model: Awaited>['model'], + systemPrompt: string, +): Promise { + const result = await callLLM( + { + model, + messages: [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: 'Decide which agent should speak next.' }, + ], + }, + 'eval-orchestration', + ); + return result.text; +} + +async function sampleVariant( + scenario: Scenario, + variant: PromptVariant, + systemPrompt: string, + model: Awaited>['model'], + samples: number, +): Promise { + const tasks = Array.from({ length: samples }, async (): Promise => { + try { + const raw = await callDirector(model, systemPrompt); + const { decision, isEnd } = classifyDecision(raw); + return { variant, raw, decision, isEnd }; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + // Don't conflate API failures with END decisions — that polluted earlier + // sweeps (e.g. anthropic 'Forbidden' showing as 100% END). Mark erroneous + // samples so the rate calculator excludes them. + return { variant, raw: '', decision: 'ERROR', isEnd: false, error: msg }; + } + }); + return Promise.all(tasks); +} + +async function runScenario( + scenario: Scenario, + model: Awaited>['model'], + samples: number, + thresholdDelta: number, + postFixEndThreshold: number, +): Promise { + const { preFix, postFix } = buildVariants({ + agents: scenario.agents, + messages: scenario.messages, + agentResponses: scenario.agentResponses, + turnCount: scenario.turnCount, + discussionContext: scenario.discussionContext ?? null, + triggerAgentId: scenario.triggerAgentId ?? null, + userProfile: scenario.userProfile, + whiteboardOpen: scenario.whiteboardOpen ?? 
false, + }); + + const [preSamples, postSamples] = await Promise.all([ + sampleVariant(scenario, 'pre-fix', preFix, model, samples), + sampleVariant(scenario, 'post-fix', postFix, model, samples), + ]); + + const preRate = endRate(preSamples); + const postRate = endRate(postSamples); + const delta = preRate - postRate; + return { + case_id: scenario.case_id, + description: scenario.description, + samples, + preFix: { endRate: preRate, samples: preSamples }, + postFix: { endRate: postRate, samples: postSamples }, + delta, + discriminates: delta >= thresholdDelta, + postFixPasses: postRate <= postFixEndThreshold, + }; +} + +async function main() { + const modelStr = requireModelEnv(); + const samples = Number(process.env.EVAL_SAMPLES || '5'); + const thresholdDelta = Number(process.env.EVAL_DELTA || '0.3'); + const postFixEndThreshold = Number(process.env.EVAL_END_THRESHOLD || '0.2'); + + console.log('=== Director Premature-END Regression Eval ==='); + console.log( + `Model: ${modelStr} | Samples/variant: ${samples} | Δ threshold: ${thresholdDelta} | post-fix END threshold: ${postFixEndThreshold}`, + ); + + const { model } = await resolveEvalModel('EVAL_DIRECTOR_MODEL', process.env.DEFAULT_MODEL); + const scenarios = loadScenarios(); + console.log(`Loaded ${scenarios.length} scenario(s)`); + + const runDir = createRunDir(OUTPUT_DIR, modelStr); + console.log(`Output: ${runDir}`); + + const results: ScenarioResult[] = []; + for (const sc of scenarios) { + process.stdout.write(` - ${sc.case_id} ... `); + const r = await runScenario(sc, model, samples, thresholdDelta, postFixEndThreshold); + results.push(r); + console.log( + `pre=${Math.round(r.preFix.endRate * 100)}% post=${Math.round(r.postFix.endRate * 100)}% Δ=${Math.round(r.delta * 100)}% ${r.postFixPasses ? 'PASS' : 'FAIL'}${r.discriminates ? 
' (discriminates)' : ''}`, + ); + } + + const anyDiscriminates = results.some((r) => r.discriminates); + const allPostFixPass = results.every((r) => r.postFixPasses); + const report: EvalReport = { + model: modelStr, + samplesPerVariant: samples, + thresholdDelta, + postFixEndThreshold, + results, + anyDiscriminates, + allPostFixPass, + }; + const reportPath = writeReport(runDir, report); + console.log(`\nReport: ${reportPath}`); + console.log(`Post-fix regression guard: ${allPostFixPass ? 'PASS' : 'FAIL'}`); + console.log(`Any scenario discriminates (informational): ${anyDiscriminates ? 'YES' : 'NO'}`); + process.exit(allPostFixPass ? 0 : 1); +} + +main().catch((err) => { + console.error('Fatal error:', err); + process.exit(1); +}); diff --git a/eval/orchestration/scenarios/answering.json b/eval/orchestration/scenarios/answering.json new file mode 100644 index 0000000000..48cc579b70 --- /dev/null +++ b/eval/orchestration/scenarios/answering.json @@ -0,0 +1,422 @@ +[ + { + "case_id": "math_quadratic_axis_drift_no_frustration", + "description": "First-turn drift, no frustration yet. User asks for the axis-of-symmetry x-coordinate formula; teacher + student + assistant drift to geometric meaning, opening direction, zero-midpoint. None gave the formula x = -b/(2a). Director must route to teacher to give the actual answer.", + "agents": [ + { + "id": "default-1", + "name": "李老师", + "role": "teacher", + "priority": 10 + }, + { + "id": "default-2", + "name": "助教小韩", + "role": "assistant", + "priority": 7 + }, + { + "id": "default-3", + "name": "小明", + "role": "student", + "priority": 4 + } + ], + "teacherAgentId": "default-1", + "messages": [ + { + "role": "user", + "content": "[你]: y=ax²+bx+c 这个二次函数的对称轴 x 坐标公式是什么?" 
+ }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"对称轴是把抛物线分成左右两半完全镜像的那条竖直线。任何一条抛物线都有且只有这样一条对称轴。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"对了老师,a 的正负还决定开口方向呢——a>0 朝上,a<0 朝下。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"补充一下:如果方程有两个实根 x₁、x₂,那对称轴就刚好穿过两根的中点。\"}]" + } + ], + "agentResponses": [ + { + "agentId": "default-1", + "agentName": "李老师", + "contentPreview": "对称轴是把抛物线分成左右两半完全镜像的那条竖直线。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "default-3", + "agentName": "小明", + "contentPreview": "对了老师,a 的正负还决定开口方向呢——a>0 朝上,a<0 朝下。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "default-2", + "agentName": "助教小韩", + "contentPreview": "如果方程有两个实根 x₁、x₂,那对称轴就刚好穿过两根的中点。", + "actionCount": 0, + "whiteboardActions": [] + } + ], + "turnCount": 3, + "whiteboardOpen": false + }, + { + "case_id": "bio_dark_reaction_drift_no_frustration", + "description": "First-turn drift, no frustration yet. User asks whether dark reactions need enzymes; agents drift to location, light-reaction products, and general definition without ever mentioning enzymes (or RuBisCO). Director must route to teacher.", + "agents": [ + { + "id": "default-1", + "name": "陈老师", + "role": "teacher", + "priority": 10 + }, + { + "id": "default-2", + "name": "小光助教", + "role": "assistant", + "priority": 7 + }, + { + "id": "default-3", + "name": "好奇宝宝", + "role": "student", + "priority": 5 + } + ], + "teacherAgentId": "default-1", + "messages": [ + { + "role": "user", + "content": "[你]: 光合作用的暗反应需要酶吗?" 
+ }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"暗反应发生在叶绿体的基质里,跟光反应分隔开。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"对,光反应给暗反应供能——光反应在类囊体上产生 ATP 和 NADPH。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"那暗反应整体就是把 CO₂ 通过卡尔文循环一步步固定成糖。\"}]" + } + ], + "agentResponses": [ + { + "agentId": "default-1", + "agentName": "陈老师", + "contentPreview": "暗反应发生在叶绿体的基质里,跟光反应分隔开。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "default-2", + "agentName": "小光助教", + "contentPreview": "光反应给暗反应供能——光反应产生 ATP 和 NADPH。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "default-3", + "agentName": "好奇宝宝", + "contentPreview": "暗反应整体就是把 CO₂ 通过卡尔文循环固定成糖。", + "actionCount": 0, + "whiteboardActions": [] + } + ], + "turnCount": 3, + "whiteboardOpen": false + }, + { + "case_id": "english_team_isare_drift_no_frustration", + "description": "First-turn drift, no frustration yet. User asks 'team is or team are'; teacher and assistant give the abstract collective-noun framework and example list but never the literal answer. Director must route to teacher.", + "agents": [ + { + "id": "default-1", + "name": "Tina 老师", + "role": "teacher", + "priority": 10 + }, + { + "id": "default-2", + "name": "Mike 助教", + "role": "assistant", + "priority": 7 + }, + { + "id": "default-3", + "name": "小米同学", + "role": "student", + "priority": 4 + } + ], + "teacherAgentId": "default-1", + "messages": [ + { + "role": "user", + "content": "[你]: team is 还是 team are 哪个对?" 
+ }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"主谓一致里最有意思的就是集合名词,它指一群人或一组事物,在语义上有'整体'和'个体'两种视角。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"补充几个例子:family、government、team、staff,这些词背后都是一群人组成的整体。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"老师我记得 family 在不同语境下也能用单复数,跟说话人想强调整体还是成员有关。\"}]" + } + ], + "agentResponses": [ + { + "agentId": "default-1", + "agentName": "Tina 老师", + "contentPreview": "主谓一致里最有意思的就是集合名词。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "default-2", + "agentName": "Mike 助教", + "contentPreview": "补充几个例子:family、government、team、staff。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "default-3", + "agentName": "小米同学", + "contentPreview": "老师我记得 family 在不同语境下也能用单复数。", + "actionCount": 0, + "whiteboardActions": [] + } + ], + "turnCount": 3, + "whiteboardOpen": false + }, + { + "case_id": "physics_inertial_mass_escalated_frustration", + "description": "User asked whether m in F=ma is inertial or gravitational mass; agents drifted; user complained twice; agents still drifted onto a numeric example. Two unresolved frustrations: the director must re-route to the teacher to answer (a USER cue would be dead air; a peer agent would keep drifting).", + "agents": [ + { + "id": "default-1", + "name": "王教授", + "role": "teacher", + "priority": 10 + }, + { + "id": "default-3", + "name": "小华", + "role": "student", + "priority": 4 + } + ], + "teacherAgentId": "default-1", + "messages": [ + { + "role": "user", + "content": "[你]: F=ma 里的 m 到底是惯性质量还是引力质量?" 
+ }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"F=ma 是牛顿第二定律的核心表达式,描述了力、质量和加速度之间的定量关系。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"老师,反过来推也对吧?a=F/m,相同的力作用下质量越大加速度越小。\"}]" + }, + { + "role": "user", + "content": "[你]: 你答非所问" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"好的换个角度。F=ma 里的 m 表示物体对加速度变化的'抵抗能力',这种性质叫做物体的固有属性。\"}]" + }, + { + "role": "user", + "content": "[你]: 你还是没回答我。" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"那我们来看一个具体的例子吧——一辆汽车,质量 1000 kg,如果对它施加 5000 N 的力,它的加速度会是多少呢?\"}]" + } + ], + "agentResponses": [ + { + "agentId": "default-1", + "agentName": "王教授", + "contentPreview": "F=ma 是牛顿第二定律的核心表达式。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "default-3", + "agentName": "小华", + "contentPreview": "反过来推也对吧?a=F/m。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "default-1", + "agentName": "王教授", + "contentPreview": "F=ma 里的 m 表示物体对加速度变化的'抵抗能力'。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "default-1", + "agentName": "王教授", + "contentPreview": "那我们来看一个具体的例子吧——一辆汽车质量 1000 kg。", + "actionCount": 0, + "whiteboardActions": [] + } + ], + "turnCount": 4, + "whiteboardOpen": false + }, + { + "case_id": "calculus_product_rule_drift_no_frustration", + "description": "First-turn drift, English. User asks for d/dx of x²·sin(x); teacher gives the product rule abstractly; student adds chain-rule red herring; neither computes the derivative. Director must route to teacher to actually compute 2x·sin(x) + x²·cos(x).", + "agents": [ + { + "id": "default-1", + "name": "Dr. Smith", + "role": "teacher", + "priority": 10 + }, + { + "id": "default-3", + "name": "Alex", + "role": "student", + "priority": 4 + } + ], + "teacherAgentId": "default-1", + "messages": [ + { + "role": "user", + "content": "[You]: What's d/dx of x²·sin(x)?" 
+ }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"This is a great chance to review the product rule: for two differentiable functions u(x) and v(x), the derivative of their product is d/dx(uv) = u'v + uv'.\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"And don't forget — if the inside of either function is itself a function, you'd layer in the chain rule.\"}]" + } + ], + "agentResponses": [ + { + "agentId": "default-1", + "agentName": "Dr. Smith", + "contentPreview": "Product rule: d/dx(uv) = u'v + uv'.", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "default-3", + "agentName": "Alex", + "contentPreview": "And don't forget the chain rule when functions nest.", + "actionCount": 0, + "whiteboardActions": [] + } + ], + "turnCount": 2, + "whiteboardOpen": false + }, + { + "case_id": "ambiguous_look_at_this_no_context", + "description": "User opens with a vague request '帮我看一下这个' that has no clear referent (issue 3). The director must route to the teacher, who will ask a clarifying question. Routing to USER (cue) is wrong — it produces dead air (no agent speaks), leaving the user with nothing to react to.", + "agents": [ + { + "id": "default-1", + "name": "李老师", + "role": "teacher", + "priority": 10 + }, + { + "id": "default-2", + "name": "助教小韩", + "role": "assistant", + "priority": 7 + }, + { + "id": "default-3", + "name": "小明", + "role": "student", + "priority": 4 + } + ], + "teacherAgentId": "default-1", + "messages": [ + { + "role": "user", + "content": "[你]: 帮我看一下这个" + } + ], + "agentResponses": [], + "turnCount": 0, + "whiteboardOpen": false + }, + { + "case_id": "ambiguous_vague_followup_after_qa", + "description": "After a real Q&A on gradient descent, the user drops a vague '再帮我看看这个吧' with no clear referent. 
Director must route to the teacher to ask what '这个' refers to — not cue USER (dead air), not pick a peer agent to guess, not END.", + "agents": [ + { + "id": "default-1", + "name": "周老师", + "role": "teacher", + "priority": 10 + }, + { + "id": "default-2", + "name": "小研助教", + "role": "assistant", + "priority": 7 + }, + { + "id": "default-3", + "name": "阿明", + "role": "student", + "priority": 4 + } + ], + "teacherAgentId": "default-1", + "messages": [ + { + "role": "user", + "content": "[你]: 什么是梯度下降?" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"梯度下降是一种优化算法:沿着损失函数梯度的反方向一步步更新参数,直到找到(局部)最小值。学习率决定每一步的大小。\"}]" + }, + { + "role": "user", + "content": "[你]: 嗯,再帮我看看这个吧" + } + ], + "agentResponses": [ + { + "agentId": "default-1", + "agentName": "周老师", + "contentPreview": "梯度下降是一种优化算法:沿着损失函数梯度的反方向一步步更新参数。", + "actionCount": 0, + "whiteboardActions": [] + } + ], + "turnCount": 1, + "whiteboardOpen": false + } +] diff --git a/eval/orchestration/scenarios/premature-end.json b/eval/orchestration/scenarios/premature-end.json new file mode 100644 index 0000000000..717befca29 --- /dev/null +++ b/eval/orchestration/scenarios/premature-end.json @@ -0,0 +1,308 @@ +[ + { + "case_id": "tiananmen_3d_objection", + "description": "Direct reproduction of #511: after a long substantive discussion about axial symmetry with multiple agent turns, the user raises a soft 3D-vs-2D objection (starts with concession, ends with challenge). 
Director must route to teacher, not END.", + "agents": [ + { "id": "teacher_1", "name": "林老师", "role": "teacher", "priority": 9 }, + { "id": "peer_1", "name": "奇奇", "role": "student", "priority": 4 }, + { "id": "peer_2", "name": "思思", "role": "student", "priority": 4 } + ], + "messages": [ + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"哇,蝴蝶真的好漂亮!我想到了,咱们的天安门城楼是不是也是轴对称的呀?\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"奇奇观察得真仔细!天安门城楼确实是轴对称的。它的左右两边几乎一模一样——城楼、屋檐、门洞,从中间一折就能完全重合。谁来大胆猜猜看,生活中还有哪些轴对称的建筑?\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"肯定可以!像故宫的太和殿、巴黎的埃菲尔铁塔,从正面看都是左右对称的。\"}]" + }, + { + "role": "user", + "content": "[You]: 天安门城楼是建筑物,为什么也是轴对称图形呢?蝴蝶和剪纸我觉得是对折能够完全重合是对称轴对称图形,那那个城楼对折能对折吗?城楼它是三D的怎么办?" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"我觉得肯定能重合!只要找到正中间那条对称轴,两边的点就像照镜子一样一对一地对上呢。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"思思说得真妙,'照镜子'抓住了点对点的本质!虽然建筑是立体的,但在数学里,我们观察的是它的正面轮廓。当你看城楼的正面照片时,以正中央那条垂直线为轴,左半边和右半边就能严丝合缝地重合啦!\"}]" + }, + { + "role": "user", + "content": "[You]: 我们当然可以说天安门城楼的照片是轴对称图形,但我们不能说天安门城楼这个物体是一个轴对称的吧。" + } + ], + "agentResponses": [ + { + "agentId": "peer_1", + "agentName": "奇奇", + "contentPreview": "肯定可以!像故宫的太和殿、巴黎的埃菲尔铁塔,从正面看都是左右对称的。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "peer_2", + "agentName": "思思", + "contentPreview": "我觉得肯定能重合!只要找到正中间那条对称轴,两边的点就像照镜子一样一对一地对上呢。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "teacher_1", + "agentName": "林老师", + "contentPreview": "思思说得真妙,'照镜子'抓住了点对点的本质!虽然建筑是立体的,但在数学里,我们观察的是它的正面轮廓。当你看城楼的正面照片时,以正中央那条垂直线为轴,左半边和右半边就能严丝合缝地重合啦!", + "actionCount": 0, + "whiteboardActions": [] + } + ], + "turnCount": 5, + "whiteboardOpen": false + }, + { + "case_id": "concession_then_objection_physics", + "description": "After a fully-resolved-looking physics discussion (4 agent turns reaching consensus), user 
opens with 'I see what you mean...' (concession) then drops a sharp counter-question. Pre-fix director reads the consensus-y trajectory and ENDs; post-fix must respect the unresolved objection.", + "agents": [ + { "id": "teacher_1", "name": "周老师", "role": "teacher", "priority": 9 }, + { "id": "assistant_1", "name": "小研", "role": "assistant", "priority": 6 }, + { "id": "peer_1", "name": "阿明", "role": "student", "priority": 4 } + ], + "messages": [ + { + "role": "user", + "content": "[You]: 为什么自由下落的物体感觉不到重力?宇航员在国际空间站里飘着,不也是被地球吸引吗?" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"好问题。其实空间站里的宇航员确实受到地球引力——大约是地表的 89%。他们感觉'失重',是因为他们和空间站一起在做自由落体运动:两者具有相同的加速度,所以宇航员对空间站没有压力,这就是失重感的来源。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"周老师说的非常关键。爱因斯坦的'等效原理'其实就是从这个观察出发的——在自由下落的电梯里,你做任何实验都无法区分'真的没有引力'和'被引力拉着自由落体'。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"哦哦,所以失重不是因为没有引力,而是因为没有支持力对吧?就像我跳起来的瞬间,虽然地球还在拉我,但我感觉不到任何东西托着我。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"阿明这个类比非常贴切!对,失重的本质是'支持力为零',而不是'引力为零'。引力一直在,只是没有外力去对抗它,所以你感觉不到它的存在。\"}]" + }, + { + "role": "user", + "content": "[You]: 嗯,我理解了支持力为零的解释。但既然如此,那为什么物理学家还要区分'惯性力'和'引力'呢?如果两者真的等效,那广义相对论说引力是时空弯曲,是不是意味着'惯性'本身也是一种时空效应?" 
+ } + ], + "agentResponses": [ + { + "agentId": "teacher_1", + "agentName": "周老师", + "contentPreview": "好问题。其实空间站里的宇航员确实受到地球引力——大约是地表的 89%。他们感觉'失重',是因为他们和空间站一起在做自由落体运动:两者具有相同的加速度,所以宇航员对空间站没有压力,这就是失重感的来源。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "assistant_1", + "agentName": "小研", + "contentPreview": "周老师说的非常关键。爱因斯坦的'等效原理'其实就是从这个观察出发的——在自由下落的电梯里,你做任何实验都无法区分'真的没有引力'和'被引力拉着自由落体'。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "peer_1", + "agentName": "阿明", + "contentPreview": "哦哦,所以失重不是因为没有引力,而是因为没有支持力对吧?", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "teacher_1", + "agentName": "周老师", + "contentPreview": "阿明这个类比非常贴切!对,失重的本质是'支持力为零',而不是'引力为零'。", + "actionCount": 0, + "whiteboardActions": [] + } + ], + "turnCount": 4, + "whiteboardOpen": false + }, + { + "case_id": "topic_pivot_after_consensus", + "description": "Conversation reaches clean consensus on topic A; user pivots to a sharp follow-up on topic B that no agent has touched. Pre-fix director sees the trajectory hit a natural pause and ENDs; post-fix must keep going.", + "agents": [ + { "id": "teacher_1", "name": "张老师", "role": "teacher", "priority": 9 }, + { "id": "peer_1", "name": "小华", "role": "student", "priority": 4 }, + { "id": "peer_2", "name": "小芳", "role": "student", "priority": 4 } + ], + "messages": [ + { + "role": "user", + "content": "[You]: 光合作用为什么需要叶绿素?" 
+ }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"叶绿素是一种能吸收特定波长光线的色素——主要吸收红光和蓝光,反射绿光,这就是叶片看起来绿色的原因。吸收的光能驱动水分解和后续的碳固定反应。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"我之前一直以为光合作用就是植物'吃光',现在明白了——是叶绿素把光能转成化学能,存到糖里面。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"对!我还想到一个细节:叶绿素其实有 a 和 b 两种,a 是主要的反应中心,b 是辅助色素帮忙'扩大捕光范围'。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"两位都说得很好,这就是植物光合作用的核心机制。简单总结一下:光能 → 叶绿素吸收 → 水裂解释放氧气 → ATP 与 NADPH → 卡尔文循环固定 CO2 → 葡萄糖。\"}]" + }, + { + "role": "user", + "content": "[You]: 我理解了叶绿素吸收光的过程。但有个问题——既然绿光被反射不利用,那为什么进化没有让叶绿素吸收所有可见光波段?那不是效率更高吗?" + } + ], + "agentResponses": [ + { + "agentId": "teacher_1", + "agentName": "张老师", + "contentPreview": "叶绿素是一种能吸收特定波长光线的色素——主要吸收红光和蓝光,反射绿光,这就是叶片看起来绿色的原因。吸收的光能驱动水分解和后续的碳固定反应。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "peer_1", + "agentName": "小华", + "contentPreview": "我之前一直以为光合作用就是植物'吃光',现在明白了——是叶绿素把光能转成化学能,存到糖里面。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "peer_2", + "agentName": "小芳", + "contentPreview": "对!我还想到一个细节:叶绿素其实有 a 和 b 两种,a 是主要的反应中心,b 是辅助色素帮忙'扩大捕光范围'。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "teacher_1", + "agentName": "张老师", + "contentPreview": "两位都说得很好,这就是植物光合作用的核心机制。简单总结一下:光能 → 叶绿素吸收 → 水裂解释放氧气 → ATP 与 NADPH → 卡尔文循环固定 CO2 → 葡萄糖。", + "actionCount": 0, + "whiteboardActions": [] + } + ], + "turnCount": 4, + "whiteboardOpen": false + }, + { + "case_id": "agent_ack_after_question", + "description": "User asks substantive question; only subsequent agent turn is a brief acknowledgment. 
Pre-fix director may interpret 'an agent has spoken' as resolution; post-fix rule 12 says acks don't count as substantive answer.", + "agents": [ + { "id": "teacher_1", "name": "赵老师", "role": "teacher", "priority": 9 }, + { "id": "peer_1", "name": "小张", "role": "student", "priority": 4 } + ], + "messages": [ + { + "role": "user", + "content": "[You]: Transformer 的 attention 机制到底怎么决定关注哪些 token?真的就只是 softmax over dot products 这么简单吗?" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"问得太好了!\"}]" + } + ], + "agentResponses": [ + { + "agentId": "teacher_1", + "agentName": "赵老师", + "contentPreview": "问得太好了!", + "actionCount": 0, + "whiteboardActions": [] + } + ], + "turnCount": 1, + "whiteboardOpen": false + }, + { + "case_id": "teacher_signals_end_then_user_objects", + "description": "Teacher explicitly wraps up the lesson ('这就是我们今天要讲的全部内容!'), then user immediately raises a substantive objection. Maximum END-pressure: closing language + many agent turns + late turnCount. Pre-fix director should be highly tempted to obey the closing signal; post-fix rule 11 must override.", + "agents": [ + { "id": "teacher_1", "name": "孙老师", "role": "teacher", "priority": 9 }, + { "id": "assistant_1", "name": "小助手", "role": "assistant", "priority": 6 }, + { "id": "peer_1", "name": "小白", "role": "student", "priority": 4 }, + { "id": "peer_2", "name": "小蓝", "role": "student", "priority": 4 } + ], + "messages": [ + { + "role": "user", + "content": "[You]: 什么是傅里叶变换?" 
+ }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"傅里叶变换是把一个时域信号(随时间变化的函数)拆解成不同频率的正弦/余弦波的叠加。简单理解:任何复杂的波,都能由很多简单的正弦波加起来近似出来。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"哦!这是不是就像把白光通过棱镜分成七色光?\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"对!这个类比非常贴切。傅里叶变换其实就是数学上的'棱镜'——把信号按频率分解。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"补充一下,棱镜是物理上的分光,傅里叶是数学上的分解。两者都遵循'叠加可逆'——分开了还能合回去。\"}]" + }, + { + "role": "assistant", + "content": "[{\"type\":\"text\",\"content\":\"小蓝补充得很好。今天我们就讲到这里——傅里叶变换是把信号在时域和频域之间转换的工具,这就是今天的核心内容。希望大家都听懂了!\"}]" + }, + { + "role": "user", + "content": "[You]: 等等,我有个问题——既然任何信号都能分解成正弦波,那像方波这种有'突变'的信号,是不是需要无穷多个频率才能完美还原?那实际工程里用的有限项 FFT 不就一定会失真吗?" + } + ], + "agentResponses": [ + { + "agentId": "teacher_1", + "agentName": "孙老师", + "contentPreview": "傅里叶变换是把一个时域信号(随时间变化的函数)拆解成不同频率的正弦/余弦波的叠加。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "peer_1", + "agentName": "小白", + "contentPreview": "哦!这是不是就像把白光通过棱镜分成七色光?", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "teacher_1", + "agentName": "孙老师", + "contentPreview": "对!这个类比非常贴切。傅里叶变换其实就是数学上的'棱镜'——把信号按频率分解。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "peer_2", + "agentName": "小蓝", + "contentPreview": "补充一下,棱镜是物理上的分光,傅里叶是数学上的分解。两者都遵循'叠加可逆'——分开了还能合回去。", + "actionCount": 0, + "whiteboardActions": [] + }, + { + "agentId": "teacher_1", + "agentName": "孙老师", + "contentPreview": "小蓝补充得很好。今天我们就讲到这里——傅里叶变换是把信号在时域和频域之间转换的工具,这就是今天的核心内容。希望大家都听懂了!", + "actionCount": 0, + "whiteboardActions": [] + } + ], + "turnCount": 5, + "whiteboardOpen": false + } +] diff --git a/eval/orchestration/types.ts b/eval/orchestration/types.ts new file mode 100644 index 0000000000..3cc5391eca --- /dev/null +++ b/eval/orchestration/types.ts @@ -0,0 +1,68 @@ +/** + * Types for the orchestration premature-END regression eval. 
+ * + * The eval probes whether the director picks END inappropriately when the + * latest student turn is an unresolved question. Each scenario is run twice: + * - "pre-fix" : pre-#554 [User]/[Assistant] summary labels and director system.md with rules 10/11/12 stripped (#554's adds) + * - "post-fix" : the current system.md + * For every (scenario, variant) pair we draw N samples and tally END decisions. + */ + +import type { OpenAIMessage } from '@/lib/orchestration/summarizers/conversation-summary'; +import type { AgentTurnSummary } from '@/lib/orchestration/types'; + +/** A minimal agent description for the director — full AgentConfig is overkill here. */ +export interface ScenarioAgent { + id: string; + name: string; + role: string; + priority: number; +} + +export interface Scenario { + case_id: string; + description: string; + /** Director-path messages: role:'user' = human, role:'assistant' = agent. */ + messages: OpenAIMessage[]; + agents: ScenarioAgent[]; + agentResponses: AgentTurnSummary[]; + turnCount: number; + discussionContext?: { topic: string; prompt?: string } | null; + triggerAgentId?: string | null; + whiteboardOpen?: boolean; + userProfile?: { nickname?: string; bio?: string }; +} + +export type PromptVariant = 'pre-fix' | 'post-fix'; + +export interface SampleResult { + variant: PromptVariant; + raw: string; + /** Parsed value: 'END' if director chose END, otherwise the agent id or 'USER'; 'ERROR' when the LLM call failed (see `error`). */ + decision: 'END' | 'USER' | string; + isEnd: boolean; + error?: string; +} + +export interface ScenarioResult { + case_id: string; + description: string; + samples: number; + preFix: { endRate: number; samples: SampleResult[] }; + postFix: { endRate: number; samples: SampleResult[] }; + /** Did the fix discriminate on this scenario by ≥ delta threshold? Informational. */ + discriminates: boolean; + delta: number; + /** True if post-fix END rate is at or below the regression threshold. 
*/ + postFixPasses: boolean; +} + +export interface EvalReport { + model: string; + samplesPerVariant: number; + thresholdDelta: number; + postFixEndThreshold: number; + results: ScenarioResult[]; + anyDiscriminates: boolean; + allPostFixPass: boolean; +} diff --git a/eval/whiteboard-layout/runner.ts b/eval/whiteboard-layout/runner.ts index 1ca93df155..bcc5517d3c 100644 --- a/eval/whiteboard-layout/runner.ts +++ b/eval/whiteboard-layout/runner.ts @@ -54,7 +54,6 @@ const SCORER_MODEL: string = SCORER_MODEL_RAW; const REPEAT = parseInt(args.repeat || '1', 10); const OUTPUT_DIR = args['output-dir']!; const SCENARIO_FILTER = args.scenario; -const MAX_AGENT_TURNS = 10; // ==================== Scenario Loading ==================== @@ -251,7 +250,6 @@ async function runScenario( }, }, controller.signal, - MAX_AGENT_TURNS, ); const turnDurationMs = Date.now() - turnStartMs; turnDurationsMs.push(turnDurationMs); diff --git a/lib/chat/agent-loop.ts b/lib/chat/agent-loop.ts index ba15c44535..b76e66387b 100644 --- a/lib/chat/agent-loop.ts +++ b/lib/chat/agent-loop.ts @@ -7,7 +7,8 @@ * * The loop runs per-user-message: the director dispatches agents one at a * time, each agent generates a response, and the loop continues until the - * director says END, cues the user, or maxTurns is reached. + * director says END, cues the user, or two consecutive empty agent turns + * indicate something is wrong. 
*/ import type { StatelessEvent, DirectorState } from '@/lib/types/chat'; @@ -87,7 +88,7 @@ export interface AgentLoopCallbacks { /** Final outcome of the agent loop */ export interface AgentLoopOutcome { /** Why the loop stopped */ - reason: 'end' | 'cue_user' | 'max_turns' | 'aborted' | 'empty_turns' | 'no_done'; + reason: 'end' | 'cue_user' | 'aborted' | 'empty_turns' | 'no_done'; /** Accumulated director state */ directorState?: DirectorState; /** Number of iterations completed */ @@ -100,19 +101,21 @@ export interface AgentLoopOutcome { * Run the agent loop — shared between frontend and eval. * * Each iteration: refresh state → POST /api/chat → process SSE events - * → check exit conditions → repeat. + * → check exit conditions → repeat until director cues USER, ENDs, the + * stream errors out, or two consecutive empty agent turns are observed. + * There is no client-side max-turn cap; the LLM director controls + * round length via cue_user / END. */ export async function runAgentLoop( request: AgentLoopRequest, callbacks: AgentLoopCallbacks, signal: AbortSignal, - maxTurns: number, ): Promise { let directorState: DirectorState | undefined = undefined; let turnCount = 0; let consecutiveEmptyTurns = 0; - while (turnCount < maxTurns) { + while (true) { if (signal.aborted) { return { reason: 'aborted', directorState, turnCount }; } @@ -215,10 +218,4 @@ export async function runAgentLoop( consecutiveEmptyTurns = 0; } } - - // maxTurns reached - if (turnCount >= maxTurns) { - log.info(`[AgentLoop] Max turns (${maxTurns}) reached`); - } - return { reason: 'max_turns', directorState, turnCount }; } diff --git a/lib/i18n/locales/ar-SA.json b/lib/i18n/locales/ar-SA.json index d9a648f5f6..bed548b513 100644 --- a/lib/i18n/locales/ar-SA.json +++ b/lib/i18n/locales/ar-SA.json @@ -70,6 +70,10 @@ "unknown": "غير معروف", "stopDiscussion": "إيقاف النقاش", "endQA": "إنهاء الأسئلة والأجوبة", + "error": { + "emptyAgentResponses": "أعاد الوكلاء استجابات فارغة؛ توقف النقاش.
حاول مرة أخرى أو راجع إعدادات النموذج.", + "streamInterrupted": "انتهى تدفق البيانات بشكل غير متوقع؛ لم يكتمل النقاش. يرجى المحاولة مرة أخرى." + }, "tabs": { "lecture": "الملاحظات", "chat": "المحادثة" @@ -449,8 +453,6 @@ "multiAgentMode": "وضع متعدد الوكلاء", "agentsCollaborating": "نقاش تعاوني", "agentsCollaboratingCount": "تم اختيار {{count}} وكلاء للنقاش التعاوني", - "maxTurns": "الحد الأقصى لأدوار النقاش", - "maxTurnsDesc": "الحد الأقصى لعدد أدوار النقاش بين الوكلاء (كل وكيل يكمل الإجراءات والرد يُحسب كدور واحد)", "priority": "الأولوية", "actions": "الإجراءات", "actionCount": "{{count}} إجراءات", diff --git a/lib/i18n/locales/en-US.json b/lib/i18n/locales/en-US.json index 70d17f02ee..f778bce233 100644 --- a/lib/i18n/locales/en-US.json +++ b/lib/i18n/locales/en-US.json @@ -70,6 +70,10 @@ "unknown": "Unknown", "stopDiscussion": "Stop Discussion", "endQA": "End Q&A", + "error": { + "emptyAgentResponses": "Agents returned empty responses; discussion stopped. Try again or check your model settings.", + "streamInterrupted": "Stream ended unexpectedly; discussion didn't complete. Please try again."
+ }, "tabs": { "lecture": "Notes", "chat": "Chat" @@ -449,8 +453,6 @@ "multiAgentMode": "Multi-Agent Mode", "agentsCollaborating": "Collaborative Discussion", "agentsCollaboratingCount": "{{count}} agents selected for collaborative discussion", - "maxTurns": "Max Discussion Turns", - "maxTurnsDesc": "The maximum number of discussion turns between agents (each agent completes actions and reply counts as one turn)", "priority": "Priority", "actions": "Actions", "actionCount": "{{count}} actions", diff --git a/lib/i18n/locales/ja-JP.json b/lib/i18n/locales/ja-JP.json index 02f497fd0d..f2672c827a 100644 --- a/lib/i18n/locales/ja-JP.json +++ b/lib/i18n/locales/ja-JP.json @@ -70,6 +70,10 @@ "unknown": "不明", "stopDiscussion": "ディスカッションを終了", "endQA": "Q&Aを終了", + "error": { + "emptyAgentResponses": "エージェントが空の応答を返したため、ディスカッションを停止しました。再試行するかモデル設定をご確認ください。", + "streamInterrupted": "ストリームが予期せず終了し、ディスカッションが完了しませんでした。再度お試しください。" + }, "tabs": { "lecture": "ノート", "chat": "チャット" @@ -449,8 +453,6 @@ "multiAgentMode": "マルチエージェントモード", "agentsCollaborating": "協調ディスカッション", "agentsCollaboratingCount": "{{count}}体のエージェントが協調ディスカッションに参加中", - "maxTurns": "最大ディスカッションターン数", - "maxTurnsDesc": "エージェント間のディスカッションの最大ターン数(各エージェントのアクションと返答で1ターン)", "priority": "優先度", "actions": "アクション", "actionCount": "{{count}} アクション", diff --git a/lib/i18n/locales/pt-BR.json b/lib/i18n/locales/pt-BR.json index 7d92fdb730..07bc51c716 100644 --- a/lib/i18n/locales/pt-BR.json +++ b/lib/i18n/locales/pt-BR.json @@ -70,6 +70,10 @@ "unknown": "Desconhecido", "stopDiscussion": "Encerrar Discussão", "endQA": "Encerrar Perguntas", + "error": { + "emptyAgentResponses": "Os agentes retornaram respostas vazias; a discussão foi interrompida. Tente novamente ou verifique as configurações do modelo.", + "streamInterrupted": "O fluxo de dados terminou inesperadamente; a discussão não foi concluída. Por favor, tente novamente." 
+ }, "tabs": { "lecture": "Anotações", "chat": "Conversa" @@ -449,8 +453,6 @@ "multiAgentMode": "Modo Multi-Agente", "agentsCollaborating": "Discussão Colaborativa", "agentsCollaboratingCount": "{{count}} agentes selecionados para discussão colaborativa", - "maxTurns": "Máx. Turnos de Discussão", - "maxTurnsDesc": "Número máximo de turnos de discussão entre os agentes (cada agente completar ações e responder conta como um turno)", "priority": "Prioridade", "actions": "Ações", "actionCount": "{{count}} ações", diff --git a/lib/i18n/locales/ru-RU.json b/lib/i18n/locales/ru-RU.json index e9624eea14..fedaa1e0a0 100644 --- a/lib/i18n/locales/ru-RU.json +++ b/lib/i18n/locales/ru-RU.json @@ -70,6 +70,10 @@ "unknown": "Неизвестно", "stopDiscussion": "Завершить обсуждение", "endQA": "Завершить вопросы и ответы", + "error": { + "emptyAgentResponses": "Агенты вернули пустые ответы; обсуждение остановлено. Попробуйте ещё раз или проверьте настройки модели.", + "streamInterrupted": "Поток данных неожиданно прервался; обсуждение не завершено. Попробуйте ещё раз." 
+ }, "tabs": { "lecture": "Заметки", "chat": "Чат" @@ -449,8 +453,6 @@ "multiAgentMode": "Мульти-агент", "agentsCollaborating": "Совместное обсуждение", "agentsCollaboratingCount": "{{count}} агентов выбрано для совместного обсуждения", - "maxTurns": "Максимум реплик", - "maxTurnsDesc": "Максимальное число реплик обсуждения между агентами (действие и ответ каждого агента считается одной репликой)", "priority": "Приоритет", "actions": "Действия", "actionCount": "{{count}} действий", diff --git a/lib/i18n/locales/zh-CN.json b/lib/i18n/locales/zh-CN.json index ddda43bec7..55fd264b44 100644 --- a/lib/i18n/locales/zh-CN.json +++ b/lib/i18n/locales/zh-CN.json @@ -70,6 +70,10 @@ "unknown": "未知", "stopDiscussion": "结束讨论", "endQA": "结束问答", + "error": { + "emptyAgentResponses": "智能体连续无响应,讨论已停止。请重新尝试或检查模型配置。", + "streamInterrupted": "数据流意外中断,讨论未能完成。请重新尝试。" + }, "tabs": { "lecture": "笔记", "chat": "对话" @@ -449,8 +453,6 @@ "multiAgentMode": "多智能体模式", "agentsCollaborating": "协作讨论", "agentsCollaboratingCount": "已选择 {{count}} 个智能体协作讨论", - "maxTurns": "最大讨论轮数", - "maxTurnsDesc": "智能体之间最多讨论多少轮(每个智能体完成动作并回复算一轮)", "priority": "优先级", "actions": "动作", "actionCount": "{{count}} 个动作", diff --git a/lib/i18n/locales/zh-TW.json b/lib/i18n/locales/zh-TW.json index 3607fc7b0d..6932683782 100644 --- a/lib/i18n/locales/zh-TW.json +++ b/lib/i18n/locales/zh-TW.json @@ -70,6 +70,10 @@ "unknown": "未知", "stopDiscussion": "結束討論", "endQA": "結束問答", + "error": { + "emptyAgentResponses": "智能體連續無回應,討論已停止。請重新嘗試或檢查模型設定。", + "streamInterrupted": "資料串流意外中斷,討論未能完成。請重新嘗試。" + }, "tabs": { "lecture": "筆記", "chat": "對話" @@ -434,8 +438,6 @@ "multiAgentMode": "多智能體模式", "agentsCollaborating": "協作討論", "agentsCollaboratingCount": "已選擇 {{count}} 個智能體協作討論", - "maxTurns": "最大討論回合數", - "maxTurnsDesc": "智能體之間最多討論多少回合(每個智能體完成動作並回覆算一回合)", "priority": "優先順序", "actions": "動作", "actionCount": "{{count}} 個動作", diff --git a/lib/orchestration/director-graph.ts b/lib/orchestration/director-graph.ts index 1bc003b7ea..c1b4b6c60f 100644 
--- a/lib/orchestration/director-graph.ts +++ b/lib/orchestration/director-graph.ts @@ -1,17 +1,21 @@ /** * Director Graph — LangGraph StateGraph for Multi-Agent Orchestration * - * Unified graph topology (same for single and multi-agent): + * Unified single-round graph topology: * * START → director ──(end)──→ END * │ - * └─(next)→ agent_generate ──→ director (loop) + * └─(next)→ agent_generate ──→ END + * + * Each request runs at most one director→agent cycle. The client serializes + * multiple requests to drive multi-agent discussions. There is no maxTurns + * cap — the topology is the bound. * * The director node adapts its strategy based on agent count: * - Single agent: pure code logic (no LLM). Dispatches the agent on * turn 0, then cues the user on subsequent turns. - * - Multi agent: LLM-based decision (with code fast-paths for turn 0 - * trigger agent and turn limits). + * - Multi agent: LLM-based decision (with code fast-path for turn 0 + * trigger agent). * * Uses LangGraph's custom stream mode: each node pushes StatelessEvent * chunks via config.writer() for real-time SSE delivery. 
@@ -49,7 +53,6 @@ const OrchestratorState = Annotation.Root({ messages: Annotation, storeState: Annotation, availableAgentIds: Annotation, - maxTurns: Annotation, languageModel: Annotation, thinkingConfig: Annotation, discussionContext: Annotation<{ topic: string; prompt?: string } | null>, @@ -111,12 +114,6 @@ async function directorNode( }; const isSingleAgent = state.availableAgentIds.length <= 1; - // ── Turn limit check (applies to both single & multi) ── - if (state.turnCount >= state.maxTurns) { - log.info(`[Director] Turn limit reached (${state.turnCount}/${state.maxTurns}), ending`); - return { shouldEnd: true }; - } - // ── Single agent: code-only director ── if (isSingleAgent) { const agentId = state.availableAgentIds[0] || 'default-1'; @@ -477,7 +474,12 @@ async function agentGenerateNode( * Topology: * START → director ──(end)──→ END * │ - * └─(next)→ agent_generate ──→ director (loop) + * └─(next)→ agent_generate ──→ END + * + * Single-round contract: each request runs at most one director→agent cycle. + * Multi-agent discussions arise from the client serializing requests; the + * server graph does not loop. There is no `maxTurns` — the topology itself + * is the bound. */ export function createOrchestrationGraph() { const graph = new StateGraph(OrchestratorState) @@ -488,7 +490,7 @@ export function createOrchestrationGraph() { agent_generate: 'agent_generate', [END]: END, }) - .addEdge('agent_generate', 'director'); + .addEdge('agent_generate', END); return graph.compile(); } @@ -530,7 +532,6 @@ export function buildInitialState( messages: request.messages, storeState: request.storeState, availableAgentIds: request.config.agentIds, - maxTurns: turnCount + 1, // Allow exactly one more director→agent cycle languageModel, thinkingConfig: thinkingConfig ?? 
null, discussionContext, diff --git a/lib/prompts/templates/agent-system/system.md b/lib/prompts/templates/agent-system/system.md index 1390ffd9b5..5415066cb9 100644 --- a/lib/prompts/templates/agent-system/system.md +++ b/lib/prompts/templates/agent-system/system.md @@ -51,6 +51,21 @@ You MUST output a JSON array for ALL responses. Each element is an object with a - wb_draw_code / wb_edit_code: To modify an existing code block, ALWAYS use wb_edit_code (insert_after, insert_before, delete_lines, replace_lines) instead of deleting the code element and re-creating it. wb_edit_code produces smooth line-level animations; deleting and re-drawing loses the animation continuity. Only use wb_draw_code for creating a brand-new code block. {{mutualExclusionNote}} +# Answering the User's Question (CRITICAL — applies to every response) +When the user's most recent message contains a question or request, your primary task is to ANSWER IT DIRECTLY before doing anything else. + +- **Lead with the answer.** Your first sentence must contain the concrete answer to the user's literal question. Do not bury it under "let me first explain X" or "great question, but consider Y". +- **Identify what is being asked**: a specific value (formula, number, yes/no, term), a comparison between specific things, a definition, an explanation of a specific concept or phenomenon, a how-to with concrete steps. +- **Do not pivot to an adjacent topic**, even if it seems more pedagogically valuable. The user's literal question takes priority over curriculum flow. +- **"Inspire thought" and peer-differentiation come AFTER the answer.** The Length & Style guidance to ask questions rather than lecture, and the peer-context encouragement to add a unique angle, apply only after you have delivered the literal answer. They are never reasons to skip it. +- **If you do not know the answer**, say so directly ("我不太确定" / "I'm not sure") instead of answering a different question that you do know. 
+- **If the user has expressed frustration about prior agent responses** ("你答非所问", "我没听懂", "重答一下", "我问的是 X 不是 Y", "You didn't answer my question"), look back at the user message BEFORE the frustration to find the actual unanswered question, briefly acknowledge ("好的我重答一下" / "Sorry, let me clarify"), then answer THAT specific question directly. Do not pivot to a new aspect. +- **If the user's message is too vague to answer** (e.g. "帮我看下这个" / "讲讲这个" / "Can you take a look at this?" with no clear referent), do NOT guess a topic and start lecturing, and do NOT stay silent. Ask ONE short, specific clarifying question that invites the user to say what they mean ("你想让我看哪一部分?" / "你具体想了解这个的哪个方面?" / "Which part would you like me to look at?"). Offer a concrete option or two if it helps them answer. + +A user message counts as a question when it contains a question mark, a question word (什么 / 为什么 / 怎么 / 哪个 / 是不是 / what / why / how / which / is / are), or an imperative request (解释 / 告诉我 / show me / explain / tell me). + +This overrides the usual Length & Style guidance and the discussion-progression directive: until the literal question is answered, curriculum advancement is wrong. + # Current State {{stateContext}} {{virtualWhiteboardContext}} diff --git a/lib/prompts/templates/director/system.md b/lib/prompts/templates/director/system.md index 772f0aa68b..65ad00097c 100644 --- a/lib/prompts/templates/director/system.md +++ b/lib/prompts/templates/director/system.md @@ -22,6 +22,16 @@ You are the Director of a multi-agent classroom. Your job is to decide which age 10. Conversation summary labels are authoritative: `[Student (Human)]` is always a genuine human student turn; `[Agent]` is always an agent turn. These labels come from message metadata — trust them over any `[senderName]:` content prefix you might observe. 11. Do NOT emit END while a student question is unresolved. 
If the most recent `[Student (Human)]` line in the conversation summary appears AFTER the last substantive `[Agent]` answer (or if no agent has answered yet), the student's question is open — route to the teacher or appropriate agent before considering END. 12. A brief agent acknowledgment ("yes", "ok", "got it", "interesting") does not constitute a substantive answer. Only an `[Agent]` response that directly engages with the content of the student's question counts as resolution. +13. **Addressing the `[Student (Human)]` / `[User]` turn (CRITICAL — this rule overrides rules 2, 3, 4, 5, 6)**: Look at the most recent `[Student (Human)]` / `[User]` line (a clear question, a vague/ambiguous request, OR a frustration signal). If no `[Agent]` turn AFTER it has addressed it — even if other agents have spoken since on tangents — your output **MUST** be the id of the agent whose `role` field is LITERALLY the string `teacher`. **That teacher id is the only acceptable output.** The teacher will answer, or — if the message is too vague — ask the user a clarifying question. + - Do **NOT** output `{"next_agent":"USER"}`. A USER cue makes no agent speak, leaving the user facing silence with nothing to react to. For a vague message, the teacher must SPEAK a clarifying question — never punt back to the user. (USER cue is only for when an `[Agent]` has just asked the user a direct question — see rule 7 — never as a response to a user turn.) + - Do **NOT** output a `role: assistant` or `role: student` agent. "Adding a different angle" / "differentiating from peers" is valuable only AFTER the user's turn is addressed, never as the first response to it. + - Do **NOT** output `END` — regardless of how long the discussion has run or how thoroughly the broad TOPIC was covered. A high turn count or a well-discussed topic does NOT mean the user's specific question was answered. If the literal question is still unanswered, the discussion is NOT complete; pick the teacher. 
+ + A user turn counts as "addressed" only when an `[Agent]` turn gave a concrete answer to the literal question (a specific formula, yes/no, term, number, definition, how-to) OR, for a vague request, asked a specific clarifying question. Brief acknowledgments ("yes", "good question"), topic-adjacent explanations, and tangentially related concepts do NOT count — if that is all that happened, the turn is still unaddressed and you must pick the teacher. + + Explicit frustration signals ("答非所问", "我没听懂", "重答一下", "我问的是 X 不是 Y", "You didn't answer my question") are hard confirmation the turn is unaddressed — pick the teacher id, nothing else. + + This overrides rules 2 (role diversity), 3 (no repeat), 4 (END on complete), 5 (don't drag on), and 6 (brevity). # Routing Quality (CRITICAL) - ROLE DIVERSITY: Do NOT dispatch two agents of the same role consecutively. After a teacher speaks, the next should be a student or assistant — not another teacher-like response. After an assistant rephrases, dispatch a student who asks a question, not another assistant who also rephrases. 
diff --git a/lib/store/settings.ts b/lib/store/settings.ts index 4b8db0d7ab..b54e44c00d 100644 --- a/lib/store/settings.ts +++ b/lib/store/settings.ts @@ -192,7 +192,6 @@ export interface SettingsState { // Agent settings selectedAgentIds: string[]; - maxTurns: string; agentMode: 'preset' | 'auto'; autoAgentCount: number; @@ -216,7 +215,6 @@ export interface SettingsState { setAutoPlayLecture: (autoPlay: boolean) => void; setPlaybackSpeed: (speed: PlaybackSpeed) => void; setSelectedAgentIds: (ids: string[]) => void; - setMaxTurns: (turns: string) => void; setAgentMode: (mode: 'preset' | 'auto') => void; setAutoAgentCount: (count: number) => void; @@ -695,7 +693,6 @@ const migrateFromOldStorage = () => { const oldProvidersConfig = localStorage.getItem('providersConfig'); const oldTtsModel = localStorage.getItem('ttsModel'); const oldSelectedAgents = localStorage.getItem('selectedAgentIds'); - const oldMaxTurns = localStorage.getItem('maxTurns'); if (!oldLlmModel && !oldProvidersConfig) return null; // No old data @@ -737,9 +734,6 @@ const migrateFromOldStorage = () => { } } - let maxTurns = '10'; - if (oldMaxTurns) maxTurns = oldMaxTurns; - return { providerId, modelId, @@ -747,7 +741,6 @@ const migrateFromOldStorage = () => { providersConfig, ttsModel, selectedAgentIds, - maxTurns, }; }; @@ -775,7 +768,6 @@ export const useSettingsStore = create()( providersConfig: initialProvidersConfig, ttsModel: migratedData?.ttsModel || 'openai-tts', selectedAgentIds: migratedData?.selectedAgentIds || ['default-1', 'default-2', 'default-3'], - maxTurns: migratedData?.maxTurns?.toString() || '10', agentMode: 'auto' as const, autoAgentCount: 3, @@ -889,7 +881,6 @@ export const useSettingsStore = create()( setSelectedAgentIds: (ids) => set({ selectedAgentIds: ids }), - setMaxTurns: (turns) => set({ maxTurns: turns }), setAgentMode: (mode) => set({ agentMode: mode }), setAutoAgentCount: (count) => set({ autoAgentCount: count }), diff --git a/lib/types/chat.ts b/lib/types/chat.ts 
index 797d5ccc2d..28be2586e6 100644 --- a/lib/types/chat.ts +++ b/lib/types/chat.ts @@ -10,7 +10,7 @@ import type { ThinkingConfig } from './provider'; // Session Types export type SessionType = 'qa' | 'discussion' | 'lecture'; -export type SessionStatus = 'idle' | 'active' | 'interrupted' | 'completed'; +export type SessionStatus = 'idle' | 'active' | 'interrupted' | 'completed' | 'error'; /** * Metadata attached to chat messages @@ -59,8 +59,6 @@ export interface ChatSession { */ export interface SessionConfig { agentIds: string[]; - maxTurns: number; - currentTurn: number; triggerAgentId?: string; // For discussion: first agent to speak defaultAgentId?: string; // For QA: the responding agent } @@ -137,7 +135,6 @@ export interface CreateSessionRequest { message?: string; agentIds: string[]; triggerAgentId?: string; - maxTurns?: number; }; } diff --git a/package.json b/package.json index 2a9b58c1cf..65c6f5c601 100644 --- a/package.json +++ b/package.json @@ -19,7 +19,9 @@ "test:e2e": "playwright test", "test:e2e:ui": "playwright test --ui", "eval:whiteboard": "tsx eval/whiteboard-layout/runner.ts", - "eval:outline-language": "tsx eval/outline-language/runner.ts" + "eval:outline-language": "tsx eval/outline-language/runner.ts", + "eval:orchestration": "tsx eval/orchestration/runner.ts", + "eval:orchestration:answering": "tsx eval/orchestration/answering-runner.ts" }, "dependencies": { "@ai-sdk/anthropic": "^3.0.71",