diff --git a/.gitignore b/.gitignore
index 741acc764b..daf13572a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -74,6 +74,8 @@ next-env.d.ts
# Eval results
eval/whiteboard-layout/results/
eval/outline-language/results/
+eval/orchestration/results/
+eval/orchestration/results-answering/
# e2e screenshot artifacts
e2e/screenshots/
diff --git a/components/agent/agent-bar.tsx b/components/agent/agent-bar.tsx
index ade89c7d57..f0b947a230 100644
--- a/components/agent/agent-bar.tsx
+++ b/components/agent/agent-bar.tsx
@@ -20,9 +20,6 @@ import {
Volume2,
VolumeX,
Loader2,
- MessageSquare,
- Minus,
- Plus,
Search,
} from 'lucide-react';
import { Tooltip, TooltipContent, TooltipTrigger } from '@/components/ui/tooltip';
@@ -615,8 +612,6 @@ export function AgentBar() {
const { listAgents } = useAgentRegistry();
const selectedAgentIds = useSettingsStore((s) => s.selectedAgentIds);
const setSelectedAgentIds = useSettingsStore((s) => s.setSelectedAgentIds);
- const maxTurns = useSettingsStore((s) => s.maxTurns);
- const setMaxTurns = useSettingsStore((s) => s.setMaxTurns);
const agentMode = useSettingsStore((s) => s.agentMode);
const setAgentMode = useSettingsStore((s) => s.setAgentMode);
const ttsProvidersConfig = useSettingsStore((s) => s.ttsProvidersConfig);
@@ -937,57 +932,6 @@ export function AgentBar() {
)}
-
- {/* Max turns — compact stepper */}
-
-
-
- {t('settings.maxTurns')}
-
-
-
-
{
- const raw = e.target.value.replace(/\D/g, '');
- if (!raw) {
- setMaxTurns('');
- return;
- }
- const v = Math.min(20, Math.max(1, parseInt(raw)));
- setMaxTurns(String(v));
- }}
- onBlur={() => {
- if (!maxTurns || parseInt(maxTurns) < 1) setMaxTurns('1');
- }}
- onClick={(e) => e.stopPropagation()}
- className="w-5 h-5 text-[11px] font-medium tabular-nums text-center bg-transparent outline-none border-none"
- />
-
-
-
)}
diff --git a/components/chat/session-list.tsx b/components/chat/session-list.tsx
index 526443efad..350dc60d19 100644
--- a/components/chat/session-list.tsx
+++ b/components/chat/session-list.tsx
@@ -3,7 +3,7 @@
import type { ChatSession, SessionStatus } from '@/lib/types/chat';
import { cn } from '@/lib/utils';
import { useI18n } from '@/lib/hooks/use-i18n';
-import { ChevronDown, Circle, CheckCircle, Clock } from 'lucide-react';
+import { ChevronDown, Circle, CheckCircle, Clock, AlertCircle } from 'lucide-react';
import { motion, AnimatePresence } from 'motion/react';
import { ChatSessionComponent } from './chat-session';
@@ -32,6 +32,8 @@ function getStatusIcon(status: SessionStatus) {
return ;
case 'completed':
return ;
+ case 'error':
+ return ;
case 'idle':
default:
return ;
diff --git a/components/chat/use-chat-sessions.ts b/components/chat/use-chat-sessions.ts
index 917ba66955..c6911601c5 100644
--- a/components/chat/use-chat-sessions.ts
+++ b/components/chat/use-chat-sessions.ts
@@ -170,6 +170,7 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) {
s.id === sessionId
? {
...s,
+ status: 'error' as SessionStatus,
updatedAt: now,
messages: [
...s.messages,
@@ -456,8 +457,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) {
controller: AbortController,
sessionType: SessionType,
): Promise => {
- const settingsState = useSettingsStore.getState();
-
// Attach full configs for generated (non-default) agents so the server can use them.
// The server-side registry only has default agents; generated agents exist only client-side.
const generatedConfigs = requestTemplate.config.agentIds
@@ -469,11 +468,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) {
requestTemplate.config.agentConfigs = generatedConfigs;
}
- const defaultMaxTurns = requestTemplate.config.agentIds.length <= 1 ? 1 : 10;
- const maxTurns = settingsState.maxTurns
- ? parseInt(settingsState.maxTurns, 10) || defaultMaxTurns
- : defaultMaxTurns;
-
// Per-iteration buffer reference — set in onEvent, used in onIterationEnd
let currentBuffer: StreamBuffer | null = null;
// Tracks agent_start messageId so text_delta/action events with a missing
@@ -607,28 +601,40 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) {
},
},
controller.signal,
- maxTurns,
);
- // Handle loop completion (UI-specific)
+ // Handle loop completion (UI-specific). Map each outcome.reason to a
+ // distinct session state — don't conflate error paths with completion.
if (!controller.signal.aborted) {
- if (outcome.reason !== 'cue_user') {
- setSessions((prev) =>
- prev.map((s) =>
- s.id === sessionId
- ? {
- ...s,
- status: 'completed' as SessionStatus,
- updatedAt: Date.now(),
- }
- : s,
- ),
- );
- onStopSessionRef.current?.();
+ switch (outcome.reason) {
+ case 'cue_user':
+ // Session stays active; UI waits for the next user message.
+ break;
+ case 'end':
+ setSessions((prev) =>
+ prev.map((s) =>
+ s.id === sessionId
+ ? { ...s, status: 'completed' as SessionStatus, updatedAt: Date.now() }
+ : s,
+ ),
+ );
+ onStopSessionRef.current?.();
+ break;
+ case 'empty_turns':
+ clearLiveSessionAfterError(sessionId, t('chat.error.emptyAgentResponses'));
+ onStopSessionRef.current?.();
+ break;
+ case 'no_done':
+ clearLiveSessionAfterError(sessionId, t('chat.error.streamInterrupted'));
+ onStopSessionRef.current?.();
+ break;
+ case 'aborted':
+ // Already handled elsewhere via abort signal.
+ break;
}
}
},
- [createBufferForSession],
+ [createBufferForSession, clearLiveSessionAfterError, t],
);
/**
@@ -646,8 +652,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) {
messages: [],
config: {
agentIds: ['default-1'],
- maxTurns: 0, // Not used for runtime — frontend loop manages maxTurns
- currentTurn: 0,
defaultAgentId: 'default-1',
},
toolCalls: [],
@@ -1070,8 +1074,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) {
messages: [userMessage],
config: {
agentIds,
- maxTurns: 0, // Not used for runtime — frontend loop manages maxTurns
- currentTurn: 0,
defaultAgentId: agentIds[0],
},
toolCalls: [],
@@ -1208,8 +1210,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) {
messages: [],
config: {
agentIds,
- maxTurns: 0, // Not used for runtime — frontend loop manages maxTurns
- currentTurn: 0,
triggerAgentId: agentId,
},
toolCalls: [],
@@ -1370,8 +1370,6 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) {
messages: [lectureMessage],
config: {
agentIds: ['default-1'],
- maxTurns: 0,
- currentTurn: 0,
},
toolCalls: [],
pendingToolCalls: [],
diff --git a/components/settings/agent-settings.tsx b/components/settings/agent-settings.tsx
index ad0c9aa8ae..26a21e993a 100644
--- a/components/settings/agent-settings.tsx
+++ b/components/settings/agent-settings.tsx
@@ -1,7 +1,6 @@
'use client';
import { Label } from '@/components/ui/label';
-import { Input } from '@/components/ui/input';
import { Checkbox } from '@/components/ui/checkbox';
import { AlertCircle, User, Users, Sparkles, Info } from 'lucide-react';
import { cn } from '@/lib/utils';
@@ -20,20 +19,16 @@ interface Agent {
interface AgentSettingsProps {
agents: Agent[];
selectedAgentIds: string[];
- maxTurns: string;
agentMode: 'preset' | 'auto';
onToggleAgent: (agentId: string) => void;
- onMaxTurnsChange: (value: string) => void;
onAgentModeChange: (mode: 'preset' | 'auto') => void;
}
export function AgentSettings({
agents,
selectedAgentIds,
- maxTurns,
agentMode,
onToggleAgent,
- onMaxTurnsChange,
onAgentModeChange,
}: AgentSettingsProps) {
const { t } = useI18n();
@@ -165,22 +160,6 @@ export function AgentSettings({
)}
-
- {/* Max turns config - only show for multi-agent */}
- {selectedAgentIds.length > 1 && (
-
-
-
{t('settings.maxTurnsDesc')}
-
onMaxTurnsChange(e.target.value)}
- className="w-24"
- />
-
- )}
>
) : (
<>
diff --git a/eval/orchestration/answering-runner.ts b/eval/orchestration/answering-runner.ts
new file mode 100644
index 0000000000..5f3d85dd6a
--- /dev/null
+++ b/eval/orchestration/answering-runner.ts
@@ -0,0 +1,408 @@
+/**
+ * Director Question-Answering Eval (#598 / #511 follow-up)
+ *
+ * Tests whether the director routes correctly when the conversation contains
+ * an unanswered user question. The bug observed in production: when agents
+ * have drifted off-topic — whether the user has expressed frustration yet
+ * or not — the director keeps picking peer agents for "variety" instead of
+ * routing to the teacher to actually answer the literal question.
+ *
+ * Scenarios cover both shapes:
+ * - first-turn drift, no frustration yet (the root case)
+ * - escalated frustration after multiple complaints (the recovery case)
+ *
+ * Per-decision classification (deterministic, no LLM judge):
+ * - TEACHER → ✓ correct (teacher answers, or asks a clarifying question
+ * when the user's message is too vague)
+ * - USER → ✗ wrong (cue_user makes no agent speak — the user faces
+ * dead air; the teacher should ask the clarifying question)
+ * - OTHER_AGENT → ✗ wrong (peer-agent "variety" routing)
+ * - END → ✗ wrong
+ *
+ * A/B:
+ * - baseline : current director template with rule 13 stripped
+ * - with_rule : current director template as-shipped (rule 13 in place)
+ *
+ * Pass criterion: with_rule.correctRate ≥ EVAL_PASS_THRESHOLD (default 0.7).
+ * The pre-vs-post Δ is reported as informational only — scenarios where the
+ * baseline already routes correctly shouldn't fail just because there is no
+ * room to lift.
+ *
+ * Required env:
+ * EVAL_DIRECTOR_MODEL Director model to evaluate (falls back to DEFAULT_MODEL when unset).
+ *
+ * Optional env:
+ * EVAL_SAMPLES Samples per (scenario, variant). Default 5.
+ * EVAL_PASS_THRESHOLD Min with_rule correct rate per scenario. Default 0.7.
+ * EVAL_SCENARIO Filter to a single scenario by case_id.
+ *
+ * Output: eval/orchestration/results-answering/<run-dir>/report.md (run-dir is created by createRunDir from the model string)
+ */
+
+import fs from 'fs';
+import path from 'path';
+import { fileURLToPath } from 'url';
+import { callLLM } from '@/lib/ai/llm';
+import { parseDirectorDecision } from '@/lib/orchestration/director-prompt';
+import {
+ summarizeConversation,
+ type OpenAIMessage,
+} from '@/lib/orchestration/summarizers/conversation-summary';
+import {
+ processSnippets,
+ processConditionalBlocks,
+ interpolateVariables,
+} from '@/lib/prompts/loader';
+import { resolveEvalModel } from '../shared/resolve-model';
+import { createRunDir } from '../shared/run-dir';
+import type { AgentTurnSummary } from '@/lib/orchestration/types';
+import type { ScenarioAgent } from './types';
+
+// Root directory for this eval's output; main() passes it to createRunDir to get the per-run directory.
+const OUTPUT_DIR = 'eval/orchestration/results-answering';
+
+// ==================== Types ====================
+
+/** One eval case loaded from scenarios/answering.json: a conversation state for the director to route. */
+interface AnsweringScenario {
+ // Scenario id; matched against EVAL_SCENARIO and used as the label in reports.
+ case_id: string;
+ // Free-text description echoed in the report's detail section.
+ description: string;
+ // Agents rendered into the prompt's agent list (id, name, role, priority).
+ agents: ScenarioAgent[];
+ // Agent id that counts as the correct (TEACHER) routing decision.
+ teacherAgentId: string;
+ // Conversation history; condensed by summarizeConversation before prompting.
+ messages: OpenAIMessage[];
+ // Prior agent responses; rendered into the prompt's responded list ('None yet.' when empty).
+ agentResponses: AgentTurnSummary[];
+ // Turn counter; the prompt receives turnCount + 1.
+ turnCount: number;
+ // Selects the OPEN/CLOSED whiteboard text in the prompt; unset is treated as closed.
+ whiteboardOpen?: boolean;
+}
+
+// Prompt variant under test: 'baseline' has rule 13 stripped, 'with_rule' is the template as read from disk.
+type Variant = 'baseline' | 'with_rule';
+// Outcome of one director sample. 'ERROR' marks a failed LLM call, not a routing decision.
+type DecisionClass = 'USER' | 'TEACHER' | 'OTHER_AGENT' | 'END' | 'ERROR';
+
+/** Result of a single director call for one (scenario, variant) pair. */
+interface SampleResult {
+ // Which prompt variant produced this sample.
+ variant: Variant;
+ // Raw LLM text; '' when the call threw.
+ raw: string;
+ // Routing class derived from the parsed decision, or 'ERROR' when the call threw.
+ classification: DecisionClass;
+ // Agent id chosen by the director ('USER' for a user cue); null for END and for errors.
+ rawAgentId: string | null;
+ // Error message captured when the LLM call threw; absent on success.
+ error?: string;
+}
+
+/**
+ * Aggregated outcome for one scenario across both variants.
+ * NOTE(review): `rates` is typed as bare `Record` below — the type arguments
+ * appear to have been lost in extraction (emptyRates() builds an object keyed
+ * by DecisionClass with numeric values); confirm against the real file.
+ */
+interface ScenarioResult {
+ case_id: string;
+ description: string;
+ // Requested samples per variant (the EVAL_SAMPLES value), not the count of successful calls.
+ samples: number;
+ baseline: { samples: SampleResult[]; rates: Record; correctRate: number };
+ withRule: { samples: SampleResult[]; rates: Record; correctRate: number };
+ // with_rule correctRate minus baseline correctRate; informational only.
+ delta: number;
+ // True when the with_rule correctRate meets the pass threshold.
+ passes: boolean;
+}
+
+// ==================== Prompt building ====================
+
+/**
+ * Load the director system template from
+ * `lib/prompts/templates/director/system.md`, resolved against the current
+ * working directory, and return it with surrounding whitespace trimmed.
+ */
+function readDirectorTemplate(): string {
+  const templatePath = path.join(
+    process.cwd(),
+    'lib',
+    'prompts',
+    'templates',
+    'director',
+    'system.md',
+  );
+  const contents = fs.readFileSync(templatePath, 'utf-8');
+  return contents.trim();
+}
+
+/**
+ * Return the director template with rule 13 (the question-answering rule)
+ * removed, producing the pre-rule baseline for the A/B comparison.
+ *
+ * The rule is located by its number at the start of a line (`13. **`), not by
+ * its heading text, because the heading is reworded often. The removed span
+ * runs from there to whichever comes first: the next top-level numbered rule,
+ * the next `# ` section header after a blank line, or the end of the template.
+ * Indented continuation lines belong to rule 13 and are removed with it; a
+ * later rule (14+) is preserved instead of being swallowed by the match.
+ *
+ * @param template Raw director system template.
+ * @returns The template without rule 13, with runs of 3+ newlines collapsed to a single blank line.
+ * @throws Error when no line starting with `13. **` exists, since no baseline can be constructed.
+ */
+function withoutAnsweringRule(template: string): string {
+  // `(?![\s\S])` is an end-of-input assertion; `$` cannot be used because the
+  // `m` flag makes it match at every line end, which would stop the lazy match
+  // after the rule's first line.
+  const stripped = template.replace(/^13\. \*\*[\s\S]*?(?=\n\d+\. |\n\n# |(?![\s\S]))/m, '');
+  if (stripped === template) {
+    throw new Error(
+      'answering-runner: rule 13 not found in director template; eval baseline cannot be constructed',
+    );
+  }
+  return stripped.replace(/\n{3,}/g, '\n\n');
+}
+
+/**
+ * Render a director system prompt from a raw template and a scenario.
+ *
+ * Builds the template variables (agent list, responded list, conversation
+ * summary, rule 1 text, turn number, whiteboard state) and runs the template
+ * through processSnippets → processConditionalBlocks → interpolateVariables.
+ * The discussion, whiteboard and student-profile sections are always empty
+ * here, and rule 1 is always the teacher-first wording.
+ *
+ * NOTE(review): `vars` is typed as bare `Record` — its type arguments appear
+ * to have been lost in extraction; confirm against the real file.
+ */
+function buildPromptFromTemplate(
+ template: string,
+ scenario: AnsweringScenario,
+ conversationSummary: string,
+): string {
+ // One bullet per agent the director may pick.
+ const agentList = scenario.agents
+ .map((a) => `- id: "${a.id}", name: "${a.name}", role: ${a.role}, priority: ${a.priority}`)
+ .join('\n');
+
+ // One bullet per prior agent response, or a placeholder when there are none.
+ const respondedList =
+ scenario.agentResponses.length > 0
+ ? scenario.agentResponses
+ .map(
+ (r) =>
+ `- ${r.agentName} (${r.agentId}): "${r.contentPreview}" [${r.actionCount} actions]`,
+ )
+ .join('\n')
+ : 'None yet.';
+
+ const rule1 =
+ "1. The teacher (role: teacher, highest priority) should usually speak first to address the user's question or topic.";
+
+ const vars: Record = {
+ agentList,
+ respondedList,
+ conversationSummary,
+ discussionSection: '',
+ whiteboardSection: '',
+ studentProfileSection: '',
+ rule1,
+ turnCountPlusOne: scenario.turnCount + 1,
+ whiteboardOpenText: scenario.whiteboardOpen
+ ? 'OPEN (slide canvas is hidden — spotlight/laser will not work)'
+ : 'CLOSED (slide canvas is visible)',
+ };
+
+ const withSnippets = processSnippets(template);
+ const withConditionals = processConditionalBlocks(withSnippets, vars);
+ return interpolateVariables(withConditionals, vars);
+}
+
+/**
+ * Produce both system prompts for a scenario from the on-disk director
+ * template: `baseline` with rule 13 stripped and `with_rule` unmodified.
+ * Both variants share the same conversation summary.
+ */
+function buildVariants(scenario: AnsweringScenario): { baseline: string; with_rule: string } {
+  const template = readDirectorTemplate();
+  const conversationSummary = summarizeConversation(scenario.messages);
+  const baseline = buildPromptFromTemplate(
+    withoutAnsweringRule(template),
+    scenario,
+    conversationSummary,
+  );
+  const with_rule = buildPromptFromTemplate(template, scenario, conversationSummary);
+  return { baseline, with_rule };
+}
+
+// ==================== Classifier ====================
+
+/**
+ * Map a raw director response onto a routing class for the given scenario.
+ *
+ * - END: the parsed decision ends the round or names no next agent.
+ * - USER: the next agent id is the literal 'USER'.
+ * - TEACHER: the next agent id equals the scenario's teacherAgentId.
+ * - OTHER_AGENT: any other agent id.
+ *
+ * `rawAgentId` carries the parsed agent id, or null for END.
+ */
+function classify(
+  raw: string,
+  scenario: AnsweringScenario,
+): {
+  classification: DecisionClass;
+  rawAgentId: string | null;
+} {
+  const { shouldEnd, nextAgentId } = parseDirectorDecision(raw);
+  if (shouldEnd || !nextAgentId) {
+    return { classification: 'END', rawAgentId: null };
+  }
+
+  let classification: DecisionClass;
+  if (nextAgentId === 'USER') {
+    classification = 'USER';
+  } else if (nextAgentId === scenario.teacherAgentId) {
+    classification = 'TEACHER';
+  } else {
+    classification = 'OTHER_AGENT';
+  }
+  return { classification, rawAgentId: nextAgentId };
+}
+
+// Zeroed tally with one slot per DecisionClass; computeRates fills it with counts and then normalizes.
+// NOTE(review): the return annotation reads bare `Record` — type arguments appear lost in extraction; confirm against the real file.
+function emptyRates(): Record {
+ return { USER: 0, TEACHER: 0, OTHER_AGENT: 0, END: 0, ERROR: 0 };
+}
+
+/**
+ * Turn a list of samples into per-class rates and the scenario's correct rate.
+ *
+ * USER / TEACHER / OTHER_AGENT / END are fractions of the usable (non-errored)
+ * samples, so provider failures do not dilute the routing rates. ERROR is
+ * reported separately as a fraction of all samples. When every sample errored
+ * the routing rates are all 0; when `samples` is empty every rate is 0.
+ *
+ * @param samples Samples for one (scenario, variant) pair.
+ * @returns `rates` per DecisionClass and `correctRate`, which is the TEACHER rate.
+ */
+function computeRates(samples: SampleResult[]): {
+  rates: Record<DecisionClass, number>;
+  correctRate: number;
+} {
+  const rates = emptyRates();
+  // Detect failures by classification rather than truthiness of `error`: an
+  // Error with an empty message is recorded as error === '', which is falsy
+  // and would otherwise be counted as a usable sample.
+  const usable = samples.filter((s) => s.classification !== 'ERROR');
+  for (const s of usable) rates[s.classification]++;
+  // Avoid 0/0 when nothing is usable; the counts are all zero in that case.
+  const total = usable.length || 1;
+  for (const k of Object.keys(rates) as DecisionClass[]) {
+    rates[k] = rates[k] / total;
+  }
+  rates.ERROR = samples.length > 0 ? (samples.length - usable.length) / samples.length : 0;
+  // Only TEACHER is correct: the teacher answers, or asks a clarifying question
+  // for vague input. USER cue is dead air (no agent speaks); peer/END are wrong.
+  const correctRate = rates.TEACHER;
+  return { rates, correctRate };
+}
+
+// ==================== Sampling ====================
+
+/**
+ * Call the director `samples` times with the given system prompt and classify
+ * each response. All calls are started together and awaited with Promise.all.
+ * A call that throws does not reject the batch: it is recorded as a sample
+ * with classification 'ERROR', empty `raw`, and the error message.
+ *
+ * NOTE(review): the `model` parameter type and the `Promise` return types
+ * below are missing their type arguments — apparently lost in extraction;
+ * confirm against the real file.
+ */
+async function sampleVariant(
+ scenario: AnsweringScenario,
+ variant: Variant,
+ systemPrompt: string,
+ model: Awaited>['model'],
+ samples: number,
+): Promise {
+ const tasks = Array.from({ length: samples }, async (): Promise => {
+ try {
+ const result = await callLLM(
+ {
+ model,
+ messages: [
+ { role: 'system', content: systemPrompt },
+ { role: 'user', content: 'Decide which agent should speak next.' },
+ ],
+ },
+ 'eval-orchestration-answering',
+ );
+ const raw = result.text;
+ const { classification, rawAgentId } = classify(raw, scenario);
+ return { variant, raw, classification, rawAgentId };
+ } catch (err) {
+ // Non-Error throwables are stringified so the report always has a message.
+ const msg = err instanceof Error ? err.message : String(err);
+ return {
+ variant,
+ raw: '',
+ classification: 'ERROR',
+ rawAgentId: null,
+ error: msg,
+ };
+ }
+ });
+ return Promise.all(tasks);
+}
+
+// ==================== Reporting ====================
+
+/** Format a fraction (e.g. 0.7) as a whole-number percent string (e.g. "70%"). */
+function pct(x: number): string {
+  const wholePercent = Math.round(x * 100);
+  return wholePercent + '%';
+}
+
+/**
+ * Render the markdown report for a run and write it to `<runDir>/report.md`.
+ *
+ * Sections: run metadata, aggregate mean correct rates per variant, a
+ * per-scenario table of class rates, and a per-scenario detail list of every
+ * sample. The overall verdict is PASS only when every scenario passes.
+ *
+ * NOTE(review): with an empty `results` array both means are 0/0 (rendered
+ * as "NaN%") and the verdict is vacuously PASS — callers should not pass an
+ * empty list.
+ *
+ * @returns The path of the written report.
+ */
+function writeReport(
+ runDir: string,
+ results: ScenarioResult[],
+ modelStr: string,
+ samples: number,
+ threshold: number,
+): string {
+ const lines: string[] = [];
+ const overallPass = results.every((r) => r.passes);
+ const meanBaseline = results.reduce((acc, r) => acc + r.baseline.correctRate, 0) / results.length;
+ const meanWithRule = results.reduce((acc, r) => acc + r.withRule.correctRate, 0) / results.length;
+
+ lines.push(`# Director Question-Answering Eval`, ``);
+ lines.push(`- **Date**: ${new Date().toISOString()}`);
+ lines.push(`- **Model**: ${modelStr}`);
+ lines.push(`- **Samples per (scenario, variant)**: ${samples}`);
+ lines.push(`- **with_rule correct-rate threshold**: ${pct(threshold)}`);
+ lines.push(`- **Δ (pre vs post)**: informational — PASS depends only on with_rule rate`);
+ lines.push(``);
+ lines.push(`## Aggregate`);
+ lines.push(``);
+ lines.push(`| Variant | Mean correct rate (TEACHER) |`);
+ lines.push(`|---|---|`);
+ lines.push(`| baseline | ${pct(meanBaseline)} |`);
+ lines.push(`| with_rule | ${pct(meanWithRule)} |`);
+ lines.push(`| Δ | ${pct(meanWithRule - meanBaseline)} |`);
+ lines.push(``);
+ lines.push(`Overall verdict: **${overallPass ? 'PASS' : 'FAIL'}**`);
+ lines.push(``);
+
+ lines.push(`## Per scenario`);
+ lines.push(``);
+ lines.push(
+ `| # | Scenario | Baseline USER% TEACHER% OTHER% END% | with_rule USER% TEACHER% OTHER% END% | Δ correct | pass? |`,
+ );
+ lines.push(`|---|---|---|---|---|---|`);
+ results.forEach((r, i) => {
+ const b = r.baseline.rates;
+ const w = r.withRule.rates;
+ // Slash-separated rates in the column order named by the table header.
+ const bStr = `${pct(b.USER)}/${pct(b.TEACHER)}/${pct(b.OTHER_AGENT)}/${pct(b.END)}`;
+ const wStr = `${pct(w.USER)}/${pct(w.TEACHER)}/${pct(w.OTHER_AGENT)}/${pct(w.END)}`;
+ lines.push(
+ `| ${i + 1} | ${r.case_id} | ${bStr} | ${wStr} | ${pct(r.delta)} | ${r.passes ? '✓' : '✗'} |`,
+ );
+ });
+ lines.push(``);
+
+ lines.push(`## Detail`);
+ for (const r of results) {
+ lines.push(``, `### ${r.case_id} ${r.passes ? '✓' : '✗'}`, ``);
+ lines.push(`- ${r.description}`);
+ lines.push(
+ `- Baseline correct: ${pct(r.baseline.correctRate)}; with_rule correct: ${pct(r.withRule.correctRate)}; Δ: ${pct(r.delta)}`,
+ );
+ lines.push(``);
+ // NOTE(review): the "samples" pushes below look like collapsible-section
+ // markup whose HTML tags were lost in extraction — confirm against the real file.
+ lines.push(`baseline samples
`, ``);
+ for (const s of r.baseline.samples) {
+ // The chosen agent id is shown only for OTHER_AGENT, where it is not implied by the class.
+ const label = s.error
+ ? `ERROR: ${s.error}`
+ : `${s.classification}${s.rawAgentId && s.classification === 'OTHER_AGENT' ? ` (${s.rawAgentId})` : ''}`;
+ lines.push(`- ${label}`);
+ }
+ lines.push(``, ` `, ``);
+ lines.push(`with_rule samples
`, ``);
+ for (const s of r.withRule.samples) {
+ const label = s.error
+ ? `ERROR: ${s.error}`
+ : `${s.classification}${s.rawAgentId && s.classification === 'OTHER_AGENT' ? ` (${s.rawAgentId})` : ''}`;
+ lines.push(`- ${label}`);
+ }
+ lines.push(``, ` `, ``);
+ }
+
+ const reportPath = path.join(runDir, 'report.md');
+ fs.writeFileSync(reportPath, lines.join('\n'));
+ return reportPath;
+}
+
+// ==================== Main ====================
+
+// Directory containing this file: uses `__dirname` when it is defined, otherwise derives it from `import.meta.url`.
+function getCurrentDir(): string {
+ return typeof __dirname !== 'undefined'
+ ? __dirname
+ : path.dirname(fileURLToPath(import.meta.url));
+}
+
+/**
+ * Read `scenarios/answering.json` next to this file and return its scenarios.
+ * When EVAL_SCENARIO is set, only scenarios whose case_id equals it are
+ * returned (possibly none). The JSON is cast, not validated.
+ */
+function loadScenarios(): AnsweringScenario[] {
+  const scenariosPath = path.join(getCurrentDir(), 'scenarios/answering.json');
+  const all = JSON.parse(fs.readFileSync(scenariosPath, 'utf-8')) as AnsweringScenario[];
+  const wanted = process.env.EVAL_SCENARIO;
+  if (!wanted) {
+    return all;
+  }
+  return all.filter((s) => s.case_id === wanted);
+}
+
+/**
+ * Entry point: run the question-answering eval and exit with the verdict.
+ *
+ * Reads configuration from the environment, samples both prompt variants for
+ * every scenario, writes the report, and exits 0 when every scenario's
+ * with_rule correct rate meets the threshold, 1 otherwise.
+ *
+ * Configuration errors exit 1 before any model call or output directory is
+ * created:
+ *  - no model configured (EVAL_DIRECTOR_MODEL, falling back to DEFAULT_MODEL);
+ *  - EVAL_SAMPLES is not a positive integer;
+ *  - EVAL_PASS_THRESHOLD is not a number in [0, 1];
+ *  - no scenario was loaded (e.g. EVAL_SCENARIO matches nothing), which would
+ *    otherwise yield a vacuous PASS because `[].every(...)` is true.
+ */
+async function main() {
+  const modelStr = process.env.EVAL_DIRECTOR_MODEL || process.env.DEFAULT_MODEL;
+  if (!modelStr) {
+    console.error(
+      'Error: EVAL_DIRECTOR_MODEL (or DEFAULT_MODEL) must be set. Example: EVAL_DIRECTOR_MODEL=google:gemini-3-flash-preview',
+    );
+    process.exit(1);
+  }
+  const samples = Number(process.env.EVAL_SAMPLES || '5');
+  if (!Number.isInteger(samples) || samples < 1) {
+    console.error(
+      `Error: EVAL_SAMPLES must be a positive integer (got "${process.env.EVAL_SAMPLES}")`,
+    );
+    process.exit(1);
+  }
+  const threshold = Number(process.env.EVAL_PASS_THRESHOLD || '0.7');
+  if (!Number.isFinite(threshold) || threshold < 0 || threshold > 1) {
+    console.error(
+      `Error: EVAL_PASS_THRESHOLD must be a number between 0 and 1 (got "${process.env.EVAL_PASS_THRESHOLD}")`,
+    );
+    process.exit(1);
+  }
+
+  console.log('=== Director Question-Answering Eval ===');
+  console.log(`Model: ${modelStr} | Samples/variant: ${samples} | pass threshold: ${threshold}`);
+
+  const { model } = await resolveEvalModel('EVAL_DIRECTOR_MODEL', process.env.DEFAULT_MODEL);
+  const scenarios = loadScenarios();
+  console.log(`Loaded ${scenarios.length} scenario(s)`);
+  if (scenarios.length === 0) {
+    const filter = process.env.EVAL_SCENARIO;
+    console.error(
+      filter
+        ? `Error: no scenario matches EVAL_SCENARIO="${filter}"`
+        : 'Error: no scenarios loaded',
+    );
+    process.exit(1);
+  }
+  const runDir = createRunDir(OUTPUT_DIR, modelStr);
+  console.log(`Output: ${runDir}`);
+
+  const results: ScenarioResult[] = [];
+  // Scenarios run one at a time; within a scenario both variants are sampled concurrently.
+  for (const sc of scenarios) {
+    process.stdout.write(` - ${sc.case_id} ... `);
+    const variants = buildVariants(sc);
+    const [bs, ws] = await Promise.all([
+      sampleVariant(sc, 'baseline', variants.baseline, model, samples),
+      sampleVariant(sc, 'with_rule', variants.with_rule, model, samples),
+    ]);
+    const bAgg = computeRates(bs);
+    const wAgg = computeRates(ws);
+    const lift = wAgg.correctRate - bAgg.correctRate;
+    // PASS depends only on the with_rule rate; the lift is informational.
+    const passes = wAgg.correctRate >= threshold;
+    results.push({
+      case_id: sc.case_id,
+      description: sc.description,
+      samples,
+      baseline: { samples: bs, rates: bAgg.rates, correctRate: bAgg.correctRate },
+      withRule: { samples: ws, rates: wAgg.rates, correctRate: wAgg.correctRate },
+      delta: lift,
+      passes,
+    });
+    console.log(
+      `baseline=${pct(bAgg.correctRate)} with_rule=${pct(wAgg.correctRate)} Δ=${pct(lift)} ${passes ? 'PASS' : 'FAIL'}`,
+    );
+  }
+
+  const reportPath = writeReport(runDir, results, modelStr, samples, threshold);
+  const overallPass = results.every((r) => r.passes);
+  console.log(`\nReport: ${reportPath}`);
+  console.log(`Verdict: ${overallPass ? 'PASS' : 'FAIL'}`);
+  process.exit(overallPass ? 0 : 1);
+}
+
+// Run the eval; any rejection escaping main() is logged and turned into exit code 1.
+main().catch((err) => {
+ console.error('Fatal error:', err);
+ process.exit(1);
+});
diff --git a/eval/orchestration/judge.ts b/eval/orchestration/judge.ts
new file mode 100644
index 0000000000..db7527b4d0
--- /dev/null
+++ b/eval/orchestration/judge.ts
@@ -0,0 +1,35 @@
+/**
+ * Deterministic verdict for premature-END scenarios.
+ *
+ * The bug we are guarding against is "director picks END while a student
+ * question is unresolved". For each sample, the only thing that matters is
+ * whether the director chose END. No LLM-as-judge here — END/not-END is
+ * binary and reading parseDirectorDecision is sufficient.
+ */
+
+import { parseDirectorDecision } from '@/lib/orchestration/director-prompt';
+
+/** Classification of one director response for the premature-END eval. */
+export interface ParsedSample {
+ // 'END', or the agent id the director chose (which may be the literal 'USER').
+ // NOTE(review): the union with `string` widens this type to plain `string`; the literals are documentation only.
+ decision: 'END' | 'USER' | string;
+ // True exactly when `decision` is 'END' as produced by classifyDecision.
+ isEnd: boolean;
+}
+
+/**
+ * Reduce a raw director response to END / not-END.
+ * A response counts as END when the parsed decision says to end or names no
+ * next agent; otherwise `decision` is the chosen agent id.
+ */
+export function classifyDecision(raw: string): ParsedSample {
+  const { shouldEnd, nextAgentId } = parseDirectorDecision(raw);
+  const isEnd = shouldEnd || !nextAgentId;
+  return isEnd ? { decision: 'END', isEnd: true } : { decision: nextAgentId, isEnd: false };
+}
+
+/**
+ * Fraction of usable samples in which the director chose END.
+ *
+ * Samples carrying a truthy `error` are skipped so provider failures
+ * (e.g. 'Forbidden') are not counted as END decisions. Yields 0 when no
+ * sample is usable.
+ */
+export function endRate(samples: { isEnd: boolean; error?: string }[]): number {
+  let usableCount = 0;
+  let endCount = 0;
+  for (const sample of samples) {
+    if (sample.error) continue;
+    usableCount += 1;
+    if (sample.isEnd) endCount += 1;
+  }
+  return usableCount === 0 ? 0 : endCount / usableCount;
+}
diff --git a/eval/orchestration/prompt-variants.ts b/eval/orchestration/prompt-variants.ts
new file mode 100644
index 0000000000..90b07175d8
--- /dev/null
+++ b/eval/orchestration/prompt-variants.ts
@@ -0,0 +1,182 @@
+/**
+ * Build director system prompts for both the "post-fix" (current main) and
+ * "pre-fix" (rules 10/11/12 removed) variants, so the eval can A/B them on
+ * the same conversation context.
+ *
+ * Rules 10/11/12 are the prompt-layer guardrails added by #554. The pre-fix
+ * variant mimics main^ by dropping them from the # Rules section.
+ *
+ * We avoid the public `buildDirectorPrompt()` because it always loads the
+ * current template. Here we read the template directly, optionally edit it,
+ * then run the same processSnippets → interpolateVariables pipeline.
+ */
+
+import fs from 'fs';
+import path from 'path';
+import {
+ processSnippets,
+ processConditionalBlocks,
+ interpolateVariables,
+} from '@/lib/prompts/loader';
+import type { OpenAIMessage } from '@/lib/orchestration/summarizers/conversation-summary';
+import { summarizeConversation } from '@/lib/orchestration/summarizers/conversation-summary';
+import type { ScenarioAgent } from './types';
+import type { AgentTurnSummary } from '@/lib/orchestration/types';
+
+/**
+ * Rule numbers introduced by #554 that the pre-fix variant must strip.
+ * stripFixRules throws if any of these is absent from the template.
+ */
+const FIX_RULE_NUMBERS = [10, 11, 12] as const;
+
+/**
+ * Load the director system template from
+ * `lib/prompts/templates/director/system.md`, resolved against the current
+ * working directory, and return it with surrounding whitespace trimmed.
+ */
+function readDirectorTemplate(): string {
+  const templatePath = path.join(
+    process.cwd(),
+    'lib',
+    'prompts',
+    'templates',
+    'director',
+    'system.md',
+  );
+  const contents = fs.readFileSync(templatePath, 'utf-8');
+  return contents.trim();
+}
+
+/**
+ * Strip rules 10/11/12 from the # Rules section. Each rule is a single line
+ * in the current template; we match by leading `^(10|11|12)\.\s` and drop
+ * the whole line. Throws if any expected rule is missing so a template
+ * rewrite forces us to revisit this eval.
+ *
+ * Only the line that starts with the rule number is dropped. NOTE(review):
+ * if one of these rules ever gains continuation lines, those lines would be
+ * left behind in the pre-fix template — revisit if the rules become multi-line.
+ * The match is not limited to the # Rules section: any line in the template
+ * that starts with `10. `, `11. ` or `12. ` is removed.
+ *
+ * @param template Raw director system template.
+ * @returns The template with the matching rule lines removed.
+ * @throws Error naming the first rule number in FIX_RULE_NUMBERS that was not found.
+ */
+export function stripFixRules(template: string): string {
+ const lines = template.split('\n');
+ const kept: string[] = [];
+ // Rule numbers actually removed, checked against FIX_RULE_NUMBERS below.
+ const dropped = new Set();
+ for (const line of lines) {
+ // A numbered rule starts at column 0: digits, a dot, then whitespace.
+ const m = line.match(/^(\d+)\.\s/);
+ if (m) {
+ const n = Number(m[1]);
+ if ((FIX_RULE_NUMBERS as readonly number[]).includes(n)) {
+ dropped.add(n);
+ continue;
+ }
+ }
+ kept.push(line);
+ }
+ for (const n of FIX_RULE_NUMBERS) {
+ if (!dropped.has(n)) {
+ throw new Error(
+ `prompt-variants: expected rule ${n} to exist in director/system.md; template may have been rewritten — update FIX_RULE_NUMBERS or this eval.`,
+ );
+ }
+ }
+ return kept.join('\n');
+}
+
+/** Inputs needed to render a director prompt for one scenario. */
+export interface BuildArgs {
+ // Agents rendered into the prompt's agent list (id, name, role, priority).
+ agents: ScenarioAgent[];
+ // Conversation history; summarized separately for the pre-fix and post-fix variants.
+ messages: OpenAIMessage[];
+ // Prior agent responses; rendered into the responded list ('None yet.' when empty).
+ agentResponses: AgentTurnSummary[];
+ // Turn counter; the prompt receives turnCount + 1.
+ turnCount: number;
+ // When set (non-null), adds the Discussion Mode section and switches rule 1 to the discussion wording.
+ discussionContext?: { topic: string; prompt?: string } | null;
+ // Initiator id, mentioned in the discussion section and rule 1 when discussionContext is set.
+ triggerAgentId?: string | null;
+ // Adds the Student Profile section when a nickname or bio is present.
+ userProfile?: { nickname?: string; bio?: string };
+ // Selects the OPEN/CLOSED whiteboard text; unset is treated as closed.
+ whiteboardOpen?: boolean;
+}
+
+/**
+ * Conversation summary as it was produced before #554: every message is
+ * labelled by role only ([User] / [Assistant] / [System]) and no
+ * `[senderName]:` prefix is stripped. The pre-fix variant uses this so the
+ * A/B covers both halves of #554 — the role-aware summary and the new rules.
+ *
+ * Only the last `maxMessages` messages are included, and content longer than
+ * `maxContentLength` is cut and suffixed with '...'.
+ */
+function summarizeConversationPreFix(
+  messages: OpenAIMessage[],
+  maxMessages = 10,
+  maxContentLength = 200,
+): string {
+  if (messages.length === 0) return 'No conversation history yet.';
+
+  const rendered: string[] = [];
+  for (const msg of messages.slice(-maxMessages)) {
+    let roleLabel: string;
+    if (msg.role === 'user') {
+      roleLabel = 'User';
+    } else if (msg.role === 'assistant') {
+      roleLabel = 'Assistant';
+    } else {
+      roleLabel = 'System';
+    }
+
+    let content = msg.content;
+    if (content.length > maxContentLength) {
+      content = content.slice(0, maxContentLength) + '...';
+    }
+    rendered.push(`[${roleLabel}] ${content}`);
+  }
+  return rendered.join('\n');
+}
+
+/**
+ * Mirrors lib/orchestration/director-prompt.ts `buildDirectorPrompt()` shape
+ * but lets us inject a pre-stripped template. Kept in sync with that file —
+ * if you change variable names there, change them here.
+ *
+ * Builds the template variables from `args` and runs the template through
+ * processSnippets → processConditionalBlocks → interpolateVariables. The
+ * whiteboard section is always empty here.
+ *
+ * NOTE(review): `vars` is typed as bare `Record` — its type arguments appear
+ * to have been lost in extraction; confirm against the real file.
+ */
+function buildPromptFromTemplate(
+ template: string,
+ args: BuildArgs,
+ conversationSummary: string,
+): string {
+ const {
+ agents,
+ agentResponses,
+ turnCount,
+ discussionContext,
+ triggerAgentId,
+ userProfile,
+ whiteboardOpen,
+ } = args;
+
+ // One bullet per agent the director may pick.
+ const agentList = agents
+ .map((a) => `- id: "${a.id}", name: "${a.name}", role: ${a.role}, priority: ${a.priority}`)
+ .join('\n');
+
+ // One bullet per prior agent response, or a placeholder when there are none.
+ const respondedList =
+ agentResponses.length > 0
+ ? agentResponses
+ .map(
+ (r) =>
+ `- ${r.agentName} (${r.agentId}): "${r.contentPreview}" [${r.actionCount} actions]`,
+ )
+ .join('\n')
+ : 'None yet.';
+
+ // The non-null assertions on discussionContext below sit in the branch taken only when isDiscussion is true.
+ const isDiscussion = !!discussionContext;
+ const discussionSection = isDiscussion
+ ? `\n# Discussion Mode\nTopic: "${discussionContext!.topic}"${discussionContext!.prompt ? `\nPrompt: "${discussionContext!.prompt}"` : ''}${triggerAgentId ? `\nInitiator: "${triggerAgentId}"` : ''}\nThis is a student-initiated discussion, not a Q&A session.\n`
+ : '';
+
+ // Rule 1 wording depends on the mode: initiator-first for discussions, teacher-first otherwise.
+ const rule1 = isDiscussion
+ ? `1. The discussion initiator${triggerAgentId ? ` ("${triggerAgentId}")` : ''} should speak first to kick off the topic. Then the teacher responds to guide the discussion. After that, other students may add their perspectives.`
+ : "1. The teacher (role: teacher, highest priority) should usually speak first to address the user's question or topic.";
+
+ // Included only when a nickname or bio is present; a missing nickname renders as 'Unknown'.
+ const studentProfileSection =
+ userProfile?.nickname || userProfile?.bio
+ ? `\n# Student Profile\nStudent name: ${userProfile.nickname || 'Unknown'}\n${userProfile.bio ? `Background: ${userProfile.bio}` : ''}\n`
+ : '';
+
+ const vars: Record = {
+ agentList,
+ respondedList,
+ conversationSummary,
+ discussionSection,
+ whiteboardSection: '',
+ studentProfileSection,
+ rule1,
+ turnCountPlusOne: turnCount + 1,
+ whiteboardOpenText: whiteboardOpen
+ ? 'OPEN (slide canvas is hidden — spotlight/laser will not work)'
+ : 'CLOSED (slide canvas is visible)',
+ };
+
+ const withSnippets = processSnippets(template);
+ const withConditionals = processConditionalBlocks(withSnippets, vars);
+ return interpolateVariables(withConditionals, vars);
+}
+
+/**
+ * Render the pre-fix and post-fix director prompts for the same scenario.
+ *
+ * post-fix = the on-disk template with the current conversation summary.
+ * pre-fix  = the template without rules 10/11/12, combined with the old
+ *            [User]/[Assistant] summary labels — together the full state of
+ *            main^ relative to #554.
+ */
+export function buildVariants(args: BuildArgs): { preFix: string; postFix: string } {
+  const postTemplate = readDirectorTemplate();
+  const preTemplate = stripFixRules(postTemplate);
+  const postSummary = summarizeConversation(args.messages);
+  const preSummary = summarizeConversationPreFix(args.messages);
+  const preFix = buildPromptFromTemplate(preTemplate, args, preSummary);
+  const postFix = buildPromptFromTemplate(postTemplate, args, postSummary);
+  return { preFix, postFix };
+}
diff --git a/eval/orchestration/reporter.ts b/eval/orchestration/reporter.ts
new file mode 100644
index 0000000000..5365c6340b
--- /dev/null
+++ b/eval/orchestration/reporter.ts
@@ -0,0 +1,88 @@
+import { writeFileSync } from 'fs';
+import { join } from 'path';
+import { renderHeader, renderSummaryTable } from '../shared/markdown-report';
+import type { EvalReport } from './types';
+
+/**
+ * Format a 0..1 rate as a whole-number percentage, e.g. 0.25 -> "25%".
+ *
+ * Non-finite input (NaN / ±Infinity — for instance a rate computed over zero
+ * usable samples) is rendered as "n/a" so the report never shows "NaN%".
+ */
+function pct(rate: number): string {
+  if (!Number.isFinite(rate)) return 'n/a';
+  return `${Math.round(rate * 100)}%`;
+}
+
+/** Count the samples whose `error` field holds a non-empty message. */
+function countErrors(samples: { error?: string }[]): number {
+  let failed = 0;
+  for (const sample of samples) {
+    if (sample.error) failed += 1;
+  }
+  return failed;
+}
+
+/** Minimal view of a sample needed to print its decision line. */
+type DecisionLine = { error?: string; isEnd: boolean; decision: string };
+
+/**
+ * Render one variant's raw decisions as a collapsible `<details>` section.
+ * Each sample becomes one bullet: `ERROR: <message>` for failed calls, a bold
+ * END for END decisions, otherwise the decision string itself.
+ */
+function renderRawDecisions(title: string, samples: readonly DecisionLine[]): string[] {
+  const section: string[] = [`<details><summary>${title}</summary>`, ``];
+  for (const s of samples) {
+    const label = s.error ? `ERROR: ${s.error}` : s.isEnd ? '**END**' : s.decision;
+    section.push(`- ${label}`);
+  }
+  // Blank lines around the list keep the markdown list rendering inside <details>.
+  section.push(``, `</details>`, ``);
+  return section;
+}
+
+/**
+ * Write `report.md` summarising pre-fix vs post-fix END rates per scenario.
+ *
+ * Layout: a header block, a "Detail" section per scenario (rates, error
+ * counts, and collapsible raw decisions for each variant), then a "Summary"
+ * table with one row per scenario.
+ *
+ * @param runDir Directory the report is written into.
+ * @param report Aggregated eval results to render.
+ * @returns The path of the written report (`runDir` joined with `report.md`).
+ */
+export function writeReport(runDir: string, report: EvalReport): string {
+  const lines: string[] = [];
+  lines.push(
+    ...renderHeader({
+      title: 'Director Premature-END Regression Eval',
+      timestamp: new Date().toISOString(),
+      model: report.model,
+      extra: {
+        'Samples per variant': report.samplesPerVariant,
+        'Post-fix END threshold (regression guard)': pct(report.postFixEndThreshold),
+        'Discrimination threshold (Δ END-rate, informational)': pct(report.thresholdDelta),
+        Method:
+          'A/B director prompt + summary: post-fix = current main; pre-fix = pre-#554 [User]/[Assistant] summary labels AND system.md without rules 10/11/12',
+        'Post-fix regression guard (must hold)': report.allPostFixPass ? 'PASS' : 'FAIL',
+        'Any scenario discriminates? (informational)': report.anyDiscriminates ? 'YES' : 'NO',
+      },
+    }),
+  );
+
+  lines.push(`## Detail`, ``);
+  for (const r of report.results) {
+    const pass = r.postFixPasses ? 'PASS' : '**FAIL**';
+    const disc = r.discriminates ? ' (Δ ≥ threshold)' : '';
+    lines.push(`### ${pass}${disc} ${r.case_id}`, ``);
+    lines.push(`- **Description**: ${r.description}`);
+    const preErr = countErrors(r.preFix.samples);
+    const postErr = countErrors(r.postFix.samples);
+    lines.push(`- **Samples per variant**: ${r.samples} (rates exclude errored samples)`);
+    lines.push(
+      `- **Pre-fix END rate**: ${pct(r.preFix.endRate)}${preErr ? ` — ${preErr} error(s)` : ''}`,
+    );
+    lines.push(
+      `- **Post-fix END rate**: ${pct(r.postFix.endRate)}${postErr ? ` — ${postErr} error(s)` : ''}`,
+    );
+    lines.push(`- **Δ (pre − post)**: ${pct(r.delta)}`);
+    lines.push(``);
+    lines.push(...renderRawDecisions('Pre-fix raw decisions', r.preFix.samples));
+    lines.push(...renderRawDecisions('Post-fix raw decisions', r.postFix.samples));
+  }
+
+  lines.push(`## Summary`, ``);
+  const rows: string[][] = report.results.map((r, i) => [
+    String(i + 1),
+    r.case_id,
+    pct(r.preFix.endRate),
+    pct(r.postFix.endRate),
+    pct(r.delta),
+    r.postFixPasses ? 'PASS' : 'FAIL',
+    r.discriminates ? 'YES' : 'no',
+  ]);
+  lines.push(
+    ...renderSummaryTable(
+      ['#', 'Scenario', 'Pre-fix END', 'Post-fix END', 'Δ', 'Regression guard', 'Discriminates'],
+      rows,
+    ),
+  );
+
+  const reportPath = join(runDir, 'report.md');
+  writeFileSync(reportPath, lines.join('\n'));
+  return reportPath;
+}
diff --git a/eval/orchestration/runner.ts b/eval/orchestration/runner.ts
new file mode 100644
index 0000000000..8b79d8a841
--- /dev/null
+++ b/eval/orchestration/runner.ts
@@ -0,0 +1,193 @@
+/**
+ * Orchestration Premature-END Regression Eval
+ *
+ * For each scenario, builds the director system prompt twice:
+ * - "pre-fix" : current director/system.md with rules 10/11/12 removed, plus the pre-#554 [User]/[Assistant] summary labels
+ * - "post-fix" : current director/system.md as-shipped
+ * Calls the LLM N times per variant, parses each decision, and reports the
+ * END rate for both. A scenario "discriminates" when (pre − post) ≥ delta.
+ *
+ * Required env:
+ * EVAL_DIRECTOR_MODEL Model under test (or DEFAULT_MODEL fallback)
+ *
+ * Optional env:
+ * EVAL_SAMPLES Samples per (scenario, variant). Default 5.
+ * EVAL_DELTA Discrimination threshold for pre-vs-post Δ (0..1). Default 0.3.
+ * EVAL_END_THRESHOLD Max acceptable post-fix END rate per scenario (0..1). Default 0.2.
+ * EVAL_SCENARIO Filter to a single scenario by case_id.
+ *
+ * Usage:
+ * EVAL_DIRECTOR_MODEL=openai:gpt-4.1-mini pnpm eval:orchestration
+ *
+ * Output: report.md inside a per-run directory created under eval/orchestration/results/
+ *
+ * Exit code:
+ * 0 — every scenario's post-fix END rate is at or below EVAL_END_THRESHOLD
+ * (the regression guard holds for this model)
+ * 1 — some scenario's post-fix END rate exceeded the threshold
+ * (potential regression of #554's premature-END fix)
+ */
+
+import { readFileSync } from 'fs';
+import { join, dirname } from 'path';
+import { fileURLToPath } from 'url';
+import { callLLM } from '@/lib/ai/llm';
+import { resolveEvalModel } from '../shared/resolve-model';
+import { createRunDir } from '../shared/run-dir';
+import { classifyDecision, endRate } from './judge';
+import { buildVariants } from './prompt-variants';
+import { writeReport } from './reporter';
+import type { EvalReport, PromptVariant, SampleResult, Scenario, ScenarioResult } from './types';
+
+const OUTPUT_DIR = 'eval/orchestration/results';
+
+/**
+ * Directory containing this module: `__dirname` when that global is defined,
+ * otherwise derived from `import.meta.url`.
+ */
+function getCurrentDir(): string {
+  if (typeof __dirname !== 'undefined') {
+    return __dirname;
+  }
+  return dirname(fileURLToPath(import.meta.url));
+}
+
+/**
+ * Load the premature-END scenarios from `scenarios/premature-end.json`
+ * (resolved relative to this module). When EVAL_SCENARIO is set, only the
+ * scenario whose `case_id` equals it is returned.
+ *
+ * @throws If the file's top level is not a JSON array, or if EVAL_SCENARIO is
+ *   set but matches no `case_id`.
+ */
+function loadScenarios(): Scenario[] {
+  const path = join(getCurrentDir(), 'scenarios/premature-end.json');
+  const parsed: unknown = JSON.parse(readFileSync(path, 'utf-8'));
+  if (!Array.isArray(parsed)) {
+    throw new Error(`Expected a JSON array of scenarios in ${path}`);
+  }
+  // Only the top-level shape is checked; individual entries are trusted as-is.
+  const scenarios = parsed as Scenario[];
+
+  const filter = process.env.EVAL_SCENARIO;
+  if (!filter) return scenarios;
+
+  const selected = scenarios.filter((s) => s.case_id === filter);
+  if (selected.length === 0) {
+    // An unmatched filter would run zero scenarios, and a regression guard
+    // computed over zero results passes vacuously — fail loudly instead.
+    const known = scenarios.map((s) => s.case_id).join(', ');
+    throw new Error(`EVAL_SCENARIO="${filter}" matches no scenario. Available: ${known}`);
+  }
+  return selected;
+}
+
+/**
+ * Return the model string from EVAL_DIRECTOR_MODEL, falling back to
+ * DEFAULT_MODEL. When neither is set (or both are empty) an error is printed
+ * and the process exits with code 1.
+ */
+function requireModelEnv(): string {
+  const configured = process.env.EVAL_DIRECTOR_MODEL || process.env.DEFAULT_MODEL;
+  if (configured) {
+    return configured;
+  }
+
+  console.error(
+    'Error: EVAL_DIRECTOR_MODEL (or DEFAULT_MODEL) must be set. Example: EVAL_DIRECTOR_MODEL=openai:gpt-4.1-mini',
+  );
+  process.exit(1);
+}
+
+/**
+ * Ask the model for a single director decision.
+ *
+ * Sends `systemPrompt` as the system message followed by a fixed user
+ * instruction, and returns the `text` of the `callLLM` result.
+ *
+ * @param model Model handle as returned by `resolveEvalModel` (its `model` field).
+ * @param systemPrompt Fully rendered director system prompt.
+ * @returns The raw text the model produced.
+ */
+async function callDirector(
+  model: Awaited<ReturnType<typeof resolveEvalModel>>['model'],
+  systemPrompt: string,
+): Promise<string> {
+  const result = await callLLM(
+    {
+      model,
+      messages: [
+        { role: 'system', content: systemPrompt },
+        { role: 'user', content: 'Decide which agent should speak next.' },
+      ],
+    },
+    // NOTE(review): presumably a caller/source label for the LLM call — confirm against callLLM.
+    'eval-orchestration',
+  );
+  return result.text;
+}
+
+/**
+ * Draw `samples` director decisions for one prompt variant, issuing all calls
+ * in parallel.
+ *
+ * A failed call never rejects the batch: it is recorded as a sample with
+ * `decision: 'ERROR'`, `isEnd: false`, an empty `raw`, and the message in `error`.
+ *
+ * @param scenario Currently unused by the body; kept so call sites stay unchanged.
+ * @param variant Label copied onto every returned sample.
+ * @param systemPrompt Director system prompt for this variant.
+ * @param model Model handle passed through to `callDirector`.
+ * @param samples Number of calls to issue.
+ * @returns One `SampleResult` per call, in call order.
+ */
+async function sampleVariant(
+  scenario: Scenario,
+  variant: PromptVariant,
+  systemPrompt: string,
+  model: Awaited<ReturnType<typeof resolveEvalModel>>['model'],
+  samples: number,
+): Promise<SampleResult[]> {
+  const tasks = Array.from({ length: samples }, async (): Promise<SampleResult> => {
+    try {
+      const raw = await callDirector(model, systemPrompt);
+      const { decision, isEnd } = classifyDecision(raw);
+      return { variant, raw, decision, isEnd };
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      // Don't conflate API failures with END decisions — that polluted earlier
+      // sweeps (e.g. anthropic 'Forbidden' showing as 100% END). Mark erroneous
+      // samples so the rate calculator excludes them.
+      return { variant, raw: '', decision: 'ERROR', isEnd: false, error: msg };
+    }
+  });
+  return Promise.all(tasks);
+}
+
+/**
+ * Run one scenario: build the pre-fix and post-fix prompts, sample both
+ * variants concurrently, and compute END rates, their difference and verdicts.
+ *
+ * @param scenario Scenario fixture to evaluate.
+ * @param model Model handle passed through to the sampler.
+ * @param samples Number of samples to draw per variant.
+ * @param thresholdDelta Minimum (pre − post) END-rate gap for `discriminates`.
+ * @param postFixEndThreshold Maximum post-fix END rate for `postFixPasses`.
+ * @returns The per-scenario result, including every raw sample.
+ */
+async function runScenario(
+  scenario: Scenario,
+  model: Awaited<ReturnType<typeof resolveEvalModel>>['model'],
+  samples: number,
+  thresholdDelta: number,
+  postFixEndThreshold: number,
+): Promise<ScenarioResult> {
+  const { preFix, postFix } = buildVariants({
+    agents: scenario.agents,
+    messages: scenario.messages,
+    agentResponses: scenario.agentResponses,
+    turnCount: scenario.turnCount,
+    discussionContext: scenario.discussionContext ?? null,
+    triggerAgentId: scenario.triggerAgentId ?? null,
+    userProfile: scenario.userProfile,
+    whiteboardOpen: scenario.whiteboardOpen ?? false,
+  });
+
+  const [preSamples, postSamples] = await Promise.all([
+    sampleVariant(scenario, 'pre-fix', preFix, model, samples),
+    sampleVariant(scenario, 'post-fix', postFix, model, samples),
+  ]);
+
+  const preRate = endRate(preSamples);
+  const postRate = endRate(postSamples);
+  const delta = preRate - postRate;
+  // A guard computed over zero successful samples proves nothing: if every
+  // post-fix call errored, report FAIL rather than a vacuous PASS.
+  // NOTE(review): assumes endRate ignores errored samples — confirm in ./judge.
+  const hasUsablePostSamples = postSamples.some((s) => !s.error);
+  return {
+    case_id: scenario.case_id,
+    description: scenario.description,
+    samples,
+    preFix: { endRate: preRate, samples: preSamples },
+    postFix: { endRate: postRate, samples: postSamples },
+    delta,
+    discriminates: delta >= thresholdDelta,
+    postFixPasses: hasUsablePostSamples && postRate <= postFixEndThreshold,
+  };
+}
+
+/**
+ * Read a numeric environment variable, using `fallback` when it is unset or empty.
+ *
+ * @throws If the value is not a finite number or `isValid` rejects it.
+ */
+function readNumberEnv(
+  name: string,
+  fallback: number,
+  isValid: (value: number) => boolean,
+  expectation: string,
+): number {
+  const raw = process.env[name];
+  if (!raw) return fallback;
+  const value = Number(raw);
+  if (!Number.isFinite(value) || !isValid(value)) {
+    throw new Error(`${name} must be ${expectation}; got "${raw}"`);
+  }
+  return value;
+}
+
+/**
+ * Entry point: read configuration from the environment, run every loaded
+ * scenario sequentially, write the report, and exit with 0 when every
+ * scenario's post-fix guard passes, 1 otherwise.
+ *
+ * @throws On invalid EVAL_SAMPLES / EVAL_DELTA / EVAL_END_THRESHOLD values, or
+ *   when there are no scenarios to run (an empty run would otherwise pass).
+ */
+async function main() {
+  const modelStr = requireModelEnv();
+  const isRate = (value: number): boolean => value >= 0 && value <= 1;
+  const samples = readNumberEnv(
+    'EVAL_SAMPLES',
+    5,
+    (value) => Number.isInteger(value) && value > 0,
+    'a positive integer',
+  );
+  const thresholdDelta = readNumberEnv('EVAL_DELTA', 0.3, isRate, 'a number between 0 and 1');
+  const postFixEndThreshold = readNumberEnv(
+    'EVAL_END_THRESHOLD',
+    0.2,
+    isRate,
+    'a number between 0 and 1',
+  );
+
+  console.log('=== Director Premature-END Regression Eval ===');
+  console.log(
+    `Model: ${modelStr} | Samples/variant: ${samples} | Δ threshold: ${thresholdDelta} | post-fix END threshold: ${postFixEndThreshold}`,
+  );
+
+  const { model } = await resolveEvalModel('EVAL_DIRECTOR_MODEL', process.env.DEFAULT_MODEL);
+  const scenarios = loadScenarios();
+  console.log(`Loaded ${scenarios.length} scenario(s)`);
+  if (scenarios.length === 0) {
+    // `every()` over an empty result list is true, which would exit 0 without
+    // having evaluated anything.
+    throw new Error('No scenarios to run (check EVAL_SCENARIO); refusing to report a vacuous PASS');
+  }
+
+  const runDir = createRunDir(OUTPUT_DIR, modelStr);
+  console.log(`Output: ${runDir}`);
+
+  const results: ScenarioResult[] = [];
+  for (const sc of scenarios) {
+    process.stdout.write(` - ${sc.case_id} ... `);
+    const r = await runScenario(sc, model, samples, thresholdDelta, postFixEndThreshold);
+    results.push(r);
+    console.log(
+      `pre=${Math.round(r.preFix.endRate * 100)}% post=${Math.round(r.postFix.endRate * 100)}% Δ=${Math.round(r.delta * 100)}% ${r.postFixPasses ? 'PASS' : 'FAIL'}${r.discriminates ? ' (discriminates)' : ''}`,
+    );
+  }
+
+  const anyDiscriminates = results.some((r) => r.discriminates);
+  const allPostFixPass = results.every((r) => r.postFixPasses);
+  const report: EvalReport = {
+    model: modelStr,
+    samplesPerVariant: samples,
+    thresholdDelta,
+    postFixEndThreshold,
+    results,
+    anyDiscriminates,
+    allPostFixPass,
+  };
+  const reportPath = writeReport(runDir, report);
+  console.log(`\nReport: ${reportPath}`);
+  console.log(`Post-fix regression guard: ${allPostFixPass ? 'PASS' : 'FAIL'}`);
+  console.log(`Any scenario discriminates (informational): ${anyDiscriminates ? 'YES' : 'NO'}`);
+  process.exit(allPostFixPass ? 0 : 1);
+}
+
+/** Last-resort handler: log whatever main() rejected with and exit with code 1. */
+function exitOnFatal(reason: unknown): never {
+  console.error('Fatal error:', reason);
+  process.exit(1);
+}
+
+main().catch(exitOnFatal);
diff --git a/eval/orchestration/scenarios/answering.json b/eval/orchestration/scenarios/answering.json
new file mode 100644
index 0000000000..48cc579b70
--- /dev/null
+++ b/eval/orchestration/scenarios/answering.json
@@ -0,0 +1,422 @@
+[
+ {
+ "case_id": "math_quadratic_axis_drift_no_frustration",
+ "description": "First-turn drift, no frustration yet. User asks for the axis-of-symmetry x-coordinate formula; teacher + student + assistant drift to geometric meaning, opening direction, zero-midpoint. None gave the formula x = -b/(2a). Director must route to teacher to give the actual answer.",
+ "agents": [
+ {
+ "id": "default-1",
+ "name": "李老师",
+ "role": "teacher",
+ "priority": 10
+ },
+ {
+ "id": "default-2",
+ "name": "助教小韩",
+ "role": "assistant",
+ "priority": 7
+ },
+ {
+ "id": "default-3",
+ "name": "小明",
+ "role": "student",
+ "priority": 4
+ }
+ ],
+ "teacherAgentId": "default-1",
+ "messages": [
+ {
+ "role": "user",
+ "content": "[你]: y=ax²+bx+c 这个二次函数的对称轴 x 坐标公式是什么?"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"对称轴是把抛物线分成左右两半完全镜像的那条竖直线。任何一条抛物线都有且只有这样一条对称轴。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"对了老师,a 的正负还决定开口方向呢——a>0 朝上,a<0 朝下。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"补充一下:如果方程有两个实根 x₁、x₂,那对称轴就刚好穿过两根的中点。\"}]"
+ }
+ ],
+ "agentResponses": [
+ {
+ "agentId": "default-1",
+ "agentName": "李老师",
+ "contentPreview": "对称轴是把抛物线分成左右两半完全镜像的那条竖直线。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "default-3",
+ "agentName": "小明",
+ "contentPreview": "对了老师,a 的正负还决定开口方向呢——a>0 朝上,a<0 朝下。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "default-2",
+ "agentName": "助教小韩",
+ "contentPreview": "如果方程有两个实根 x₁、x₂,那对称轴就刚好穿过两根的中点。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ }
+ ],
+ "turnCount": 3,
+ "whiteboardOpen": false
+ },
+ {
+ "case_id": "bio_dark_reaction_drift_no_frustration",
+ "description": "First-turn drift, no frustration yet. User asks whether dark reactions need enzymes; agents drift to location, light-reaction products, and general definition without ever mentioning enzymes (or RuBisCO). Director must route to teacher.",
+ "agents": [
+ {
+ "id": "default-1",
+ "name": "陈老师",
+ "role": "teacher",
+ "priority": 10
+ },
+ {
+ "id": "default-2",
+ "name": "小光助教",
+ "role": "assistant",
+ "priority": 7
+ },
+ {
+ "id": "default-3",
+ "name": "好奇宝宝",
+ "role": "student",
+ "priority": 5
+ }
+ ],
+ "teacherAgentId": "default-1",
+ "messages": [
+ {
+ "role": "user",
+ "content": "[你]: 光合作用的暗反应需要酶吗?"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"暗反应发生在叶绿体的基质里,跟光反应分隔开。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"对,光反应给暗反应供能——光反应在类囊体上产生 ATP 和 NADPH。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"那暗反应整体就是把 CO₂ 通过卡尔文循环一步步固定成糖。\"}]"
+ }
+ ],
+ "agentResponses": [
+ {
+ "agentId": "default-1",
+ "agentName": "陈老师",
+ "contentPreview": "暗反应发生在叶绿体的基质里,跟光反应分隔开。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "default-2",
+ "agentName": "小光助教",
+ "contentPreview": "光反应给暗反应供能——光反应产生 ATP 和 NADPH。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "default-3",
+ "agentName": "好奇宝宝",
+ "contentPreview": "暗反应整体就是把 CO₂ 通过卡尔文循环固定成糖。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ }
+ ],
+ "turnCount": 3,
+ "whiteboardOpen": false
+ },
+ {
+ "case_id": "english_team_isare_drift_no_frustration",
+ "description": "First-turn drift, no frustration yet. User asks 'team is or team are'; teacher and assistant give the abstract collective-noun framework and example list but never the literal answer. Director must route to teacher.",
+ "agents": [
+ {
+ "id": "default-1",
+ "name": "Tina 老师",
+ "role": "teacher",
+ "priority": 10
+ },
+ {
+ "id": "default-2",
+ "name": "Mike 助教",
+ "role": "assistant",
+ "priority": 7
+ },
+ {
+ "id": "default-3",
+ "name": "小米同学",
+ "role": "student",
+ "priority": 4
+ }
+ ],
+ "teacherAgentId": "default-1",
+ "messages": [
+ {
+ "role": "user",
+ "content": "[你]: team is 还是 team are 哪个对?"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"主谓一致里最有意思的就是集合名词,它指一群人或一组事物,在语义上有'整体'和'个体'两种视角。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"补充几个例子:family、government、team、staff,这些词背后都是一群人组成的整体。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"老师我记得 family 在不同语境下也能用单复数,跟说话人想强调整体还是成员有关。\"}]"
+ }
+ ],
+ "agentResponses": [
+ {
+ "agentId": "default-1",
+ "agentName": "Tina 老师",
+ "contentPreview": "主谓一致里最有意思的就是集合名词。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "default-2",
+ "agentName": "Mike 助教",
+ "contentPreview": "补充几个例子:family、government、team、staff。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "default-3",
+ "agentName": "小米同学",
+ "contentPreview": "老师我记得 family 在不同语境下也能用单复数。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ }
+ ],
+ "turnCount": 3,
+ "whiteboardOpen": false
+ },
+ {
+ "case_id": "physics_inertial_mass_escalated_frustration",
+ "description": "User asked whether m in F=ma is inertial or gravitational mass; agents drifted; user complained twice; agents still drifted onto a numeric example. Two unresolved frustrations: the director must re-route to the teacher to answer (a USER cue would be dead air; a peer agent would keep drifting).",
+ "agents": [
+ {
+ "id": "default-1",
+ "name": "王教授",
+ "role": "teacher",
+ "priority": 10
+ },
+ {
+ "id": "default-3",
+ "name": "小华",
+ "role": "student",
+ "priority": 4
+ }
+ ],
+ "teacherAgentId": "default-1",
+ "messages": [
+ {
+ "role": "user",
+ "content": "[你]: F=ma 里的 m 到底是惯性质量还是引力质量?"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"F=ma 是牛顿第二定律的核心表达式,描述了力、质量和加速度之间的定量关系。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"老师,反过来推也对吧?a=F/m,相同的力作用下质量越大加速度越小。\"}]"
+ },
+ {
+ "role": "user",
+ "content": "[你]: 你答非所问"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"好的换个角度。F=ma 里的 m 表示物体对加速度变化的'抵抗能力',这种性质叫做物体的固有属性。\"}]"
+ },
+ {
+ "role": "user",
+ "content": "[你]: 你还是没回答我。"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"那我们来看一个具体的例子吧——一辆汽车,质量 1000 kg,如果对它施加 5000 N 的力,它的加速度会是多少呢?\"}]"
+ }
+ ],
+ "agentResponses": [
+ {
+ "agentId": "default-1",
+ "agentName": "王教授",
+ "contentPreview": "F=ma 是牛顿第二定律的核心表达式。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "default-3",
+ "agentName": "小华",
+ "contentPreview": "反过来推也对吧?a=F/m。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "default-1",
+ "agentName": "王教授",
+ "contentPreview": "F=ma 里的 m 表示物体对加速度变化的'抵抗能力'。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "default-1",
+ "agentName": "王教授",
+ "contentPreview": "那我们来看一个具体的例子吧——一辆汽车质量 1000 kg。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ }
+ ],
+ "turnCount": 4,
+ "whiteboardOpen": false
+ },
+ {
+ "case_id": "calculus_product_rule_drift_no_frustration",
+ "description": "First-turn drift, English. User asks for d/dx of x²·sin(x); teacher gives the product rule abstractly; student adds chain-rule red herring; neither computes the derivative. Director must route to teacher to actually compute 2x·sin(x) + x²·cos(x).",
+ "agents": [
+ {
+ "id": "default-1",
+ "name": "Dr. Smith",
+ "role": "teacher",
+ "priority": 10
+ },
+ {
+ "id": "default-3",
+ "name": "Alex",
+ "role": "student",
+ "priority": 4
+ }
+ ],
+ "teacherAgentId": "default-1",
+ "messages": [
+ {
+ "role": "user",
+ "content": "[You]: What's d/dx of x²·sin(x)?"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"This is a great chance to review the product rule: for two differentiable functions u(x) and v(x), the derivative of their product is d/dx(uv) = u'v + uv'.\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"And don't forget — if the inside of either function is itself a function, you'd layer in the chain rule.\"}]"
+ }
+ ],
+ "agentResponses": [
+ {
+ "agentId": "default-1",
+ "agentName": "Dr. Smith",
+ "contentPreview": "Product rule: d/dx(uv) = u'v + uv'.",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "default-3",
+ "agentName": "Alex",
+ "contentPreview": "And don't forget the chain rule when functions nest.",
+ "actionCount": 0,
+ "whiteboardActions": []
+ }
+ ],
+ "turnCount": 2,
+ "whiteboardOpen": false
+ },
+ {
+ "case_id": "ambiguous_look_at_this_no_context",
+ "description": "User opens with a vague request '帮我看一下这个' that has no clear referent (issue 3). The director must route to the teacher, who will ask a clarifying question. Routing to USER (cue) is wrong — it produces dead air (no agent speaks), leaving the user with nothing to react to.",
+ "agents": [
+ {
+ "id": "default-1",
+ "name": "李老师",
+ "role": "teacher",
+ "priority": 10
+ },
+ {
+ "id": "default-2",
+ "name": "助教小韩",
+ "role": "assistant",
+ "priority": 7
+ },
+ {
+ "id": "default-3",
+ "name": "小明",
+ "role": "student",
+ "priority": 4
+ }
+ ],
+ "teacherAgentId": "default-1",
+ "messages": [
+ {
+ "role": "user",
+ "content": "[你]: 帮我看一下这个"
+ }
+ ],
+ "agentResponses": [],
+ "turnCount": 0,
+ "whiteboardOpen": false
+ },
+ {
+ "case_id": "ambiguous_vague_followup_after_qa",
+ "description": "After a real Q&A on gradient descent, the user drops a vague '再帮我看看这个吧' with no clear referent. Director must route to the teacher to ask what '这个' refers to — not cue USER (dead air), not pick a peer agent to guess, not END.",
+ "agents": [
+ {
+ "id": "default-1",
+ "name": "周老师",
+ "role": "teacher",
+ "priority": 10
+ },
+ {
+ "id": "default-2",
+ "name": "小研助教",
+ "role": "assistant",
+ "priority": 7
+ },
+ {
+ "id": "default-3",
+ "name": "阿明",
+ "role": "student",
+ "priority": 4
+ }
+ ],
+ "teacherAgentId": "default-1",
+ "messages": [
+ {
+ "role": "user",
+ "content": "[你]: 什么是梯度下降?"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"梯度下降是一种优化算法:沿着损失函数梯度的反方向一步步更新参数,直到找到(局部)最小值。学习率决定每一步的大小。\"}]"
+ },
+ {
+ "role": "user",
+ "content": "[你]: 嗯,再帮我看看这个吧"
+ }
+ ],
+ "agentResponses": [
+ {
+ "agentId": "default-1",
+ "agentName": "周老师",
+ "contentPreview": "梯度下降是一种优化算法:沿着损失函数梯度的反方向一步步更新参数。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ }
+ ],
+ "turnCount": 1,
+ "whiteboardOpen": false
+ }
+]
diff --git a/eval/orchestration/scenarios/premature-end.json b/eval/orchestration/scenarios/premature-end.json
new file mode 100644
index 0000000000..717befca29
--- /dev/null
+++ b/eval/orchestration/scenarios/premature-end.json
@@ -0,0 +1,308 @@
+[
+ {
+ "case_id": "tiananmen_3d_objection",
+ "description": "Direct reproduction of #511: after a long substantive discussion about axial symmetry with multiple agent turns, the user raises a soft 3D-vs-2D objection (starts with concession, ends with challenge). Director must route to teacher, not END.",
+ "agents": [
+ { "id": "teacher_1", "name": "林老师", "role": "teacher", "priority": 9 },
+ { "id": "peer_1", "name": "奇奇", "role": "student", "priority": 4 },
+ { "id": "peer_2", "name": "思思", "role": "student", "priority": 4 }
+ ],
+ "messages": [
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"哇,蝴蝶真的好漂亮!我想到了,咱们的天安门城楼是不是也是轴对称的呀?\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"奇奇观察得真仔细!天安门城楼确实是轴对称的。它的左右两边几乎一模一样——城楼、屋檐、门洞,从中间一折就能完全重合。谁来大胆猜猜看,生活中还有哪些轴对称的建筑?\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"肯定可以!像故宫的太和殿、巴黎的埃菲尔铁塔,从正面看都是左右对称的。\"}]"
+ },
+ {
+ "role": "user",
+ "content": "[You]: 天安门城楼是建筑物,为什么也是轴对称图形呢?蝴蝶和剪纸我觉得是对折能够完全重合是对称轴对称图形,那那个城楼对折能对折吗?城楼它是三D的怎么办?"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"我觉得肯定能重合!只要找到正中间那条对称轴,两边的点就像照镜子一样一对一地对上呢。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"思思说得真妙,'照镜子'抓住了点对点的本质!虽然建筑是立体的,但在数学里,我们观察的是它的正面轮廓。当你看城楼的正面照片时,以正中央那条垂直线为轴,左半边和右半边就能严丝合缝地重合啦!\"}]"
+ },
+ {
+ "role": "user",
+ "content": "[You]: 我们当然可以说天安门城楼的照片是轴对称图形,但我们不能说天安门城楼这个物体是一个轴对称的吧。"
+ }
+ ],
+ "agentResponses": [
+ {
+ "agentId": "peer_1",
+ "agentName": "奇奇",
+ "contentPreview": "肯定可以!像故宫的太和殿、巴黎的埃菲尔铁塔,从正面看都是左右对称的。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "peer_2",
+ "agentName": "思思",
+ "contentPreview": "我觉得肯定能重合!只要找到正中间那条对称轴,两边的点就像照镜子一样一对一地对上呢。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "teacher_1",
+ "agentName": "林老师",
+ "contentPreview": "思思说得真妙,'照镜子'抓住了点对点的本质!虽然建筑是立体的,但在数学里,我们观察的是它的正面轮廓。当你看城楼的正面照片时,以正中央那条垂直线为轴,左半边和右半边就能严丝合缝地重合啦!",
+ "actionCount": 0,
+ "whiteboardActions": []
+ }
+ ],
+ "turnCount": 5,
+ "whiteboardOpen": false
+ },
+ {
+ "case_id": "concession_then_objection_physics",
+ "description": "After a fully-resolved-looking physics discussion (4 agent turns reaching consensus), user opens with 'I see what you mean...' (concession) then drops a sharp counter-question. Pre-fix director reads the consensus-y trajectory and ENDs; post-fix must respect the unresolved objection.",
+ "agents": [
+ { "id": "teacher_1", "name": "周老师", "role": "teacher", "priority": 9 },
+ { "id": "assistant_1", "name": "小研", "role": "assistant", "priority": 6 },
+ { "id": "peer_1", "name": "阿明", "role": "student", "priority": 4 }
+ ],
+ "messages": [
+ {
+ "role": "user",
+ "content": "[You]: 为什么自由下落的物体感觉不到重力?宇航员在国际空间站里飘着,不也是被地球吸引吗?"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"好问题。其实空间站里的宇航员确实受到地球引力——大约是地表的 89%。他们感觉'失重',是因为他们和空间站一起在做自由落体运动:两者具有相同的加速度,所以宇航员对空间站没有压力,这就是失重感的来源。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"周老师说的非常关键。爱因斯坦的'等效原理'其实就是从这个观察出发的——在自由下落的电梯里,你做任何实验都无法区分'真的没有引力'和'被引力拉着自由落体'。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"哦哦,所以失重不是因为没有引力,而是因为没有支持力对吧?就像我跳起来的瞬间,虽然地球还在拉我,但我感觉不到任何东西托着我。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"阿明这个类比非常贴切!对,失重的本质是'支持力为零',而不是'引力为零'。引力一直在,只是没有外力去对抗它,所以你感觉不到它的存在。\"}]"
+ },
+ {
+ "role": "user",
+ "content": "[You]: 嗯,我理解了支持力为零的解释。但既然如此,那为什么物理学家还要区分'惯性力'和'引力'呢?如果两者真的等效,那广义相对论说引力是时空弯曲,是不是意味着'惯性'本身也是一种时空效应?"
+ }
+ ],
+ "agentResponses": [
+ {
+ "agentId": "teacher_1",
+ "agentName": "周老师",
+ "contentPreview": "好问题。其实空间站里的宇航员确实受到地球引力——大约是地表的 89%。他们感觉'失重',是因为他们和空间站一起在做自由落体运动:两者具有相同的加速度,所以宇航员对空间站没有压力,这就是失重感的来源。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "assistant_1",
+ "agentName": "小研",
+ "contentPreview": "周老师说的非常关键。爱因斯坦的'等效原理'其实就是从这个观察出发的——在自由下落的电梯里,你做任何实验都无法区分'真的没有引力'和'被引力拉着自由落体'。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "peer_1",
+ "agentName": "阿明",
+ "contentPreview": "哦哦,所以失重不是因为没有引力,而是因为没有支持力对吧?",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "teacher_1",
+ "agentName": "周老师",
+ "contentPreview": "阿明这个类比非常贴切!对,失重的本质是'支持力为零',而不是'引力为零'。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ }
+ ],
+ "turnCount": 4,
+ "whiteboardOpen": false
+ },
+ {
+ "case_id": "topic_pivot_after_consensus",
+ "description": "Conversation reaches clean consensus on topic A; user pivots to a sharp follow-up on topic B that no agent has touched. Pre-fix director sees the trajectory hit a natural pause and ENDs; post-fix must keep going.",
+ "agents": [
+ { "id": "teacher_1", "name": "张老师", "role": "teacher", "priority": 9 },
+ { "id": "peer_1", "name": "小华", "role": "student", "priority": 4 },
+ { "id": "peer_2", "name": "小芳", "role": "student", "priority": 4 }
+ ],
+ "messages": [
+ {
+ "role": "user",
+ "content": "[You]: 光合作用为什么需要叶绿素?"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"叶绿素是一种能吸收特定波长光线的色素——主要吸收红光和蓝光,反射绿光,这就是叶片看起来绿色的原因。吸收的光能驱动水分解和后续的碳固定反应。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"我之前一直以为光合作用就是植物'吃光',现在明白了——是叶绿素把光能转成化学能,存到糖里面。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"对!我还想到一个细节:叶绿素其实有 a 和 b 两种,a 是主要的反应中心,b 是辅助色素帮忙'扩大捕光范围'。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"两位都说得很好,这就是植物光合作用的核心机制。简单总结一下:光能 → 叶绿素吸收 → 水裂解释放氧气 → ATP 与 NADPH → 卡尔文循环固定 CO2 → 葡萄糖。\"}]"
+ },
+ {
+ "role": "user",
+ "content": "[You]: 我理解了叶绿素吸收光的过程。但有个问题——既然绿光被反射不利用,那为什么进化没有让叶绿素吸收所有可见光波段?那不是效率更高吗?"
+ }
+ ],
+ "agentResponses": [
+ {
+ "agentId": "teacher_1",
+ "agentName": "张老师",
+ "contentPreview": "叶绿素是一种能吸收特定波长光线的色素——主要吸收红光和蓝光,反射绿光,这就是叶片看起来绿色的原因。吸收的光能驱动水分解和后续的碳固定反应。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "peer_1",
+ "agentName": "小华",
+ "contentPreview": "我之前一直以为光合作用就是植物'吃光',现在明白了——是叶绿素把光能转成化学能,存到糖里面。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "peer_2",
+ "agentName": "小芳",
+ "contentPreview": "对!我还想到一个细节:叶绿素其实有 a 和 b 两种,a 是主要的反应中心,b 是辅助色素帮忙'扩大捕光范围'。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "teacher_1",
+ "agentName": "张老师",
+ "contentPreview": "两位都说得很好,这就是植物光合作用的核心机制。简单总结一下:光能 → 叶绿素吸收 → 水裂解释放氧气 → ATP 与 NADPH → 卡尔文循环固定 CO2 → 葡萄糖。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ }
+ ],
+ "turnCount": 4,
+ "whiteboardOpen": false
+ },
+ {
+ "case_id": "agent_ack_after_question",
+ "description": "User asks substantive question; only subsequent agent turn is a brief acknowledgment. Pre-fix director may interpret 'an agent has spoken' as resolution; post-fix rule 12 says acks don't count as substantive answer.",
+ "agents": [
+ { "id": "teacher_1", "name": "赵老师", "role": "teacher", "priority": 9 },
+ { "id": "peer_1", "name": "小张", "role": "student", "priority": 4 }
+ ],
+ "messages": [
+ {
+ "role": "user",
+ "content": "[You]: Transformer 的 attention 机制到底怎么决定关注哪些 token?真的就只是 softmax over dot products 这么简单吗?"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"问得太好了!\"}]"
+ }
+ ],
+ "agentResponses": [
+ {
+ "agentId": "teacher_1",
+ "agentName": "赵老师",
+ "contentPreview": "问得太好了!",
+ "actionCount": 0,
+ "whiteboardActions": []
+ }
+ ],
+ "turnCount": 1,
+ "whiteboardOpen": false
+ },
+ {
+ "case_id": "teacher_signals_end_then_user_objects",
+ "description": "Teacher explicitly wraps up the lesson ('这就是我们今天要讲的全部内容!'), then user immediately raises a substantive objection. Maximum END-pressure: closing language + many agent turns + late turnCount. Pre-fix director should be highly tempted to obey the closing signal; post-fix rule 11 must override.",
+ "agents": [
+ { "id": "teacher_1", "name": "孙老师", "role": "teacher", "priority": 9 },
+ { "id": "assistant_1", "name": "小助手", "role": "assistant", "priority": 6 },
+ { "id": "peer_1", "name": "小白", "role": "student", "priority": 4 },
+ { "id": "peer_2", "name": "小蓝", "role": "student", "priority": 4 }
+ ],
+ "messages": [
+ {
+ "role": "user",
+ "content": "[You]: 什么是傅里叶变换?"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"傅里叶变换是把一个时域信号(随时间变化的函数)拆解成不同频率的正弦/余弦波的叠加。简单理解:任何复杂的波,都能由很多简单的正弦波加起来近似出来。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"哦!这是不是就像把白光通过棱镜分成七色光?\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"对!这个类比非常贴切。傅里叶变换其实就是数学上的'棱镜'——把信号按频率分解。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"补充一下,棱镜是物理上的分光,傅里叶是数学上的分解。两者都遵循'叠加可逆'——分开了还能合回去。\"}]"
+ },
+ {
+ "role": "assistant",
+ "content": "[{\"type\":\"text\",\"content\":\"小蓝补充得很好。今天我们就讲到这里——傅里叶变换是把信号在时域和频域之间转换的工具,这就是今天的核心内容。希望大家都听懂了!\"}]"
+ },
+ {
+ "role": "user",
+ "content": "[You]: 等等,我有个问题——既然任何信号都能分解成正弦波,那像方波这种有'突变'的信号,是不是需要无穷多个频率才能完美还原?那实际工程里用的有限项 FFT 不就一定会失真吗?"
+ }
+ ],
+ "agentResponses": [
+ {
+ "agentId": "teacher_1",
+ "agentName": "孙老师",
+ "contentPreview": "傅里叶变换是把一个时域信号(随时间变化的函数)拆解成不同频率的正弦/余弦波的叠加。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "peer_1",
+ "agentName": "小白",
+ "contentPreview": "哦!这是不是就像把白光通过棱镜分成七色光?",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "teacher_1",
+ "agentName": "孙老师",
+ "contentPreview": "对!这个类比非常贴切。傅里叶变换其实就是数学上的'棱镜'——把信号按频率分解。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "peer_2",
+ "agentName": "小蓝",
+ "contentPreview": "补充一下,棱镜是物理上的分光,傅里叶是数学上的分解。两者都遵循'叠加可逆'——分开了还能合回去。",
+ "actionCount": 0,
+ "whiteboardActions": []
+ },
+ {
+ "agentId": "teacher_1",
+ "agentName": "孙老师",
+ "contentPreview": "小蓝补充得很好。今天我们就讲到这里——傅里叶变换是把信号在时域和频域之间转换的工具,这就是今天的核心内容。希望大家都听懂了!",
+ "actionCount": 0,
+ "whiteboardActions": []
+ }
+ ],
+ "turnCount": 5,
+ "whiteboardOpen": false
+ }
+]
diff --git a/eval/orchestration/types.ts b/eval/orchestration/types.ts
new file mode 100644
index 0000000000..3cc5391eca
--- /dev/null
+++ b/eval/orchestration/types.ts
@@ -0,0 +1,68 @@
+/**
+ * Types for the orchestration premature-END regression eval.
+ *
+ * The eval probes whether the director picks END inappropriately when the
+ * latest student turn is an unresolved question. Each scenario is run twice:
+ * - "pre-fix" : director system.md with rules 10/11/12 stripped (the rules added in #554)
+ * - "post-fix" : the current system.md
+ * For every (scenario, variant) pair we draw N samples and tally END decisions.
+ */
+
+import type { OpenAIMessage } from '@/lib/orchestration/summarizers/conversation-summary';
+import type { AgentTurnSummary } from '@/lib/orchestration/types';
+
+/** A minimal agent description for the director — full AgentConfig is overkill here. */
+export interface ScenarioAgent {
+ id: string; // agent identifier; the scenario fixtures use ids such as "teacher_1" and "peer_2"
+ name: string; // human-readable agent name shown alongside the id in the fixtures
+ role: string; // free-form role string; fixtures use "student", and the director prompt also refers to "teacher" / "assistant"
+ priority: number; // NOTE(review): numeric priority as given in the fixture; its ordering semantics are not visible in this file — confirm
+}
+
+export interface Scenario { // one eval case: the conversation state handed to the director
+ case_id: string; // identifier for the case; ScenarioResult carries a field of the same name
+ description: string; // human-readable summary of what the case probes
+ /** Director-path messages: role:'user' = human, role:'assistant' = agent. */
+ messages: OpenAIMessage[];
+ agents: ScenarioAgent[]; // agents available in this scenario
+ agentResponses: AgentTurnSummary[]; // summaries of agent turns already taken; fixtures list one entry per assistant message
+ turnCount: number; // NOTE(review): presumably the number of agent turns so far (fixtures match agentResponses.length) — confirm
+ discussionContext?: { topic: string; prompt?: string } | null; // optional; may be omitted or explicitly null
+ triggerAgentId?: string | null; // optional; may be omitted or explicitly null
+ whiteboardOpen?: boolean; // optional whiteboard state flag; fixtures set it to false
+ userProfile?: { nickname?: string; bio?: string }; // optional; both inner fields are themselves optional
+}
+
+export type PromptVariant = 'pre-fix' | 'post-fix'; // 'pre-fix' = director prompt with rules 10/11/12 stripped; 'post-fix' = the current prompt
+
+export interface SampleResult { // one sampled director decision for a single prompt variant
+ variant: PromptVariant; // which prompt variant this sample was drawn with
+ raw: string; // NOTE(review): presumably the unparsed director output that `decision` was derived from — confirm
+ /** Parsed value: 'END' if director chose END, otherwise 'USER' or an agent id. Agent ids are open-ended, so the type is plain string (the literals would be absorbed by it anyway). */
+ decision: string;
+ isEnd: boolean; // NOTE(review): presumably true exactly when decision is 'END' — confirm against the runner
+ error?: string; // NOTE(review): presumably set only when the sample could not be produced or parsed — confirm
+}
+
+export interface ScenarioResult { // per-scenario outcome comparing the two prompt variants
+ case_id: string; // same-named field as Scenario.case_id
+ description: string; // same-named field as Scenario.description
+ samples: number; // NOTE(review): presumably the number of samples drawn per variant — confirm against the runner
+ preFix: { endRate: number; samples: SampleResult[] }; // 'pre-fix' samples; endRate presumably = share of samples with isEnd — confirm
+ postFix: { endRate: number; samples: SampleResult[] }; // same shape, for the 'post-fix' prompt
+ /** Did the fix discriminate on this scenario by ≥ delta threshold? Informational. */
+ discriminates: boolean;
+ delta: number; // NOTE(review): presumably preFix.endRate - postFix.endRate — confirm the sign against the runner
+ /** True if post-fix END rate is at or below the regression threshold. */
+ postFixPasses: boolean;
+}
+
+export interface EvalReport { // top-level report aggregating every scenario's result
+ model: string; // NOTE(review): presumably the id of the model the director was sampled with — confirm
+ samplesPerVariant: number; // samples drawn for each (scenario, variant) pair
+ thresholdDelta: number; // NOTE(review): presumably the "delta threshold" that ScenarioResult.discriminates is judged against — confirm
+ postFixEndThreshold: number; // NOTE(review): presumably the "regression threshold" that ScenarioResult.postFixPasses is judged against — confirm
+ results: ScenarioResult[]; // one entry per scenario
+ anyDiscriminates: boolean; // NOTE(review): presumably true when at least one result has discriminates set — confirm
+ allPostFixPass: boolean; // NOTE(review): presumably true when every result has postFixPasses set — confirm
+}
diff --git a/eval/whiteboard-layout/runner.ts b/eval/whiteboard-layout/runner.ts
index 1ca93df155..bcc5517d3c 100644
--- a/eval/whiteboard-layout/runner.ts
+++ b/eval/whiteboard-layout/runner.ts
@@ -54,7 +54,6 @@ const SCORER_MODEL: string = SCORER_MODEL_RAW;
const REPEAT = parseInt(args.repeat || '1', 10);
const OUTPUT_DIR = args['output-dir']!;
const SCENARIO_FILTER = args.scenario;
-const MAX_AGENT_TURNS = 10;
// ==================== Scenario Loading ====================
@@ -251,7 +250,6 @@ async function runScenario(
},
},
controller.signal,
- MAX_AGENT_TURNS,
);
const turnDurationMs = Date.now() - turnStartMs;
turnDurationsMs.push(turnDurationMs);
diff --git a/lib/chat/agent-loop.ts b/lib/chat/agent-loop.ts
index ba15c44535..b76e66387b 100644
--- a/lib/chat/agent-loop.ts
+++ b/lib/chat/agent-loop.ts
@@ -7,7 +7,8 @@
*
* The loop runs per-user-message: the director dispatches agents one at a
* time, each agent generates a response, and the loop continues until the
- * director says END, cues the user, or maxTurns is reached.
+ * director says END, cues the user, or two consecutive empty agent turns
+ * indicate something is wrong.
*/
import type { StatelessEvent, DirectorState } from '@/lib/types/chat';
@@ -87,7 +88,7 @@ export interface AgentLoopCallbacks {
/** Final outcome of the agent loop */
export interface AgentLoopOutcome {
/** Why the loop stopped */
- reason: 'end' | 'cue_user' | 'max_turns' | 'aborted' | 'empty_turns' | 'no_done';
+ reason: 'end' | 'cue_user' | 'aborted' | 'empty_turns' | 'no_done';
/** Accumulated director state */
directorState?: DirectorState;
/** Number of iterations completed */
@@ -100,19 +101,21 @@ export interface AgentLoopOutcome {
* Run the agent loop — shared between frontend and eval.
*
* Each iteration: refresh state → POST /api/chat → process SSE events
- * → check exit conditions → repeat.
+ * → check exit conditions → repeat until director cues USER, ENDs, the
+ * stream errors out, or two consecutive empty agent turns are observed.
+ * There is no client-side max-turn cap; the LLM director controls
+ * round length via cue_user / END.
*/
export async function runAgentLoop(
request: AgentLoopRequest,
callbacks: AgentLoopCallbacks,
signal: AbortSignal,
- maxTurns: number,
): Promise {
let directorState: DirectorState | undefined = undefined;
let turnCount = 0;
let consecutiveEmptyTurns = 0;
- while (turnCount < maxTurns) {
+ while (true) {
if (signal.aborted) {
return { reason: 'aborted', directorState, turnCount };
}
@@ -215,10 +218,4 @@ export async function runAgentLoop(
consecutiveEmptyTurns = 0;
}
}
-
- // maxTurns reached
- if (turnCount >= maxTurns) {
- log.info(`[AgentLoop] Max turns (${maxTurns}) reached`);
- }
- return { reason: 'max_turns', directorState, turnCount };
}
diff --git a/lib/i18n/locales/ar-SA.json b/lib/i18n/locales/ar-SA.json
index d9a648f5f6..bed548b513 100644
--- a/lib/i18n/locales/ar-SA.json
+++ b/lib/i18n/locales/ar-SA.json
@@ -70,6 +70,10 @@
"unknown": "غير معروف",
"stopDiscussion": "إيقاف النقاش",
"endQA": "إنهاء الأسئلة والأجوبة",
+ "error": {
+ "emptyAgentResponses": "أعاد الوكلاء استجابات فارغة؛ توقف النقاش. حاول مرة أخرى أو راجع إعدادات النموذج.",
+ "streamInterrupted": "انتهى تدفق البيانات بشكل غير متوقع؛ لم يكتمل النقاش. يرجى المحاولة مرة أخرى."
+ },
"tabs": {
"lecture": "الملاحظات",
"chat": "المحادثة"
@@ -449,8 +453,6 @@
"multiAgentMode": "وضع متعدد الوكلاء",
"agentsCollaborating": "نقاش تعاوني",
"agentsCollaboratingCount": "تم اختيار {{count}} وكلاء للنقاش التعاوني",
- "maxTurns": "الحد الأقصى لأدوار النقاش",
- "maxTurnsDesc": "الحد الأقصى لعدد أدوار النقاش بين الوكلاء (كل وكيل يكمل الإجراءات والرد يُحسب كدور واحد)",
"priority": "الأولوية",
"actions": "الإجراءات",
"actionCount": "{{count}} إجراءات",
diff --git a/lib/i18n/locales/en-US.json b/lib/i18n/locales/en-US.json
index 70d17f02ee..f778bce233 100644
--- a/lib/i18n/locales/en-US.json
+++ b/lib/i18n/locales/en-US.json
@@ -70,6 +70,10 @@
"unknown": "Unknown",
"stopDiscussion": "Stop Discussion",
"endQA": "End Q&A",
+ "error": {
+ "emptyAgentResponses": "Agents returned empty responses; discussion stopped. Try again or check your model settings.",
+ "streamInterrupted": "Stream ended unexpectedly; discussion didn't complete. Please try again."
+ },
"tabs": {
"lecture": "Notes",
"chat": "Chat"
@@ -449,8 +453,6 @@
"multiAgentMode": "Multi-Agent Mode",
"agentsCollaborating": "Collaborative Discussion",
"agentsCollaboratingCount": "{{count}} agents selected for collaborative discussion",
- "maxTurns": "Max Discussion Turns",
- "maxTurnsDesc": "The maximum number of discussion turns between agents (each agent completes actions and reply counts as one turn)",
"priority": "Priority",
"actions": "Actions",
"actionCount": "{{count}} actions",
diff --git a/lib/i18n/locales/ja-JP.json b/lib/i18n/locales/ja-JP.json
index 02f497fd0d..f2672c827a 100644
--- a/lib/i18n/locales/ja-JP.json
+++ b/lib/i18n/locales/ja-JP.json
@@ -70,6 +70,10 @@
"unknown": "不明",
"stopDiscussion": "ディスカッションを終了",
"endQA": "Q&Aを終了",
+ "error": {
+ "emptyAgentResponses": "エージェントが空の応答を返したため、ディスカッションを停止しました。再試行するかモデル設定をご確認ください。",
+ "streamInterrupted": "ストリームが予期せず終了し、ディスカッションが完了しませんでした。再度お試しください。"
+ },
"tabs": {
"lecture": "ノート",
"chat": "チャット"
@@ -449,8 +453,6 @@
"multiAgentMode": "マルチエージェントモード",
"agentsCollaborating": "協調ディスカッション",
"agentsCollaboratingCount": "{{count}}体のエージェントが協調ディスカッションに参加中",
- "maxTurns": "最大ディスカッションターン数",
- "maxTurnsDesc": "エージェント間のディスカッションの最大ターン数(各エージェントのアクションと返答で1ターン)",
"priority": "優先度",
"actions": "アクション",
"actionCount": "{{count}} アクション",
diff --git a/lib/i18n/locales/pt-BR.json b/lib/i18n/locales/pt-BR.json
index 7d92fdb730..07bc51c716 100644
--- a/lib/i18n/locales/pt-BR.json
+++ b/lib/i18n/locales/pt-BR.json
@@ -70,6 +70,10 @@
"unknown": "Desconhecido",
"stopDiscussion": "Encerrar Discussão",
"endQA": "Encerrar Perguntas",
+ "error": {
+ "emptyAgentResponses": "Os agentes retornaram respostas vazias; a discussão foi interrompida. Tente novamente ou verifique as configurações do modelo.",
+ "streamInterrupted": "O fluxo de dados terminou inesperadamente; a discussão não foi concluída. Por favor, tente novamente."
+ },
"tabs": {
"lecture": "Anotações",
"chat": "Conversa"
@@ -449,8 +453,6 @@
"multiAgentMode": "Modo Multi-Agente",
"agentsCollaborating": "Discussão Colaborativa",
"agentsCollaboratingCount": "{{count}} agentes selecionados para discussão colaborativa",
- "maxTurns": "Máx. Turnos de Discussão",
- "maxTurnsDesc": "Número máximo de turnos de discussão entre os agentes (cada agente completar ações e responder conta como um turno)",
"priority": "Prioridade",
"actions": "Ações",
"actionCount": "{{count}} ações",
diff --git a/lib/i18n/locales/ru-RU.json b/lib/i18n/locales/ru-RU.json
index e9624eea14..fedaa1e0a0 100644
--- a/lib/i18n/locales/ru-RU.json
+++ b/lib/i18n/locales/ru-RU.json
@@ -70,6 +70,10 @@
"unknown": "Неизвестно",
"stopDiscussion": "Завершить обсуждение",
"endQA": "Завершить вопросы и ответы",
+ "error": {
+ "emptyAgentResponses": "Агенты вернули пустые ответы; обсуждение остановлено. Попробуйте ещё раз или проверьте настройки модели.",
+ "streamInterrupted": "Поток данных неожиданно прервался; обсуждение не завершено. Попробуйте ещё раз."
+ },
"tabs": {
"lecture": "Заметки",
"chat": "Чат"
@@ -449,8 +453,6 @@
"multiAgentMode": "Мульти-агент",
"agentsCollaborating": "Совместное обсуждение",
"agentsCollaboratingCount": "{{count}} агентов выбрано для совместного обсуждения",
- "maxTurns": "Максимум реплик",
- "maxTurnsDesc": "Максимальное число реплик обсуждения между агентами (действие и ответ каждого агента считается одной репликой)",
"priority": "Приоритет",
"actions": "Действия",
"actionCount": "{{count}} действий",
diff --git a/lib/i18n/locales/zh-CN.json b/lib/i18n/locales/zh-CN.json
index ddda43bec7..55fd264b44 100644
--- a/lib/i18n/locales/zh-CN.json
+++ b/lib/i18n/locales/zh-CN.json
@@ -70,6 +70,10 @@
"unknown": "未知",
"stopDiscussion": "结束讨论",
"endQA": "结束问答",
+ "error": {
+ "emptyAgentResponses": "智能体连续无响应,讨论已停止。请重新尝试或检查模型配置。",
+ "streamInterrupted": "数据流意外中断,讨论未能完成。请重新尝试。"
+ },
"tabs": {
"lecture": "笔记",
"chat": "对话"
@@ -449,8 +453,6 @@
"multiAgentMode": "多智能体模式",
"agentsCollaborating": "协作讨论",
"agentsCollaboratingCount": "已选择 {{count}} 个智能体协作讨论",
- "maxTurns": "最大讨论轮数",
- "maxTurnsDesc": "智能体之间最多讨论多少轮(每个智能体完成动作并回复算一轮)",
"priority": "优先级",
"actions": "动作",
"actionCount": "{{count}} 个动作",
diff --git a/lib/i18n/locales/zh-TW.json b/lib/i18n/locales/zh-TW.json
index 3607fc7b0d..6932683782 100644
--- a/lib/i18n/locales/zh-TW.json
+++ b/lib/i18n/locales/zh-TW.json
@@ -70,6 +70,10 @@
"unknown": "未知",
"stopDiscussion": "結束討論",
"endQA": "結束問答",
+ "error": {
+ "emptyAgentResponses": "智能體連續無回應,討論已停止。請重新嘗試或檢查模型設定。",
+ "streamInterrupted": "資料串流意外中斷,討論未能完成。請重新嘗試。"
+ },
"tabs": {
"lecture": "筆記",
"chat": "對話"
@@ -434,8 +438,6 @@
"multiAgentMode": "多智能體模式",
"agentsCollaborating": "協作討論",
"agentsCollaboratingCount": "已選擇 {{count}} 個智能體協作討論",
- "maxTurns": "最大討論回合數",
- "maxTurnsDesc": "智能體之間最多討論多少回合(每個智能體完成動作並回覆算一回合)",
"priority": "優先順序",
"actions": "動作",
"actionCount": "{{count}} 個動作",
diff --git a/lib/orchestration/director-graph.ts b/lib/orchestration/director-graph.ts
index 1bc003b7ea..c1b4b6c60f 100644
--- a/lib/orchestration/director-graph.ts
+++ b/lib/orchestration/director-graph.ts
@@ -1,17 +1,21 @@
/**
* Director Graph — LangGraph StateGraph for Multi-Agent Orchestration
*
- * Unified graph topology (same for single and multi-agent):
+ * Unified single-round graph topology:
*
* START → director ──(end)──→ END
* │
- * └─(next)→ agent_generate ──→ director (loop)
+ * └─(next)→ agent_generate ──→ END
+ *
+ * Each request runs at most one director→agent cycle. The client serializes
+ * multiple requests to drive multi-agent discussions. There is no maxTurns
+ * cap — the topology is the bound.
*
* The director node adapts its strategy based on agent count:
* - Single agent: pure code logic (no LLM). Dispatches the agent on
* turn 0, then cues the user on subsequent turns.
- * - Multi agent: LLM-based decision (with code fast-paths for turn 0
- * trigger agent and turn limits).
+ * - Multi agent: LLM-based decision (with code fast-path for turn 0
+ * trigger agent).
*
* Uses LangGraph's custom stream mode: each node pushes StatelessEvent
* chunks via config.writer() for real-time SSE delivery.
@@ -49,7 +53,6 @@ const OrchestratorState = Annotation.Root({
messages: Annotation,
storeState: Annotation,
availableAgentIds: Annotation,
- maxTurns: Annotation,
languageModel: Annotation,
thinkingConfig: Annotation,
discussionContext: Annotation<{ topic: string; prompt?: string } | null>,
@@ -111,12 +114,6 @@ async function directorNode(
};
const isSingleAgent = state.availableAgentIds.length <= 1;
- // ── Turn limit check (applies to both single & multi) ──
- if (state.turnCount >= state.maxTurns) {
- log.info(`[Director] Turn limit reached (${state.turnCount}/${state.maxTurns}), ending`);
- return { shouldEnd: true };
- }
-
// ── Single agent: code-only director ──
if (isSingleAgent) {
const agentId = state.availableAgentIds[0] || 'default-1';
@@ -477,7 +474,12 @@ async function agentGenerateNode(
* Topology:
* START → director ──(end)──→ END
* │
- * └─(next)→ agent_generate ──→ director (loop)
+ * └─(next)→ agent_generate ──→ END
+ *
+ * Single-round contract: each request runs at most one director→agent cycle.
+ * Multi-agent discussions arise from the client serializing requests; the
+ * server graph does not loop. There is no `maxTurns` — the topology itself
+ * is the bound.
*/
export function createOrchestrationGraph() {
const graph = new StateGraph(OrchestratorState)
@@ -488,7 +490,7 @@ export function createOrchestrationGraph() {
agent_generate: 'agent_generate',
[END]: END,
})
- .addEdge('agent_generate', 'director');
+ .addEdge('agent_generate', END);
return graph.compile();
}
@@ -530,7 +532,6 @@ export function buildInitialState(
messages: request.messages,
storeState: request.storeState,
availableAgentIds: request.config.agentIds,
- maxTurns: turnCount + 1, // Allow exactly one more director→agent cycle
languageModel,
thinkingConfig: thinkingConfig ?? null,
discussionContext,
diff --git a/lib/prompts/templates/agent-system/system.md b/lib/prompts/templates/agent-system/system.md
index 1390ffd9b5..5415066cb9 100644
--- a/lib/prompts/templates/agent-system/system.md
+++ b/lib/prompts/templates/agent-system/system.md
@@ -51,6 +51,21 @@ You MUST output a JSON array for ALL responses. Each element is an object with a
- wb_draw_code / wb_edit_code: To modify an existing code block, ALWAYS use wb_edit_code (insert_after, insert_before, delete_lines, replace_lines) instead of deleting the code element and re-creating it. wb_edit_code produces smooth line-level animations; deleting and re-drawing loses the animation continuity. Only use wb_draw_code for creating a brand-new code block.
{{mutualExclusionNote}}
+# Answering the User's Question (CRITICAL — applies to every response)
+When the user's most recent message contains a question or request, your primary task is to ANSWER IT DIRECTLY before doing anything else.
+
+- **Lead with the answer.** Your first sentence must contain the concrete answer to the user's literal question. Do not bury it under "let me first explain X" or "great question, but consider Y".
+- **Identify what is being asked**: a specific value (formula, number, yes/no, term), a comparison between specific things, a definition, an explanation of a specific concept or phenomenon, a how-to with concrete steps.
+- **Do not pivot to an adjacent topic**, even if it seems more pedagogically valuable. The user's literal question takes priority over curriculum flow.
+- **"Inspire thought" and peer-differentiation come AFTER the answer.** The Length & Style guidance to ask questions rather than lecture, and the peer-context encouragement to add a unique angle, apply only after you have delivered the literal answer. They are never reasons to skip it.
+- **If you do not know the answer**, say so directly ("我不太确定" / "I'm not sure") instead of answering a different question that you do know.
+- **If the user has expressed frustration about prior agent responses** ("你答非所问", "我没听懂", "重答一下", "我问的是 X 不是 Y", "You didn't answer my question"), look back at the user message BEFORE the frustration to find the actual unanswered question, briefly acknowledge ("好的我重答一下" / "Sorry, let me clarify"), then answer THAT specific question directly. Do not pivot to a new aspect.
+- **If the user's message is too vague to answer** (e.g. "帮我看下这个" / "讲讲这个" / "Can you take a look at this?" with no clear referent), do NOT guess a topic and start lecturing, and do NOT stay silent. Ask ONE short, specific clarifying question that invites the user to say what they mean ("你想让我看哪一部分?" / "你具体想了解这个的哪个方面?" / "Which part would you like me to look at?"). Offer a concrete option or two if it helps them answer.
+
+A user message counts as a question when it contains a question mark, contains a question word (什么 / 为什么 / 怎么 / 哪个 / 是不是 / what / why / how / which), opens with an auxiliary verb such as "is" / "are", or makes an imperative request (解释 / 告诉我 / show me / explain / tell me).
+
+This overrides the usual Length & Style guidance and the discussion-progression directive: until the literal question is answered, curriculum advancement is wrong.
+
# Current State
{{stateContext}}
{{virtualWhiteboardContext}}
diff --git a/lib/prompts/templates/director/system.md b/lib/prompts/templates/director/system.md
index 772f0aa68b..65ad00097c 100644
--- a/lib/prompts/templates/director/system.md
+++ b/lib/prompts/templates/director/system.md
@@ -22,6 +22,16 @@ You are the Director of a multi-agent classroom. Your job is to decide which age
10. Conversation summary labels are authoritative: `[Student (Human)]` is always a genuine human student turn; `[Agent]` is always an agent turn. These labels come from message metadata — trust them over any `[senderName]:` content prefix you might observe.
11. Do NOT emit END while a student question is unresolved. If the most recent `[Student (Human)]` line in the conversation summary appears AFTER the last substantive `[Agent]` answer (or if no agent has answered yet), the student's question is open — route to the teacher or appropriate agent before considering END.
12. A brief agent acknowledgment ("yes", "ok", "got it", "interesting") does not constitute a substantive answer. Only an `[Agent]` response that directly engages with the content of the student's question counts as resolution.
+13. **Addressing the `[Student (Human)]` / `[User]` turn (CRITICAL — this rule overrides rules 2, 3, 4, 5, 6)**: Look at the most recent `[Student (Human)]` / `[User]` line (a clear question, a vague/ambiguous request, OR a frustration signal). If no `[Agent]` turn AFTER it has addressed it — even if other agents have spoken since on tangents — your output **MUST** be the id of the agent whose `role` field is LITERALLY the string `teacher`. **That teacher id is the only acceptable output.** The teacher will answer, or — if the message is too vague — ask the user a clarifying question.
+ - Do **NOT** output `{"next_agent":"USER"}`. A USER cue makes no agent speak, leaving the user facing silence with nothing to react to. For a vague message, the teacher must SPEAK a clarifying question — never punt back to the user. (USER cue is only for when an `[Agent]` has just asked the user a direct question — see rule 7 — never as a response to a user turn.)
+ - Do **NOT** output a `role: assistant` or `role: student` agent. "Adding a different angle" / "differentiating from peers" is valuable only AFTER the user's turn is addressed, never as the first response to it.
+ - Do **NOT** output `END` — regardless of how long the discussion has run or how thoroughly the broad TOPIC was covered. A high turn count or a well-discussed topic does NOT mean the user's specific question was answered. If the literal question is still unanswered, the discussion is NOT complete; pick the teacher.
+
+ A user turn counts as "addressed" only when an `[Agent]` turn gave a concrete answer to the literal question (a specific formula, yes/no, term, number, definition, how-to) OR, for a vague request, asked a specific clarifying question. Brief acknowledgments ("yes", "good question"), topic-adjacent explanations, and tangentially related concepts do NOT count — if that is all that happened, the turn is still unaddressed and you must pick the teacher.
+
+ Explicit frustration signals ("答非所问", "我没听懂", "重答一下", "我问的是 X 不是 Y", "You didn't answer my question") are hard confirmation the turn is unaddressed — pick the teacher id, nothing else.
+
+ This overrides rules 2 (role diversity), 3 (no repeat), 4 (END on complete), 5 (don't drag on), and 6 (brevity).
# Routing Quality (CRITICAL)
- ROLE DIVERSITY: Do NOT dispatch two agents of the same role consecutively. After a teacher speaks, the next should be a student or assistant — not another teacher-like response. After an assistant rephrases, dispatch a student who asks a question, not another assistant who also rephrases.
diff --git a/lib/store/settings.ts b/lib/store/settings.ts
index 4b8db0d7ab..b54e44c00d 100644
--- a/lib/store/settings.ts
+++ b/lib/store/settings.ts
@@ -192,7 +192,6 @@ export interface SettingsState {
// Agent settings
selectedAgentIds: string[];
- maxTurns: string;
agentMode: 'preset' | 'auto';
autoAgentCount: number;
@@ -216,7 +215,6 @@ export interface SettingsState {
setAutoPlayLecture: (autoPlay: boolean) => void;
setPlaybackSpeed: (speed: PlaybackSpeed) => void;
setSelectedAgentIds: (ids: string[]) => void;
- setMaxTurns: (turns: string) => void;
setAgentMode: (mode: 'preset' | 'auto') => void;
setAutoAgentCount: (count: number) => void;
@@ -695,7 +693,6 @@ const migrateFromOldStorage = () => {
const oldProvidersConfig = localStorage.getItem('providersConfig');
const oldTtsModel = localStorage.getItem('ttsModel');
const oldSelectedAgents = localStorage.getItem('selectedAgentIds');
- const oldMaxTurns = localStorage.getItem('maxTurns');
if (!oldLlmModel && !oldProvidersConfig) return null; // No old data
@@ -737,9 +734,6 @@ const migrateFromOldStorage = () => {
}
}
- let maxTurns = '10';
- if (oldMaxTurns) maxTurns = oldMaxTurns;
-
return {
providerId,
modelId,
@@ -747,7 +741,6 @@ const migrateFromOldStorage = () => {
providersConfig,
ttsModel,
selectedAgentIds,
- maxTurns,
};
};
@@ -775,7 +768,6 @@ export const useSettingsStore = create()(
providersConfig: initialProvidersConfig,
ttsModel: migratedData?.ttsModel || 'openai-tts',
selectedAgentIds: migratedData?.selectedAgentIds || ['default-1', 'default-2', 'default-3'],
- maxTurns: migratedData?.maxTurns?.toString() || '10',
agentMode: 'auto' as const,
autoAgentCount: 3,
@@ -889,7 +881,6 @@ export const useSettingsStore = create()(
setSelectedAgentIds: (ids) => set({ selectedAgentIds: ids }),
- setMaxTurns: (turns) => set({ maxTurns: turns }),
setAgentMode: (mode) => set({ agentMode: mode }),
setAutoAgentCount: (count) => set({ autoAgentCount: count }),
diff --git a/lib/types/chat.ts b/lib/types/chat.ts
index 797d5ccc2d..28be2586e6 100644
--- a/lib/types/chat.ts
+++ b/lib/types/chat.ts
@@ -10,7 +10,7 @@ import type { ThinkingConfig } from './provider';
// Session Types
export type SessionType = 'qa' | 'discussion' | 'lecture';
-export type SessionStatus = 'idle' | 'active' | 'interrupted' | 'completed';
+export type SessionStatus = 'idle' | 'active' | 'interrupted' | 'completed' | 'error';
/**
* Metadata attached to chat messages
@@ -59,8 +59,6 @@ export interface ChatSession {
*/
export interface SessionConfig {
agentIds: string[];
- maxTurns: number;
- currentTurn: number;
triggerAgentId?: string; // For discussion: first agent to speak
defaultAgentId?: string; // For QA: the responding agent
}
@@ -137,7 +135,6 @@ export interface CreateSessionRequest {
message?: string;
agentIds: string[];
triggerAgentId?: string;
- maxTurns?: number;
};
}
diff --git a/package.json b/package.json
index 2a9b58c1cf..65c6f5c601 100644
--- a/package.json
+++ b/package.json
@@ -19,7 +19,9 @@
"test:e2e": "playwright test",
"test:e2e:ui": "playwright test --ui",
"eval:whiteboard": "tsx eval/whiteboard-layout/runner.ts",
- "eval:outline-language": "tsx eval/outline-language/runner.ts"
+ "eval:outline-language": "tsx eval/outline-language/runner.ts",
+ "eval:orchestration": "tsx eval/orchestration/runner.ts",
+ "eval:orchestration:answering": "tsx eval/orchestration/answering-runner.ts"
},
"dependencies": {
"@ai-sdk/anthropic": "^3.0.71",