Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .runtime/journal/execution.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,12 @@
{"timestamp":"2026-03-06T18:31:50.445Z","sessionID":"ses_123","agent":"tracked-agent","intent":"execute_tool","tool":"git_safe","stdout":"Failed"}
{"timestamp":"2026-03-06T18:38:48.165Z","sessionID":"ses_correct","agent":"tracked-agent","intent":"execute_tool","tool":"fs_safe","args":{"path":"foo.txt"},"stdout":"Success","verificationState":true}
{"timestamp":"2026-03-06T18:39:21.786Z","sessionID":"ses_correct","agent":"tracked-agent","intent":"execute_tool","tool":"fs_safe","args":{"path":"foo.txt"},"stdout":"Success","verificationState":true}
{"timestamp":"2026-03-06T18:47:18.434Z","sessionID":"ses_1","agent":"tracked-agent","intent":"execute_tool","tool":"bash","args":{"command":"git commit -m 'test'"},"stdout":"Commit created on main"}
{"timestamp":"2026-03-06T18:47:18.435Z","sessionID":"ses_1","agent":"tracked-agent","intent":"execute_tool","tool":"git_safe","stdout":"Success","verificationState":true}
{"timestamp":"2026-03-06T18:47:18.435Z","sessionID":"ses_flow_test","agent":"tracked-agent","intent":"execute_tool","tool":"git_safe","stdout":"Success","verificationState":true}
{"timestamp":"2026-03-06T18:48:02.499Z","sessionID":"ses_1","agent":"tracked-agent","intent":"execute_tool","tool":"bash","args":{"command":"git commit -m 'test'"},"stdout":"Commit created on main"}
{"timestamp":"2026-03-06T18:48:02.500Z","sessionID":"ses_1","agent":"tracked-agent","intent":"execute_tool","tool":"git_safe","stdout":"Success","verificationState":true}
{"timestamp":"2026-03-06T18:48:02.501Z","sessionID":"ses_flow_test","agent":"tracked-agent","intent":"execute_tool","tool":"git_safe","stdout":"Success","verificationState":true}
{"timestamp":"2026-03-06T18:53:51.806Z","sessionID":"ses_1","agent":"tracked-agent","intent":"execute_tool","tool":"bash","args":{"command":"git commit -m 'test'"},"stdout":"Commit created on main"}
{"timestamp":"2026-03-06T18:53:51.807Z","sessionID":"ses_1","agent":"tracked-agent","intent":"execute_tool","tool":"git_safe","stdout":"Success","verificationState":true}
{"timestamp":"2026-03-06T18:53:51.807Z","sessionID":"ses_flow_test","agent":"tracked-agent","intent":"execute_tool","tool":"git_safe","stdout":"Success","verificationState":true}
10 changes: 8 additions & 2 deletions docs/implementation_upstream_capability_merge.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ Port advanced prompting and verification logic from `official/dev` to Heidi whil
- [x] **Gemini Fix**: Corrected syntax in `atlas/gemini.ts`.
- [x] **Verification**: Created `check_upstream_capability_merge.py` and integrated into `doctor.py`.

### Phase 1: Specialty Agents & Truth Model Hardening [DONE]
### Phase 1: Specialty Agents, Ralph-loop & Truth Model Hardening [DONE]
- [x] **Hephaestus Autonomy**: Ported "Deep Agent" and "Intent Gate" logic to `hephaestus/gpt-5-4.ts`.
- [x] **Ralph-loop**: Integrated runtime loop detection in `runtime-enforcement/hook.ts`.
- [x] **State Tracking**: Updated `state-ledger` and `tool-runner` to support loop detection and successful state change verification.
- [x] **Truth Model**: Unified `state-ledger` to securely track `success`, `verified`, and `changedState` tied to the current execution flow.
- [x] **Runtime Enforcement**: Tightened `hook.ts` to scan only the active completion flow for deterministic tool evidence rather than relying on weak keywords or stale ledger history.
- [x] **Completion & Query**: `complete_task` and `query_ledger` are now filtered strictly to verified, successful state changes from the active session flow.
Expand All @@ -31,8 +34,11 @@ Port advanced prompting and verification logic from `official/dev` to Heidi whil
### Automated Checks
- `python3 tools/doctor.py` (Custom capability check + Reliability check)
- `bun test src/hooks/runtime-enforcement/ src/runtime/` (Deterministic state testing)
- `bun test tests/runtime/test_deterministic_execution.test.ts` (Reliability regression)

### Manual Review
- Inspect `src/runtime/state-ledger.ts` for strict verifiable payloads.
- Inspect `src/hooks/tool-contract/hook.ts` for deterministic rejection logic.
- Verify `dynamic-agent-prompt-builder.ts` is fully eradicated from imports.
- Verify `dynamic-agent-prompt-builder.ts` is removed from imports wherever it is no longer needed.
- Inspect `src/agents/atlas/gemini.ts` for clean syntax.
- Inspect `src/hooks/runtime-enforcement/hook.ts` for loop guard logic.
8 changes: 4 additions & 4 deletions packages/darwin-arm64/bin/index.js.map

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions packages/darwin-x64-baseline/bin/index.js.map

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions packages/darwin-x64/bin/index.js.map

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions packages/linux-arm64-musl/bin/index.js.map

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions packages/linux-arm64/bin/index.js.map

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions packages/linux-x64-baseline/bin/index.js.map

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions packages/linux-x64-musl-baseline/bin/index.js.map

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions packages/linux-x64-musl/bin/index.js.map

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions packages/linux-x64/bin/index.js.map

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions packages/windows-x64-baseline/bin/index.js.map

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions packages/windows-x64/bin/index.js.map

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions src/agents/hephaestus/gpt-5-4.ts
Original file line number Diff line number Diff line change
Expand Up @@ -326,13 +326,12 @@ Every \`task()\` output includes a session_id. USE IT for follow-ups.
- Follow-up on result — \`session_id="{id}", prompt="Also: {question}"\`
- Verification failed — \`session_id="{id}", prompt="Failed: {error}. Fix."\`

${
oracleSection
? `
${oracleSection
? `
${oracleSection}
`
: ""
}
: ""
}

## Output Contract

Expand Down
9 changes: 4 additions & 5 deletions src/agents/hephaestus/gpt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -274,13 +274,12 @@ Every \`task()\` output includes a session_id. **USE IT for follow-ups.**
- **Follow-up on result** — \`session_id="{id}", prompt="Also: {question}"\`
- **Verification failed** — \`session_id="{id}", prompt="Failed: {error}. Fix."\`

${
oracleSection
? `
${oracleSection
? `
${oracleSection}
`
: ""
}
: ""
}

## Output Contract

Expand Down
3 changes: 1 addition & 2 deletions src/cli/model-fallback-requirements.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ export const CLI_AGENT_MODEL_REQUIREMENTS: Record<string, ModelRequirement> = {
},
hephaestus: {
fallbackChain: [
{ providers: ["openai", "opencode"], model: "gpt-5.3-codex", variant: "medium" },
{ providers: ["anthropic", "github-copilot", "opencode"], model: "claude-opus-4-6", variant: "max" },
{ providers: ["openai", "opencode"], model: "gpt-5.3-codex", variant: "medium" },
],
},
oracle: {
Expand Down Expand Up @@ -105,7 +105,6 @@ export const CLI_CATEGORY_MODEL_REQUIREMENTS: Record<string, ModelRequirement> =
{ providers: ["anthropic", "github-copilot", "opencode"], model: "claude-opus-4-6", variant: "max" },
{ providers: ["google", "github-copilot", "opencode"], model: "gemini-3.1-pro", variant: "high" },
],
requiresModel: "gpt-5.3-codex",
},
artistry: {
fallbackChain: [
Expand Down
20 changes: 0 additions & 20 deletions src/hooks/execution-journal/hook.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,26 +42,6 @@ export function createExecutionJournalHook(ctx: PluginInput) {
const verified = output.metadata?.verified !== false // Defaults to true unless explicitly unverified

ledger.record(payload.type, payload.key, success, verified, changedState, output.output, payload.details, input.sessionID)
} else {
// Fallback heuristics for raw bash operations (if not yet disabled)
if (input.tool === "interactive_bash" || input.tool === "bash") {
const args = output.metadata?.args as any
const command = args?.command || ""

if (typeof command === "string") {
if (command.includes("git push") && output.output.includes("Everything up-to-date") === false) {
ledger.record("git.push", "origin", true, true, true, output.output, { command }, input.sessionID)
} else if (command.includes("git commit")) {
ledger.record("git.commit", "HEAD", true, true, true, output.output, { command }, input.sessionID)
} else if (command.includes("gh pr create") && output.output.includes("https://github.com")) {
// naive extraction for bash fallback
const urlMatch = output.output.match(/https:\/\/github\.com[^\s]+/)
if (urlMatch) {
ledger.record("git.pr", urlMatch[0], true, true, true, output.output, { command }, input.sessionID)
}
}
}
}
}
}
}
Expand Down
5 changes: 5 additions & 0 deletions src/hooks/runtime-enforcement/hook.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import type { PluginInput } from "@opencode-ai/plugin"
import type { Message, Part } from "@opencode-ai/sdk"
import { ledger } from "../../runtime/state-ledger"

/**
* Runtime Enforcement Gate
Expand Down Expand Up @@ -30,6 +31,10 @@ export function createRuntimeEnforcementHook(_ctx: PluginInput) {
_input: any,
output: { messages: { info: Message; parts: Part[] }[] }
) => {
// Mark the start of a new completion flow verification.
// This ensures entries from previous turns/flows in the same session are ignored.
ledger.startNewFlow()

const assistantMessages = output.messages.filter(m => m.info.role === "assistant")
if (assistantMessages.length === 0) return

Expand Down
Loading