From 85ca6dbd1716c12a0efeb97c400e85682816ed8b Mon Sep 17 00:00:00 2001
From: perplexity-computer <agent@trinity.local>
Date: Sun, 3 May 2026 14:46:17 +0000
Subject: [PATCH] fix(watchdog): honest error reporting (PROBE_FAILED +
 TOOL_ERROR vs DRIFT)

Closes the 5-day window where audit-watchdog comments on trios#143 said
'verdict: DRIFT, ledger_rows: 0' even when tri-railway never produced a
JSON line at all (token rotated / auth scope wrong / network timeout).

Root cause:
  Both 'Audit Acc1/Acc2' steps had a hardcoded fallback that synthesized
  '{verdict:DRIFT, ledger_rows:0, services:0, exit_code:1}' on missing
  JSON output. This conflated three distinct failure modes:
    - genuine DRIFT (services found, divergence detected)
    - silent writer (services up, but writer not emitting rows)
    - tool/auth failure (we couldn't even ask)
  Reporting all three as 'DRIFT, 0 rows' was an R5 honesty violation.

Fix:
  1. Pre-probe each Railway token via 'query{me{id}}' against
     backboard.railway.com/graphql/v2 BEFORE running tri-railway.
     Probe outcomes: ok / missing_token / not_authorized / network_error
     / http_<code> / unknown_response.
  2. If probe != ok, skip tri-railway and emit:
       {verdict:'PROBE_FAILED', exit_code:3, services:null,
        ledger_rows:null, probe_reason:<reason>}
  3. If probe == ok but tri-railway emits no JSON line:
       {verdict:'TOOL_ERROR', exit_code:4, services:null, ledger_rows:null}
  4. Digest renders null services/ledger_rows as '?' instead of '0'
     (zero has a specific meaning - empty table, writer silent - and
     must not be conflated with 'we could not ask').
  5. Combined-exit logic: 3 (probe) and 4 (tool) are NOT folded into 1
     (DRIFT). New comment block surfaces the probe reason; new warn
     steps log without failing the workflow on probe/tool errors.

Cross-validation (TRI gateway, 2026-05-03 14:30Z):
  - Acc1 IGLA project list: 200 OK, 6 services
  - default project list: GraphQL 'Not Authorized' - matches the
    'Acc2 token rotated since 2026-04-27' hypothesis from #61

Refs:
  - issue #16 (audit-watchdog scope)
  - issue #61 (RailwayMultiClient P0 - the underlying single-token
    limitation that PROBE_FAILED surfaces honestly until #61 lands)
  - trios#143 (race tracker - 5 days of misleading watchdog comments)

phi^2 + phi^-2 = 3 - TRINITY - NEVER STOP
---
 .github/workflows/audit-watchdog.yml | 147 ++++++++++++++++++++++++---
 1 file changed, 134 insertions(+), 13 deletions(-)
diff --git a/.github/workflows/audit-watchdog.yml b/.github/workflows/audit-watchdog.yml
index e22ee05c..24688486 100644
--- a/.github/workflows/audit-watchdog.yml
+++ b/.github/workflows/audit-watchdog.yml
@@ -70,12 +70,60 @@ jobs:
       - name: Build tri-railway
         run: cargo build --release --bin tri-railway --locked
 
+      - name: Pre-probe Acc1 token (auth sanity)
+        id: acc1_probe
+        env:
+          RAILWAY_TOKEN: ${{ env.ACC1_TOKEN }}
+        run: |
+          # Auth sanity check for the Acc1 token, run before tri-railway.
+          # One `query{me{id}}` request goes to the Railway GraphQL
+          # endpoint and the outcome is written to the `probe` step
+          # output as exactly one of:
+          #   ok / missing_token / network_error / not_authorized /
+          #   http_<code> / unknown_response
+          # The audit step reads `probe` and only runs tri-railway on
+          # `ok`; anything else is reported there as PROBE_FAILED.
+          body=/tmp/acc1_probe.json
+          if [[ -z "${RAILWAY_TOKEN:-}" ]]; then
+            echo 'probe=missing_token' >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          # From here on curl/jq failures are classified, not fatal.
+          set +e
+          status=$(curl -sS -o "$body" -w '%{http_code}' --max-time 10 \
+              -H "Authorization: Bearer $RAILWAY_TOKEN" \
+              -H 'Content-Type: application/json' \
+              -d '{"query":"query{me{id}}"}' \
+              https://backboard.railway.com/graphql/v2)
+          # $? below is curl's own exit status (transport error / timeout).
+          if [[ $? -ne 0 ]]; then reason=network_error
+          elif jq -e '.errors[0].message | test("Not Authorized")' "$body" >/dev/null 2>&1; then
+            reason=not_authorized
+          elif [[ "$status" != "200" ]]; then reason="http_$status"
+          elif jq -e '.data.me.id' "$body" >/dev/null 2>&1; then reason=ok
+          else reason=unknown_response
+          fi
+          echo "probe=$reason" >> "$GITHUB_OUTPUT"
+
       - name: Audit Acc1 / IGLA
         id: acc1
         env:
           RAILWAY_TOKEN: ${{ env.ACC1_TOKEN }}
+          PROBE: ${{ steps.acc1_probe.outputs.probe }}
         run: |
           set +e
+          # If the pre-probe failed, do NOT run tri-railway and do NOT
+          # emit a synthetic DRIFT verdict. Emit PROBE_FAILED with the
+          # exact probe reason — this is honest reporting (R5).
+          if [[ "$PROBE" != "ok" ]]; then
+            jq -n --arg p "$PROBE" --arg proj "$ACC1_PROJECT" --argjson t $TARGET \
+              '{error:("pre-probe failed: "+$p),verdict:"PROBE_FAILED",exit_code:3,services:null,events:[],ledger_rows:null,target:$t,project:$proj,project_name:"unknown",probe_reason:$p}' \
+              > /tmp/acc1.json
+            echo 'exit=3' >> "$GITHUB_OUTPUT"
+            echo "verdict=PROBE_FAILED" >> "$GITHUB_OUTPUT"
+            cat /tmp/acc1.json
+            exit 0
+          fi
           # tri-railway prints its text summary AND a single-line JSON
           # blob (with --json) to stdout, plus tracing-subscriber lines
           # on stderr. Capture stdout to acc1.txt and stderr to acc1.log;
@@ -96,19 +144,64 @@ jobs:
           # against text summary above and tracing being merged in.
           grep -E '^\s*\{' /tmp/acc1.txt | tail -n 1 > /tmp/acc1.json
           if [[ ! -s /tmp/acc1.json ]]; then
-            echo '{"error":"no JSON line in audit run output","verdict":"DRIFT","exit_code":1,"services":0,"events":[],"ledger_rows":0,"target":'"$TARGET"',"project":"'"$ACC1_PROJECT"'","project_name":"unknown"}' > /tmp/acc1.json
+            # Honest fallback: the probe was OK and tri-railway ran, but it
+            # printed no JSON line. Report TOOL_ERROR with null (unknown)
+            # services/ledger_rows, not DRIFT with 0 (0 = writer silent).
+            # Also publish exit=4 (overrides any earlier `exit` output) so
+            jq -n --arg proj "$ACC1_PROJECT" --argjson t "$TARGET" \
+              '{error:"no JSON line in audit run output",verdict:"TOOL_ERROR",exit_code:4,services:null,events:[],ledger_rows:null,target:$t,project:$proj,project_name:"unknown"}' > /tmp/acc1.json
+            echo 'exit=4' >> "$GITHUB_OUTPUT"  # the digest can reach combined=4
           fi
           V=$(jq -r '.verdict // "?"' /tmp/acc1.json 2>/dev/null || echo '?')
           # GitHub Actions outputs need key=value (no spaces in value
           # without explicit format), so encode the verdict.
           printf 'verdict=%s\n' "${V// /_}" >> "$GITHUB_OUTPUT"
 
+      - name: Pre-probe Acc2 token (auth sanity)
+        id: acc2_probe
+        env:
+          RAILWAY_TOKEN: ${{ env.ACC2_TOKEN }}
+        run: |
+          # Auth sanity check for the Acc2 token: one `query{me{id}}` call,
+          # outcome written to the `probe` step output for the audit step.
+          body=/tmp/acc2_probe.json
+          if [[ -z "${RAILWAY_TOKEN:-}" ]]; then
+            echo 'probe=missing_token' >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          # From here on curl/jq failures are classified, not fatal.
+          set +e
+          status=$(curl -sS -o "$body" -w '%{http_code}' --max-time 10 \
+              -H "Authorization: Bearer $RAILWAY_TOKEN" \
+              -H 'Content-Type: application/json' \
+              -d '{"query":"query{me{id}}"}' \
+              https://backboard.railway.com/graphql/v2)
+          # $? below is curl's own exit status (transport error / timeout).
+          if [[ $? -ne 0 ]]; then reason=network_error
+          elif jq -e '.errors[0].message | test("Not Authorized")' "$body" >/dev/null 2>&1; then
+            reason=not_authorized
+          elif [[ "$status" != "200" ]]; then reason="http_$status"
+          elif jq -e '.data.me.id' "$body" >/dev/null 2>&1; then reason=ok
+          else reason=unknown_response
+          fi
+          echo "probe=$reason" >> "$GITHUB_OUTPUT"
+
       - name: Audit Acc2 / IGLA-MIRROR-2
         id: acc2
         env:
           RAILWAY_TOKEN: ${{ env.ACC2_TOKEN }}
+          PROBE: ${{ steps.acc2_probe.outputs.probe }}
         run: |
           set +e
+          if [[ "$PROBE" != "ok" ]]; then
+            jq -n --arg p "$PROBE" --arg proj "$ACC2_PROJECT" --argjson t $TARGET \
+              '{error:("pre-probe failed: "+$p),verdict:"PROBE_FAILED",exit_code:3,services:null,events:[],ledger_rows:null,target:$t,project:$proj,project_name:"unknown",probe_reason:$p}' \
+              > /tmp/acc2.json
+            echo 'exit=3' >> "$GITHUB_OUTPUT"
+            echo "verdict=PROBE_FAILED" >> "$GITHUB_OUTPUT"
+            cat /tmp/acc2.json
+            exit 0
+          fi
           ./target/release/tri-railway audit run \
               --project "$ACC2_PROJECT" \
               --target "$TARGET" \
@@ -121,7 +214,9 @@ jobs:
           cat /tmp/acc2.log || true
           grep -E '^\s*\{' /tmp/acc2.txt | tail -n 1 > /tmp/acc2.json
           if [[ ! -s /tmp/acc2.json ]]; then
-            echo '{"error":"no JSON line in audit run output","verdict":"DRIFT","exit_code":1,"services":0,"events":[],"ledger_rows":0,"target":'"$TARGET"',"project":"'"$ACC2_PROJECT"'","project_name":"unknown"}' > /tmp/acc2.json
+            # No JSON line from tri-railway: TOOL_ERROR (state unknown, not DRIFT); publish exit=4 so the digest sees it.
+            jq -n --arg proj "$ACC2_PROJECT" --argjson t "$TARGET" '{error:"no JSON line in audit run output",verdict:"TOOL_ERROR",exit_code:4,services:null,events:[],ledger_rows:null,target:$t,project:$proj,project_name:"unknown"}' > /tmp/acc2.json
+            echo 'exit=4' >> "$GITHUB_OUTPUT"  # overrides any earlier `exit` output of this step
           fi
           V=$(jq -r '.verdict // "?"' /tmp/acc2.json 2>/dev/null || echo '?')
           printf 'verdict=%s\n' "${V// /_}" >> "$GITHUB_OUTPUT"
@@ -130,19 +225,30 @@ jobs:
         id: digest
         run: |
           ts=$(date -u +%Y-%m-%dT%H:%MZ)
+          # Honest digest: render `null` ledger_rows / services as '?' instead
+          # of '0'. '0' has a specific meaning (writer is silent, table is
+          # empty); '?' means "we couldn't even ask". Conflating the two
+          # is what made the watchdog lie for 5 days straight (trios#143
+          # comments 2026-04-27..2026-05-03).
+          fmt() { jq -r 'if . == null then "?" else tostring end' <<<"$1"; }
           a1v=$(jq -r '.verdict      // "?"' /tmp/acc1.json)
-          a1s=$(jq -r '.services     // 0'   /tmp/acc1.json)
+          a1s=$(fmt "$(jq -r '.services'    /tmp/acc1.json)")
           a1e=$(jq -r '.events|length // 0'  /tmp/acc1.json)
-          a1l=$(jq -r '.ledger_rows  // 0'   /tmp/acc1.json)
+          a1l=$(fmt "$(jq -r '.ledger_rows' /tmp/acc1.json)")
+          a1p=$(jq -r '.probe_reason // ""'  /tmp/acc1.json)
           a2v=$(jq -r '.verdict      // "?"' /tmp/acc2.json)
-          a2s=$(jq -r '.services     // 0'   /tmp/acc2.json)
+          a2s=$(fmt "$(jq -r '.services'    /tmp/acc2.json)")
           a2e=$(jq -r '.events|length // 0'  /tmp/acc2.json)
-          a2l=$(jq -r '.ledger_rows  // 0'   /tmp/acc2.json)
+          a2l=$(fmt "$(jq -r '.ledger_rows' /tmp/acc2.json)")
+          a2p=$(jq -r '.probe_reason // ""'  /tmp/acc2.json)
           x1=${{ steps.acc1.outputs.exit }}
           x2=${{ steps.acc2.outputs.exit }}
 
-          # Combined exit. PASS only when both clusters agree.
-          if [[ "$x1" == "1" || "$x2" == "1" ]]; then combined=1
+          # Combined exit. PASS only when both agree on PASS. A real DRIFT (1)
+          # outranks PROBE_FAILED (3) / TOOL_ERROR (4) so it is never hidden.
+          if [[ "$x1" == "1" || "$x2" == "1" ]]; then combined=1
+          elif [[ "$x1" == "3" || "$x2" == "3" ]]; then combined=3
+          elif [[ "$x1" == "4" || "$x2" == "4" ]]; then combined=4
           elif [[ "$x1" == "0" && "$x2" == "0" ]]; then combined=0
           else combined=2
           fi
@@ -151,13 +257,17 @@ jobs:
           {
             echo "## 🛰️ Audit Watchdog — $ts (target BPB $TARGET)"
             echo ""
-            echo "| Cluster | Project | Services | Ledger rows | Drift events | Verdict | Exit |"
-            echo "|---|---|---:|---:|---:|---|---:|"
-            echo "| Acc1 / IGLA           | $ACC1_PROJECT | $a1s | $a1l | $a1e | $a1v | $x1 |"
-            echo "| Acc2 / IGLA-MIRROR-2  | $ACC2_PROJECT | $a2s | $a2l | $a2e | $a2v | $x2 |"
+            echo "| Cluster | Project | Services | Ledger rows | Drift events | Verdict | Exit | Probe |"
+            echo "|---|---|---:|---:|---:|---|---:|---|"
+            echo "| Acc1 / IGLA           | $ACC1_PROJECT | $a1s | $a1l | $a1e | $a1v | $x1 | $a1p |"
+            echo "| Acc2 / IGLA-MIRROR-2  | $ACC2_PROJECT | $a2s | $a2l | $a2e | $a2v | $x2 | $a2p |"
             echo ""
-            echo "**Combined exit:** $combined  (0 = GATE-2 PASS · 1 = DRIFT · 2 = NOT YET)"
+            echo "**Combined exit:** $combined  (0 = GATE-2 PASS · 1 = DRIFT · 2 = NOT YET · 3 = PROBE_FAILED · 4 = TOOL_ERROR)"
             echo ""
+            if [[ "$combined" == "3" ]]; then
+              echo "> ⚠️ One or both Railway tokens failed the auth pre-probe. This is **not** a DRIFT — services and ledger state are unknown until the token is repaired (see [#61 RailwayMultiClient P0](https://github.com/gHashTag/trios-railway/issues/61))."
+              echo ""
+            fi
             echo "_Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}_"
             echo ""
             echo "phi^2 + phi^-2 = 3 · TRINITY · NEVER STOP"
@@ -188,3 +298,14 @@ jobs:
         run: |
           echo "::error::DRIFT detected on at least one cluster (acc1=${{ steps.acc1.outputs.exit }}, acc2=${{ steps.acc2.outputs.exit }})"
           exit 1
+
+      - name: Warn on PROBE_FAILED (do not fail)
+        if: steps.digest.outputs.combined == '3'  # annotation only; the step itself always succeeds
+        run: |
+          printf '%s\n' "::warning::Pre-probe failed on at least one cluster — token repair required, NOT a DRIFT (acc1=${{ steps.acc1.outputs.exit }}, acc2=${{ steps.acc2.outputs.exit }})"
+
+      - name: Warn on TOOL_ERROR (do not fail)
+        if: steps.digest.outputs.combined == '4'  # annotation only; the step itself always succeeds
+        run: |
+          printf '%s\n' "::warning::tri-railway produced no JSON output on at least one cluster — tool bug, NOT a DRIFT (acc1=${{ steps.acc1.outputs.exit }}, acc2=${{ steps.acc2.outputs.exit }})"
+