diff --git a/.env.schema b/.env.schema index 827538c..78876ea 100644 --- a/.env.schema +++ b/.env.schema @@ -190,3 +190,27 @@ BRIDGE_API_PORT=7890 # Target pi session ID (auto-detects control-agent if unset) # @sensitive=false @type=string PI_SESSION_ID= + +# Bridge restart policy mode: legacy (fixed 5s restart) or adaptive (backoff + jitter) +# @sensitive=false @type=string +BAUDBOT_BRIDGE_RESTART_POLICY= + +# Adaptive restart base delay (seconds) +# @sensitive=false @type=number +BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS= + +# Adaptive restart max delay cap (seconds) +# @sensitive=false @type=number +BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS= + +# Adaptive restart stable runtime window before counters reset (seconds) +# @sensitive=false @type=number +BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS= + +# Adaptive restart degraded-state threshold for consecutive failures +# @sensitive=false @type=number +BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES= + +# Adaptive restart random jitter upper bound (seconds) +# @sensitive=false @type=number +BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS= diff --git a/CONFIGURATION.md b/CONFIGURATION.md index 4149793..5ec38ce 100644 --- a/CONFIGURATION.md +++ b/CONFIGURATION.md @@ -172,6 +172,12 @@ Set during `setup.sh` / `baudbot install` via env vars: |----------|-------------|---------| | `BRIDGE_API_PORT` | Local HTTP API port for outbound Slack messages | `7890` | | `PI_SESSION_ID` | Target pi session ID for the bridge | Auto-detects control-agent | +| `BAUDBOT_BRIDGE_RESTART_POLICY` | Bridge supervisor mode (`legacy` or `adaptive`) | auto (`legacy` unless adaptive knobs are set) | +| `BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS` | Adaptive mode base restart delay | `5` | +| `BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS` | Adaptive mode max backoff delay | `300` | +| `BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS` | Runtime window that resets failure/backoff counters | `120` | +| `BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES` | Threshold that 
marks supervisor state as degraded (`threshold_exceeded`) | `5` | +| `BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS` | Random jitter added to each adaptive restart sleep | `2` | ## Example `.env` File diff --git a/bin/deploy.sh b/bin/deploy.sh index 10de7b5..2a06dcc 100755 --- a/bin/deploy.sh +++ b/bin/deploy.sh @@ -87,6 +87,7 @@ if [ "$DRY_RUN" -eq 0 ]; then [ -f "$BAUDBOT_SRC/bin/$script" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/bin/$script" "$STAGE_DIR/bin/$script" done [ -f "$BAUDBOT_SRC/bin/lib/runtime-node.sh" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/bin/lib/runtime-node.sh" "$STAGE_DIR/bin/lib/runtime-node.sh" + [ -f "$BAUDBOT_SRC/bin/lib/bridge-restart-policy.sh" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/bin/lib/bridge-restart-policy.sh" "$STAGE_DIR/bin/lib/bridge-restart-policy.sh" [ -f "$BAUDBOT_SRC/pi/settings.json" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/pi/settings.json" "$STAGE_DIR/settings.json" [ -f "$BAUDBOT_SRC/.env.schema" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/.env.schema" "$STAGE_DIR/.env.schema" chmod -R a+rX "$STAGE_DIR" @@ -263,6 +264,12 @@ if [ "$DRY_RUN" -eq 0 ]; then log "✓ bin/lib/runtime-node.sh" fi + if [ -f "$STAGE_DIR/bin/lib/bridge-restart-policy.sh" ]; then + as_agent cp "$STAGE_DIR/bin/lib/bridge-restart-policy.sh" "$BAUDBOT_HOME/runtime/bin/lib/bridge-restart-policy.sh" + as_agent chmod u+r "$BAUDBOT_HOME/runtime/bin/lib/bridge-restart-policy.sh" + log "✓ bin/lib/bridge-restart-policy.sh" + fi + as_agent cp "$STAGE_DIR/start.sh" "$BAUDBOT_HOME/runtime/start.sh" as_agent chmod u+x "$BAUDBOT_HOME/runtime/start.sh" log "✓ start.sh" diff --git a/bin/lib/baudbot-runtime.sh b/bin/lib/baudbot-runtime.sh index f45a20f..3295a7f 100644 --- a/bin/lib/baudbot-runtime.sh +++ b/bin/lib/baudbot-runtime.sh @@ -193,6 +193,61 @@ PY [ -n "$components_line" ] && echo -e "${BOLD}broker health:${RESET} $components_line" } +print_bridge_supervisor_status() { + local agent_user="${BAUDBOT_AGENT_USER:-baudbot_agent}" + local 
status_file="/home/$agent_user/.pi/agent/slack-bridge-supervisor.json" + local summary="" + local mode="" + local state="" + local failures="" + local threshold="" + + if [ ! -r "$status_file" ]; then + return 0 + fi + + summary="$(python3 - "$status_file" <<'PY' +import json +import sys + +path = sys.argv[1] +try: + with open(path, 'r', encoding='utf-8') as f: + data = json.load(f) +except Exception: + print('') + sys.exit(0) + +print(data.get('mode', 'unknown')) +print(data.get('state', 'unknown')) +print(data.get('consecutive_failures', 0)) +print(data.get('max_consecutive_failures', 0)) +PY + )" + + mode="$(printf '%s\n' "$summary" | sed -n '1p')" + state="$(printf '%s\n' "$summary" | sed -n '2p')" + failures="$(printf '%s\n' "$summary" | sed -n '3p')" + threshold="$(printf '%s\n' "$summary" | sed -n '4p')" + + [ -n "$mode" ] || return 0 + + case "$state" in + threshold_exceeded) + echo -e "${BOLD}bridge supervisor:${RESET} degraded (mode=$mode failures=$failures threshold=$threshold)" + ;; + restarting) + echo -e "${BOLD}bridge supervisor:${RESET} restarting (mode=$mode failures=$failures)" + ;; + running) + echo -e "${BOLD}bridge supervisor:${RESET} healthy (mode=$mode)" + ;; + *) + echo -e "${BOLD}bridge supervisor:${RESET} $state (mode=$mode)" + ;; + esac +} + pi_control_dir() { local agent_user="${1:-baudbot_agent}" echo "/home/$agent_user/.pi/session-control" @@ -290,6 +345,7 @@ cmd_status() { echo "" print_deployed_version print_broker_connection_status + print_bridge_supervisor_status exit "$status_rc" fi @@ -302,6 +358,7 @@ cmd_status() { echo "" print_deployed_version print_broker_connection_status + print_bridge_supervisor_status } cmd_logs() { diff --git a/bin/lib/bridge-restart-policy.sh b/bin/lib/bridge-restart-policy.sh new file mode 100644 index 0000000..d973cef --- /dev/null +++ b/bin/lib/bridge-restart-policy.sh @@ -0,0 +1,189 @@ +#!/usr/bin/env bash +# Shared Slack bridge restart policy helpers. 
+ +bb_bridge_policy_mode() { + if [ -n "${BAUDBOT_BRIDGE_RESTART_POLICY:-}" ]; then + case "${BAUDBOT_BRIDGE_RESTART_POLICY}" in + adaptive|ADAPTIVE|Adaptive) echo "adaptive"; return 0 ;; + legacy|LEGACY|Legacy) echo "legacy"; return 0 ;; + esac + fi + + if [ -n "${BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS:-}" ] \ + || [ -n "${BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS:-}" ] \ + || [ -n "${BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS:-}" ] \ + || [ -n "${BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES:-}" ] \ + || [ -n "${BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS:-}" ]; then + echo "adaptive" + return 0 + fi + + # Backward-compatible fallback when no policy configuration is provided. + echo "legacy" +} + +bb_bridge_policy_int() { + local raw="${1:-}" + local fallback="${2:-0}" + + if [ -z "$raw" ]; then + echo "$fallback" + return 0 + fi + + if [[ "$raw" =~ ^[0-9]+$ ]]; then + echo "$((10#$raw))" + return 0 + fi + + echo "$fallback" +} + +bb_bridge_policy_compute_next_delay() { + local current="$1" + local max_delay="$2" + local doubled=$((current * 2)) + + if [ "$doubled" -gt "$max_delay" ]; then + echo "$max_delay" + else + echo "$doubled" + fi +} + +bb_bridge_policy_random_jitter() { + local max_jitter="$1" + + if [ "$max_jitter" -le 0 ]; then + echo 0 + return 0 + fi + + echo $((RANDOM % (max_jitter + 1))) +} + +bb_bridge_policy_log() { + local log_file="$1" + shift + + if [ -z "$log_file" ]; then + return 0 + fi + + printf '[%s] bridge-supervisor %s\n' "$(date -Is)" "$*" >>"$log_file" +} + +bb_bridge_policy_write_status() { + local status_file="$1" + local mode="$2" + local bridge_script="$3" + local state="$4" + local consecutive_failures="$5" + local delay_seconds="$6" + local max_failures="$7" + local last_exit_code="$8" + local last_runtime_seconds="$9" + + [ -n "$status_file" ] || return 0 + mkdir -p "$(dirname "$status_file")" 2>/dev/null || true + + cat >"$status_file" <>"$log_file" 2>&1; then + exit_code=0 + else + exit_code=$? 
+ fi + + bb_bridge_policy_log "$log_file" "event=restart_scheduled mode=legacy script=$bridge_script exit_code=$exit_code delay_seconds=5" + bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "restarting" 0 5 0 "$exit_code" 0 + sleep 5 + done + fi + + local base_delay max_delay stable_window max_failures max_jitter + base_delay="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS:-}" 5)" + max_delay="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS:-}" 300)" + stable_window="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS:-}" 120)" + max_failures="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES:-}" 5)" + max_jitter="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS:-}" 2)" + + if [ "$max_delay" -lt "$base_delay" ]; then + max_delay="$base_delay" + fi + + local consecutive_failures=0 + local current_delay="$base_delay" + + bb_bridge_policy_log "$log_file" "event=policy_selected mode=adaptive base_delay_seconds=$base_delay max_delay_seconds=$max_delay stable_window_seconds=$stable_window max_consecutive_failures=$max_failures max_jitter_seconds=$max_jitter" + bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "running" "$consecutive_failures" "$current_delay" "$max_failures" 0 0 + + while true; do + local started_at finished_at runtime_seconds exit_code + started_at="$(date +%s)" + if "$@" >>"$log_file" 2>&1; then + exit_code=0 + else + exit_code=$? 
+ fi + finished_at="$(date +%s)" + runtime_seconds=$((finished_at - started_at)) + + local reset_failures=0 + local scheduled_delay="$current_delay" + if [ "$runtime_seconds" -ge "$stable_window" ]; then + reset_failures=1 + consecutive_failures=0 + scheduled_delay="$base_delay" + current_delay="$base_delay" + bb_bridge_policy_log "$log_file" "event=stable_window_reset mode=adaptive script=$bridge_script runtime_seconds=$runtime_seconds stable_window_seconds=$stable_window" + else + consecutive_failures=$((consecutive_failures + 1)) + scheduled_delay="$current_delay" + current_delay="$(bb_bridge_policy_compute_next_delay "$current_delay" "$max_delay")" + fi + + local jitter_seconds total_sleep_seconds + jitter_seconds="$(bb_bridge_policy_random_jitter "$max_jitter")" + total_sleep_seconds=$((scheduled_delay + jitter_seconds)) + + local state="restarting" + if [ "$max_failures" -gt 0 ] && [ "$consecutive_failures" -ge "$max_failures" ]; then + state="threshold_exceeded" + bb_bridge_policy_log "$log_file" "event=restart_threshold_exceeded mode=adaptive script=$bridge_script consecutive_failures=$consecutive_failures threshold=$max_failures exit_code=$exit_code runtime_seconds=$runtime_seconds" + fi + + bb_bridge_policy_log "$log_file" "event=restart_scheduled mode=adaptive script=$bridge_script exit_code=$exit_code runtime_seconds=$runtime_seconds reset_failures=$reset_failures consecutive_failures=$consecutive_failures backoff_seconds=$scheduled_delay next_backoff_seconds=$current_delay jitter_seconds=$jitter_seconds sleep_seconds=$total_sleep_seconds" + bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "$state" "$consecutive_failures" "$scheduled_delay" "$max_failures" "$exit_code" "$runtime_seconds" + + sleep "$total_sleep_seconds" + done +} diff --git a/bin/lib/bridge-restart-policy.test.sh b/bin/lib/bridge-restart-policy.test.sh new file mode 100644 index 0000000..60dfcb6 --- /dev/null +++ b/bin/lib/bridge-restart-policy.test.sh @@ -0,0 
+1,113 @@ +#!/bin/bash +# Tests for bin/lib/bridge-restart-policy.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +# shellcheck source=bin/lib/bridge-restart-policy.sh +source "$SCRIPT_DIR/bridge-restart-policy.sh" + +TOTAL=0 +PASSED=0 +FAILED=0 + +run_test() { + local name="$1" + shift + local out + + TOTAL=$((TOTAL + 1)) + printf " %-45s " "$name" + + out="$(mktemp /tmp/baudbot-bridge-restart-policy-test.XXXXXX)" + if "$@" >"$out" 2>&1; then + echo "✓" + PASSED=$((PASSED + 1)) + else + echo "✗ FAILED" + tail -40 "$out" | sed 's/^/ /' + FAILED=$((FAILED + 1)) + fi + rm -f "$out" +} + +test_mode_defaults_to_legacy() { + ( + set -euo pipefail + unset BAUDBOT_BRIDGE_RESTART_POLICY + unset BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS + unset BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS + unset BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS + unset BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES + unset BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS + + [ "$(bb_bridge_policy_mode)" = "legacy" ] + ) +} + +test_mode_uses_explicit_policy_override() { + ( + set -euo pipefail + export BAUDBOT_BRIDGE_RESTART_POLICY="adaptive" + [ "$(bb_bridge_policy_mode)" = "adaptive" ] + + export BAUDBOT_BRIDGE_RESTART_POLICY="legacy" + [ "$(bb_bridge_policy_mode)" = "legacy" ] + ) +} + +test_mode_enables_adaptive_when_policy_vars_set() { + ( + set -euo pipefail + unset BAUDBOT_BRIDGE_RESTART_POLICY + export BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS="7" + + [ "$(bb_bridge_policy_mode)" = "adaptive" ] + ) +} + +test_int_parser_falls_back_for_invalid_values() { + ( + set -euo pipefail + [ "$(bb_bridge_policy_int "" 9)" = "9" ] + [ "$(bb_bridge_policy_int "abc" 9)" = "9" ] + [ "$(bb_bridge_policy_int "12" 9)" = "12" ] + ) +} + +test_next_delay_doubles_and_caps() { + ( + set -euo pipefail + [ "$(bb_bridge_policy_compute_next_delay 5 30)" = "10" ] + [ "$(bb_bridge_policy_compute_next_delay 20 30)" = "30" ] + ) +} + +test_jitter_within_range() { + ( + set -euo pipefail + local n val + for 
((n = 0; n < 50; n++)); do + val="$(bb_bridge_policy_random_jitter 2)" + [ "$val" -ge 0 ] + [ "$val" -le 2 ] + done + ) +} + +echo "=== bridge-restart-policy tests ===" +echo "" + +run_test "mode: defaults to legacy" test_mode_defaults_to_legacy +run_test "mode: explicit policy override" test_mode_uses_explicit_policy_override +run_test "mode: adaptive when vars set" test_mode_enables_adaptive_when_policy_vars_set +run_test "int parser: invalid values fallback" test_int_parser_falls_back_for_invalid_values +run_test "backoff: doubles and caps" test_next_delay_doubles_and_caps +run_test "jitter: bounded range" test_jitter_within_range + +echo "" +echo "=== $PASSED/$TOTAL passed, $FAILED failed ===" + +if [ "$FAILED" -gt 0 ]; then + exit 1 +fi diff --git a/pi/skills/control-agent/SKILL.md b/pi/skills/control-agent/SKILL.md index 352f1ec..ff6bd72 100644 --- a/pi/skills/control-agent/SKILL.md +++ b/pi/skills/control-agent/SKILL.md @@ -345,6 +345,7 @@ If you need to restart the bridge manually, rerun startup cleanup and then inspe ```bash bash ~/.pi/agent/skills/control-agent/startup-cleanup.sh UUID1 UUID2 UUID3 tail -n 200 ~/.pi/agent/logs/slack-bridge.log +cat ~/.pi/agent/slack-bridge-supervisor.json ``` Verify: `curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:7890/send -H 'Content-Type: application/json' -d '{}'` → should return `400`. 
diff --git a/pi/skills/control-agent/startup-cleanup.sh b/pi/skills/control-agent/startup-cleanup.sh index 664a28d..ade0dc7 100755 --- a/pi/skills/control-agent/startup-cleanup.sh +++ b/pi/skills/control-agent/startup-cleanup.sh @@ -11,6 +11,12 @@ set -euo pipefail +BRIDGE_POLICY_HELPER="$HOME/runtime/bin/lib/bridge-restart-policy.sh" +if [ -r "$BRIDGE_POLICY_HELPER" ]; then + # shellcheck source=bin/lib/bridge-restart-policy.sh + source "$BRIDGE_POLICY_HELPER" +fi + SOCKET_DIR="$HOME/.pi/session-control" if [ $# -eq 0 ]; then @@ -69,6 +75,7 @@ fi BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid" BRIDGE_LOG_DIR="$HOME/.pi/agent/logs" BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log" +BRIDGE_STATUS_FILE="$HOME/.pi/agent/slack-bridge-supervisor.json" kill_bridge_supervisor() { local bridge_pid="$1" @@ -136,11 +143,21 @@ mkdir -p "$BRIDGE_LOG_DIR" export PATH="$HOME/.varlock/bin:$HOME/opt/node/bin:$PATH" export PI_SESSION_ID="$MY_UUID" cd /opt/baudbot/current/slack-bridge - while true; do - varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" >>"$BRIDGE_LOG_FILE" 2>&1 - echo "[$(date -Is)] ⚠️ Bridge exited ($?), restarting in 5s..." >>"$BRIDGE_LOG_FILE" - sleep 5 - done + + if command -v bb_bridge_supervise >/dev/null 2>&1; then + bb_bridge_supervise "$BRIDGE_LOG_FILE" "$BRIDGE_STATUS_FILE" "$BRIDGE_SCRIPT" \ + varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" + else + while true; do + if varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" >>"$BRIDGE_LOG_FILE" 2>&1; then + exit_code=0 + else + exit_code=$? + fi + echo "[$(date -Is)] bridge-supervisor event=restart_scheduled mode=legacy script=$BRIDGE_SCRIPT exit_code=$exit_code delay_seconds=5" >>"$BRIDGE_LOG_FILE" + sleep 5 + done + fi ) & NEW_BRIDGE_PID=$! 
echo "$NEW_BRIDGE_PID" > "$BRIDGE_PID_FILE" diff --git a/start.sh b/start.sh index e64558e..8449f0f 100755 --- a/start.sh +++ b/start.sh @@ -14,6 +14,8 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" # shellcheck source=bin/lib/runtime-node.sh source "$SCRIPT_DIR/bin/lib/runtime-node.sh" +# shellcheck source=bin/lib/bridge-restart-policy.sh +source "$SCRIPT_DIR/bin/lib/bridge-restart-policy.sh" cd ~ NODE_BIN_DIR="$(bb_resolve_runtime_node_bin_dir "$HOME")" @@ -90,6 +92,7 @@ if [ -n "$BRIDGE_SCRIPT" ]; then RELEASE_BRIDGE="/opt/baudbot/current/slack-bridge" BRIDGE_LOG_DIR="$HOME/.pi/agent/logs" BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log" + BRIDGE_STATUS_FILE="$HOME/.pi/agent/slack-bridge-supervisor.json" BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid" mkdir -p "$BRIDGE_LOG_DIR" @@ -109,11 +112,8 @@ if [ -n "$BRIDGE_SCRIPT" ]; then ( export PATH="$HOME/.varlock/bin:$NODE_BIN_DIR:$PATH" cd "$RELEASE_BRIDGE" - while true; do - varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" >>"$BRIDGE_LOG_FILE" 2>&1 - echo "[$(date -Is)] ⚠️ Bridge exited ($?), restarting in 5s..." >>"$BRIDGE_LOG_FILE" - sleep 5 - done + bb_bridge_supervise "$BRIDGE_LOG_FILE" "$BRIDGE_STATUS_FILE" "$BRIDGE_SCRIPT" \ + varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" ) & # Intentionally track the supervisor subshell PID (not per-restart node child PID) # so a single kill stops the entire bridge restart loop. diff --git a/test/shell-scripts.test.mjs b/test/shell-scripts.test.mjs index 03eefd2..06d35cb 100644 --- a/test/shell-scripts.test.mjs +++ b/test/shell-scripts.test.mjs @@ -39,6 +39,10 @@ describe("shell script test suites", () => { expect(() => runScript("bin/lib/deploy-common.test.sh")).not.toThrow(); }); + it("bridge restart policy helpers", () => { + expect(() => runScript("bin/lib/bridge-restart-policy.test.sh")).not.toThrow(); + }); + it("doctor helpers", () => { expect(() => runScript("bin/lib/doctor-common.test.sh")).not.toThrow(); });