From 4b7a4a2c9df3f833d45abc8b9582547955f714f2 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 13:36:03 -0700 Subject: [PATCH 01/65] refactor(cli): extract sandbox live state helpers --- src/lib/nemoclaw-runtime-bridge.ts | 1 - src/lib/sandbox-gateway-state-action.ts | 438 ++++++++++++++++++++++++ src/lib/share-command-deps.ts | 39 +-- src/lib/terminal-style.ts | 15 + src/nemoclaw.ts | 431 +---------------------- 5 files changed, 480 insertions(+), 444 deletions(-) create mode 100644 src/lib/sandbox-gateway-state-action.ts create mode 100644 src/lib/terminal-style.ts diff --git a/src/lib/nemoclaw-runtime-bridge.ts b/src/lib/nemoclaw-runtime-bridge.ts index 645aee0073..ff51cd553c 100644 --- a/src/lib/nemoclaw-runtime-bridge.ts +++ b/src/lib/nemoclaw-runtime-bridge.ts @@ -10,7 +10,6 @@ export interface SandboxConnectOptions { export interface NemoClawRuntimeBridge { sandboxConnect: (sandboxName: string, options?: SandboxConnectOptions) => Promise; sandboxDestroy: (sandboxName: string, args?: string[]) => Promise; - sandboxLogs: (sandboxName: string, follow: boolean) => void; sandboxRebuild: (sandboxName: string, args?: string[]) => Promise; sandboxSkillInstall: (sandboxName: string, args?: string[]) => Promise; sandboxStatus: (sandboxName: string) => Promise; diff --git a/src/lib/sandbox-gateway-state-action.ts b/src/lib/sandbox-gateway-state-action.ts new file mode 100644 index 0000000000..00a7a88677 --- /dev/null +++ b/src/lib/sandbox-gateway-state-action.ts @@ -0,0 +1,438 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/* v8 ignore start -- extracted legacy sandbox liveness paths are covered through CLI subprocess tests. */ + +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; + +import { CLI_DISPLAY_NAME, CLI_NAME } from "./branding"; +import { parseSandboxPhase } from "./gateway-state"; +import { + getNamedGatewayLifecycleState, + recoverNamedGatewayRuntime, +} from "./gateway-runtime-action"; +const { pruneKnownHostsEntries } = require("./onboard") as { + pruneKnownHostsEntries: (contents: string) => string; +}; +import * as onboardSession from "./onboard-session"; +import type { Session } from "./onboard-session"; +import { stripAnsi } from "./openshell"; +import { + captureOpenshell, + captureOpenshellForStatus, + getStatusProbeTimeoutMs, + isCommandTimeout, + runOpenshell, +} from "./openshell-runtime"; +import { + OPENSHELL_OPERATION_TIMEOUT_MS, + OPENSHELL_PROBE_TIMEOUT_MS, +} from "./openshell-timeouts"; +import * as registry from "./registry"; + +type SandboxGatewayState = { + state: string; + output: string; + activeGateway?: string | null; + recoveredGateway?: boolean; + recoveryVia?: string | null; + gatewayRecoveryFailed?: boolean; +}; + +type SandboxGatewayStateLookup = ( + sandboxName: string, +) => SandboxGatewayState | Promise; + +export function mergeLivePolicyIntoSandboxOutput( + output: string, + livePolicyOutput: string, +): string { + const rawLines = String(output).split("\n"); + const cleanLines = stripAnsi(String(output)).split("\n"); + const policyLineIdx = cleanLines.findIndex((line: string) => line.trim() === "Policy:"); + if (policyLineIdx === -1) return output; + + const before = rawLines.slice(0, policyLineIdx + 1).join("\n"); + const delimIdx = livePolicyOutput.search(/^---\s*$/m); + const yamlPart = + delimIdx !== -1 + ? livePolicyOutput.slice(delimIdx).replace(/^---\s*[\r\n]+/, "") + : livePolicyOutput; + const trimmedYaml = yamlPart.trim(); + const looksLikeError = /^(error|failed|invalid|warning|status)\b/i.test(trimmedYaml); + if (!trimmedYaml || looksLikeError || !/^[a-z_][a-z0-9_]*\s*:/m.test(trimmedYaml)) { + return output; + } + + const indented = trimmedYaml + .split("\n") + .map((line: string) => (line ? ` ${line}` : line)) + .join("\n"); + return `${before}\n\n${indented}\n`; +} + +/** Query sandbox presence and return its output with the live enforced policy. */ +export function getSandboxGatewayState(sandboxName: string): SandboxGatewayState { + const result = captureOpenshell(["sandbox", "get", sandboxName], { + timeout: OPENSHELL_PROBE_TIMEOUT_MS, + }); + let output = result.output; + if (result.status === 0) { + const livePolicy = captureOpenshell(["policy", "get", "--full", sandboxName], { + ignoreError: true, + timeout: OPENSHELL_PROBE_TIMEOUT_MS, + }); + if (livePolicy.status === 0 && livePolicy.output.trim()) { + output = mergeLivePolicyIntoSandboxOutput(output, livePolicy.output); + } + return { state: "present", output }; + } + if (/\bNotFound\b|\bNot Found\b|sandbox not found/i.test(output)) { + return { state: "missing", output }; + } + if ( + /transport error|Connection refused|handshake verification failed|Missing gateway auth token|device identity required/i.test( + output, + ) + ) { + return { state: "gateway_error", output }; + } + return { state: "unknown_error", output }; +} + +export async function getSandboxGatewayStateForStatus( + sandboxName: string, +): Promise { + const timeoutMs = getStatusProbeTimeoutMs(); + const result = await captureOpenshellForStatus(["sandbox", "get", sandboxName], { + timeout: timeoutMs, + }); + let output = result.output; + if (isCommandTimeout(result)) { + return { + state: "status_probe_timeout", + output: ` Live sandbox status probe timed out after ${Math.ceil(timeoutMs / 1000)}s. Local registry data is shown above.`, + }; + } + if (result.status === 0) { + const livePolicy = await captureOpenshellForStatus(["policy", "get", "--full", sandboxName], { + ignoreError: true, + timeout: timeoutMs, + }); + if (!isCommandTimeout(livePolicy) && livePolicy.status === 0 && livePolicy.output.trim()) { + output = mergeLivePolicyIntoSandboxOutput(output, livePolicy.output); + } + return { state: "present", output }; + } + if (/\bNotFound\b|\bNot Found\b|sandbox not found/i.test(output)) { + return { state: "missing", output }; + } + if ( + /transport error|Connection refused|handshake verification failed|Missing gateway auth token|device identity required/i.test( + output, + ) + ) { + return { state: "gateway_error", output }; + } + return { state: "unknown_error", output }; +} + +/** + * Reconcile a NotFound sandbox lookup against the named NemoClaw gateway state. + * When the active OpenShell gateway has drifted off nemoclaw, a NotFound is + * ambiguous: the sandbox may actually be registered against the nemoclaw + * gateway but invisible because some other gateway is currently active. This + * helper self-heals by attempting `openshell gateway select nemoclaw` and + * re-queries, or returns a `wrong_gateway_active` state so callers can surface + * actionable guidance instead of destroying the registry entry. + */ +export function reconcileMissingAgainstNamedGateway( + sandboxName: string, + missingLookup: SandboxGatewayState, +): SandboxGatewayState { + const lifecycle = getNamedGatewayLifecycleState(); + if (lifecycle.state === "connected_other") { + runOpenshell(["gateway", "select", "nemoclaw"], { + ignoreError: true, + timeout: OPENSHELL_OPERATION_TIMEOUT_MS, + }); + const retry = getSandboxGatewayState(sandboxName); + if (retry.state === "present") { + return { ...retry, recoveredGateway: true, recoveryVia: "select" }; + } + if (retry.state === "missing") { + const after = getNamedGatewayLifecycleState(); + if (after.state === "healthy_named") { + return retry; + } + } + return { + state: "wrong_gateway_active", + activeGateway: lifecycle.activeGateway, + output: lifecycle.status, + }; + } + if (lifecycle.state === "missing_named") { + return { state: "gateway_missing_after_restart", output: lifecycle.status }; + } + if (lifecycle.state === "named_unreachable" || lifecycle.state === "named_unhealthy") { + return { state: "gateway_unreachable_after_restart", output: lifecycle.status }; + } + return missingLookup; +} + +/** + * Print actionable guidance when the nemoclaw gateway exists but another + * OpenShell gateway is currently active. Emphasizes that the sandbox has NOT + * been removed and how to switch gateways before retrying. (#2276) + */ +export function printWrongGatewayActiveGuidance( + sandboxName: string, + activeGateway: string | null | undefined, + writer: (message: string) => void = console.error, +): void { + const other = activeGateway && activeGateway !== "nemoclaw" ? activeGateway : "another gateway"; + writer( + ` Sandbox '${sandboxName}' is registered against the ${CLI_DISPLAY_NAME} gateway, but the currently active OpenShell gateway is '${other}'. Your sandbox has NOT been removed.`, + ); + writer(" Switch gateways and retry:"); + writer(" openshell gateway select nemoclaw"); + writer(` Then re-run: ${CLI_NAME} ${sandboxName} connect`); +} + +/** Print troubleshooting hints based on gateway lifecycle state in the output. */ +export function printGatewayLifecycleHint( + output = "", + sandboxName = "", + writer: (message: string) => void = console.error, +): void { + const cleanOutput = stripAnsi(output); + if (/No gateway configured/i.test(cleanOutput)) { + writer( + ` The selected ${CLI_DISPLAY_NAME} gateway is no longer configured or its metadata/runtime has been lost.`, + ); + writer( + " Start the gateway again with `openshell gateway start --name nemoclaw` before expecting existing sandboxes to reconnect.", + ); + writer( + " If the gateway has to be rebuilt from scratch, recreate the affected sandbox afterward.", + ); + return; + } + if ( + /Connection refused|client error \(Connect\)|tcp connect error/i.test(cleanOutput) && + /Gateway:\s+nemoclaw/i.test(cleanOutput) + ) { + writer( + " The selected NemoClaw gateway exists in metadata, but its API is refusing connections after restart.", + ); + writer(" This usually means the gateway runtime did not come back cleanly after the restart."); + writer( + " Retry `openshell gateway start --name nemoclaw`; if it stays in this state, rebuild the gateway before expecting existing sandboxes to reconnect.", + ); + return; + } + if (/handshake verification failed/i.test(cleanOutput)) { + writer(" This looks like gateway identity drift after restart."); + writer( + " Existing sandboxes may still be recorded locally, but the current gateway no longer trusts their prior connection state.", + ); + writer( + ` Try re-establishing the ${CLI_DISPLAY_NAME} gateway/runtime first. If the sandbox is still unreachable, recreate just that sandbox with \`${CLI_NAME} onboard\`.`, + ); + return; + } + if (/Connection refused|transport error/i.test(cleanOutput)) { + writer( + ` The sandbox '${sandboxName}' may still exist, but the current gateway/runtime is not reachable.`, + ); + writer(" Check `openshell status`, verify the active gateway, and retry."); + return; + } + if (/Missing gateway auth token|device identity required/i.test(cleanOutput)) { + writer( + " The gateway is reachable, but the current auth or device identity state is not usable.", + ); + writer(" Verify the active gateway and retry after re-establishing the runtime."); + } +} + +export async function getReconciledSandboxGatewayState( + sandboxName: string, + opts: { getState?: SandboxGatewayStateLookup } = {}, +): Promise { + const getState = opts.getState ?? getSandboxGatewayState; + const lookup = await getState(sandboxName); + if (lookup.state === "present") { + return lookup; + } + if (lookup.state === "missing") { + return reconcileMissingAgainstNamedGateway(sandboxName, lookup); + } + + if (lookup.state === "gateway_error") { + const recovery = await recoverNamedGatewayRuntime(); + if (recovery.recovered) { + const retried = await getState(sandboxName); + if (retried.state === "present" || retried.state === "missing") { + return { ...retried, recoveredGateway: true, recoveryVia: recovery.via || null }; + } + if (/handshake verification failed/i.test(retried.output)) { + return { + state: "identity_drift", + output: retried.output, + recoveredGateway: true, + recoveryVia: recovery.via || null, + }; + } + return { ...retried, recoveredGateway: true, recoveryVia: recovery.via || null }; + } + const latestLifecycle = getNamedGatewayLifecycleState(); + const latestStatus = stripAnsi(latestLifecycle.status || ""); + if (/No gateway configured/i.test(latestStatus)) { + return { + state: "gateway_missing_after_restart", + output: latestLifecycle.status || lookup.output, + }; + } + if ( + /Connection refused|client error \(Connect\)|tcp connect error/i.test(latestStatus) && + /Gateway:\s+nemoclaw/i.test(latestStatus) + ) { + return { + state: "gateway_unreachable_after_restart", + output: latestLifecycle.status || lookup.output, + }; + } + if ( + recovery.after?.state === "named_unreachable" || + recovery.before?.state === "named_unreachable" + ) { + return { + state: "gateway_unreachable_after_restart", + output: recovery.after?.status || recovery.before?.status || lookup.output, + }; + } + return { ...lookup, gatewayRecoveryFailed: true }; + } + + return lookup; +} + +export async function ensureLiveSandboxOrExit( + sandboxName: string, + { allowNonReadyPhase = false }: { allowNonReadyPhase?: boolean } = {}, +): Promise { + const lookup = await getReconciledSandboxGatewayState(sandboxName); + if (lookup.state === "present") { + const phase = parseSandboxPhase(lookup.output || ""); + if (!allowNonReadyPhase && phase && phase !== "Ready") { + console.error(` Sandbox '${sandboxName}' is stuck in '${phase}' phase.`); + console.error( + " This usually happens when a process crash inside the sandbox prevented clean startup.", + ); + console.error(""); + console.error( + ` Run \`${CLI_NAME} ${sandboxName} rebuild --yes\` to recreate the sandbox (--yes skips the confirmation prompt; workspace state will be preserved).`, + ); + process.exit(1); + } + return lookup; + } + if (lookup.state === "missing") { + const guard = getNamedGatewayLifecycleState(); + if (guard.state !== "healthy_named") { + if (guard.state === "connected_other") { + printWrongGatewayActiveGuidance(sandboxName, guard.activeGateway, console.error); + } else { + printGatewayLifecycleHint(guard.status || "", sandboxName, console.error); + } + process.exit(1); + } + registry.removeSandbox(sandboxName); + const session = onboardSession.loadSession(); + if (session && session.sandboxName === sandboxName) { + onboardSession.updateSession((state: Session) => { + state.sandboxName = null; + return state; + }); + } + console.error(` Sandbox '${sandboxName}' is not present in the live OpenShell gateway.`); + console.error(" Removed stale local registry entry."); + console.error( + ` Run \`${CLI_NAME} list\` to confirm the remaining sandboxes, or \`${CLI_NAME} onboard\` to create a new one.`, + ); + process.exit(1); + } + if (lookup.state === "wrong_gateway_active") { + printWrongGatewayActiveGuidance(sandboxName, lookup.activeGateway, console.error); + process.exit(1); + } + if (lookup.state === "identity_drift") { + console.error(" Gateway SSH identity changed after restart — clearing stale host keys..."); + const knownHostsPath = path.join(os.homedir(), ".ssh", "known_hosts"); + if (fs.existsSync(knownHostsPath)) { + try { + const kh = fs.readFileSync(knownHostsPath, "utf8"); + const cleaned = pruneKnownHostsEntries(kh); + if (cleaned !== kh) fs.writeFileSync(knownHostsPath, cleaned); + } catch { + /* best-effort cleanup */ + } + } + const retry = await getReconciledSandboxGatewayState(sandboxName); + if (retry.state === "present") { + console.error(" ✓ Reconnected after clearing stale SSH host keys."); + return retry; + } + console.error( + ` Could not reconnect to sandbox '${sandboxName}' after clearing stale host keys.`, + ); + if (retry.output) { + console.error(retry.output); + } + console.error( + ` Recreate this sandbox with \`${CLI_NAME} onboard\` once the gateway runtime is stable.`, + ); + process.exit(1); + } + if (lookup.state === "gateway_unreachable_after_restart") { + console.error( + ` Sandbox '${sandboxName}' may still exist, but the selected ${CLI_DISPLAY_NAME} gateway is still refusing connections after restart.`, + ); + if (lookup.output) { + console.error(lookup.output); + } + console.error( + " Retry `openshell gateway start --name nemoclaw` and verify `openshell status` is healthy before reconnecting.", + ); + console.error( + " If the gateway never becomes healthy, rebuild the gateway and then recreate the affected sandbox.", + ); + process.exit(1); + } + if (lookup.state === "gateway_missing_after_restart") { + console.error( + ` Sandbox '${sandboxName}' may still exist locally, but the ${CLI_DISPLAY_NAME} gateway is no longer configured after restart/rebuild.`, + ); + if (lookup.output) { + console.error(lookup.output); + } + console.error( + " Start the gateway again with `openshell gateway start --name nemoclaw` before retrying.", + ); + console.error( + " If the gateway had to be rebuilt from scratch, recreate the affected sandbox afterward.", + ); + process.exit(1); + } + console.error(` Unable to verify sandbox '${sandboxName}' against the live OpenShell gateway.`); + if (lookup.output) { + console.error(lookup.output); + } + printGatewayLifecycleHint(lookup.output, sandboxName); + console.error(" Check `openshell status` and the active gateway, then retry."); + process.exit(1); +} diff --git a/src/lib/share-command-deps.ts b/src/lib/share-command-deps.ts index 8c215d2b44..b29c6a761d 100644 --- a/src/lib/share-command-deps.ts +++ b/src/lib/share-command-deps.ts @@ -1,12 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts"; import { CLI_NAME } from "./branding"; +import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts"; +import { G, R } from "./terminal-style"; export interface ShareCommandDeps { /** Run `openshell sandbox ssh-config ` and return output. */ - getSshConfig: (sandboxName: string) => { status: number; output: string }; + getSshConfig: (sandboxName: string) => { status: number | null; output: string }; /** Ensure the sandbox is live, exit process if not. */ ensureLive: (sandboxName: string) => Promise; /** NVIDIA-green ANSI code (empty string if color disabled). */ @@ -17,32 +18,28 @@ export interface ShareCommandDeps { cliName: string; } -interface ShareRuntimeBridge { - captureOpenshell: ( - args: string[], - opts?: { ignoreError?: boolean; timeout?: number }, - ) => { status: number; output: string }; - ensureLiveSandboxOrExit: (sandboxName: string) => Promise; - G: string; - R: string; -} - -function getRuntimeBridge(): ShareRuntimeBridge { - return require("../nemoclaw") as ShareRuntimeBridge; -} - export function buildShareCommandDeps(): ShareCommandDeps { - const runtime = getRuntimeBridge(); + const { captureOpenshell } = require("./openshell-runtime") as { + captureOpenshell: ( + args: string[], + opts?: { ignoreError?: boolean; timeout?: number }, + ) => { status: number | null; output: string }; + }; + const { ensureLiveSandboxOrExit } = require("./sandbox-gateway-state-action") as { + ensureLiveSandboxOrExit: (sandboxName: string) => Promise; + }; return { getSshConfig: (sandboxName: string) => - runtime.captureOpenshell(["sandbox", "ssh-config", sandboxName], { + captureOpenshell(["sandbox", "ssh-config", sandboxName], { ignoreError: true, timeout: OPENSHELL_PROBE_TIMEOUT_MS, }), - ensureLive: (sandboxName: string) => runtime.ensureLiveSandboxOrExit(sandboxName), - colorGreen: runtime.G, - colorReset: runtime.R, + ensureLive: async (sandboxName: string) => { + await ensureLiveSandboxOrExit(sandboxName); + }, + colorGreen: G, + colorReset: R, cliName: CLI_NAME, }; } diff --git a/src/lib/terminal-style.ts b/src/lib/terminal-style.ts new file mode 100644 index 0000000000..d70483a300 --- /dev/null +++ b/src/lib/terminal-style.ts @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/* v8 ignore start -- presentation constants are covered through CLI subprocess tests. */ + +const useColor = !process.env.NO_COLOR && !!process.stdout.isTTY; +const trueColor = + useColor && (process.env.COLORTERM === "truecolor" || process.env.COLORTERM === "24bit"); + +export const G = useColor ? (trueColor ? "\x1b[38;2;118;185;0m" : "\x1b[38;5;148m") : ""; +export const B = useColor ? "\x1b[1m" : ""; +export const D = useColor ? "\x1b[2m" : ""; +export const R = useColor ? "\x1b[0m" : ""; +export const RD = useColor ? "\x1b[1;31m" : ""; +export const YW = useColor ? "\x1b[1;33m" : ""; diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index 177a4fdc5b..36e3b22d90 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -36,7 +36,7 @@ const { dockerRmi, } = require("./lib/docker"); const { resolveOpenshell } = require("./lib/resolve-openshell"); -const { pruneKnownHostsEntries, hydrateCredentialEnv, isNonInteractive } = require("./lib/onboard"); +const { hydrateCredentialEnv, isNonInteractive } = require("./lib/onboard"); const { ensureOllamaAuthProxy } = require("./lib/onboard-ollama-proxy"); const { prompt: askPrompt } = require("./lib/credentials"); const registry = require("./lib/registry"); @@ -57,7 +57,6 @@ const { captureOpenshellForStatus, getInstalledOpenshellVersionOrNull, getOpenshellBinary, - getStatusProbeTimeoutMs, isCommandTimeout, runOpenshell, } = require("./lib/openshell-runtime"); @@ -66,6 +65,13 @@ const { recoverNamedGatewayRuntime, } = require("./lib/gateway-runtime-action"); const { recoverRegistryEntries } = require("./lib/registry-recovery-action"); +const { + ensureLiveSandboxOrExit, + getReconciledSandboxGatewayState, + getSandboxGatewayStateForStatus, + printGatewayLifecycleHint, + printWrongGatewayActiveGuidance, +} = require("./lib/sandbox-gateway-state-action"); const { runRegisteredOclifCommand } = require("./lib/oclif-runner"); const { isErrnoException }: typeof import("./lib/errno") = require("./lib/errno"); const agentRuntime = require("../bin/lib/agent-runtime"); @@ -85,10 +91,7 @@ const { globalCommandTokens, sandboxActionTokens, } = require("./lib/command-registry"); -import { - OPENSHELL_OPERATION_TIMEOUT_MS, - OPENSHELL_PROBE_TIMEOUT_MS, -} from "./lib/openshell-timeouts"; +import { OPENSHELL_PROBE_TIMEOUT_MS } from "./lib/openshell-timeouts"; import { resolveGlobalOclifDispatch, resolveSandboxOclifDispatch, @@ -458,422 +461,6 @@ exports.runtimeBridge = { sandboxStatus, upgradeSandboxes, }; -exports.ensureLiveSandboxOrExit = ensureLiveSandboxOrExit; -exports.G = G; -exports.R = R; - -function mergeLivePolicyIntoSandboxOutput(output: string, livePolicyOutput: string): string { - const rawLines = String(output).split("\n"); - const cleanLines = stripAnsi(String(output)).split("\n"); - const policyLineIdx = cleanLines.findIndex((l: string) => l.trim() === "Policy:"); - if (policyLineIdx === -1) return output; - - // Keep everything before Policy (Sandbox info with colors), - // plus the original colored "Policy:" header line. - const before = rawLines.slice(0, policyLineIdx + 1).join("\n"); - // Extract YAML content from policy get --full (skip metadata header before "---"). - // Use a regex to handle varying line endings (\n, \r\n) and optional trailing whitespace. - const delimIdx = livePolicyOutput.search(/^---\s*$/m); - const yamlPart = - delimIdx !== -1 - ? livePolicyOutput.slice(delimIdx).replace(/^---\s*[\r\n]+/, "") - : livePolicyOutput; - // Guard: only replace if the extracted content looks like policy YAML - // (starts with a YAML key like "version:" or "network_policies:"). - // Avoids replacing with warnings or status text from unexpected output. - const trimmedYaml = yamlPart.trim(); - const looksLikeError = /^(error|failed|invalid|warning|status)\b/i.test(trimmedYaml); - if (!trimmedYaml || looksLikeError || !/^[a-z_][a-z0-9_]*\s*:/m.test(trimmedYaml)) { - return output; - } - - // Add 2-space indent to match the original sandbox get output format. - const indented = trimmedYaml - .split("\n") - .map((l: string) => (l ? " " + l : l)) - .join("\n"); - return before + "\n\n" + indented + "\n"; -} - -/** Query sandbox presence and return its output with the live enforced policy. */ -function getSandboxGatewayState(sandboxName: string) { - const result = captureOpenshell(["sandbox", "get", sandboxName], { - timeout: OPENSHELL_PROBE_TIMEOUT_MS, - }); - let output = result.output; - if (result.status === 0) { - // `openshell sandbox get` returns the immutable baseline policy from sandbox - // creation, which does not include network_policies added later via - // `openshell policy set`. Replace the Policy section with the live policy - // from `policy get --full`, preserving the colored "Policy:" header and - // Sandbox info above it. (#1132) - const livePolicy = captureOpenshell(["policy", "get", "--full", sandboxName], { - ignoreError: true, - timeout: OPENSHELL_PROBE_TIMEOUT_MS, - }); - if (livePolicy.status === 0 && livePolicy.output.trim()) { - output = mergeLivePolicyIntoSandboxOutput(output, livePolicy.output); - } - return { state: "present", output }; - } - if (/\bNotFound\b|\bNot Found\b|sandbox not found/i.test(output)) { - return { state: "missing", output }; - } - if ( - /transport error|Connection refused|handshake verification failed|Missing gateway auth token|device identity required/i.test( - output, - ) - ) { - return { state: "gateway_error", output }; - } - return { state: "unknown_error", output }; -} - -async function getSandboxGatewayStateForStatus(sandboxName: string) { - const timeoutMs = getStatusProbeTimeoutMs(); - const result = await captureOpenshellForStatus(["sandbox", "get", sandboxName], { - timeout: timeoutMs, - }); - let output = result.output; - if (isCommandTimeout(result)) { - return { - state: "status_probe_timeout", - output: ` Live sandbox status probe timed out after ${Math.ceil(timeoutMs / 1000)}s. Local registry data is shown above.`, - }; - } - if (result.status === 0) { - const livePolicy = await captureOpenshellForStatus(["policy", "get", "--full", sandboxName], { - ignoreError: true, - timeout: timeoutMs, - }); - if (!isCommandTimeout(livePolicy) && livePolicy.status === 0 && livePolicy.output.trim()) { - output = mergeLivePolicyIntoSandboxOutput(output, livePolicy.output); - } - return { state: "present", output }; - } - if (/\bNotFound\b|\bNot Found\b|sandbox not found/i.test(output)) { - return { state: "missing", output }; - } - if ( - /transport error|Connection refused|handshake verification failed|Missing gateway auth token|device identity required/i.test( - output, - ) - ) { - return { state: "gateway_error", output }; - } - return { state: "unknown_error", output }; -} - -type SandboxGatewayStateLookup = ( - sandboxName: string, -) => - | ReturnType - | ReturnType; - -/** - * Reconcile a NotFound sandbox lookup against the named NemoClaw gateway state. - * When the active OpenShell gateway has drifted off nemoclaw, a NotFound is - * ambiguous: the sandbox may actually be registered against the nemoclaw - * gateway but invisible because some other gateway is currently active. This - * helper self-heals by attempting `openshell gateway select nemoclaw` and - * re-queries, or returns a `wrong_gateway_active` state so callers can surface - * actionable guidance instead of destroying the registry entry. - */ -function reconcileMissingAgainstNamedGateway( - sandboxName: string, - missingLookup: ReturnType, -) { - const lifecycle = getNamedGatewayLifecycleState(); - if (lifecycle.state === "connected_other") { - runOpenshell(["gateway", "select", "nemoclaw"], { - ignoreError: true, - timeout: OPENSHELL_OPERATION_TIMEOUT_MS, - }); - const retry = getSandboxGatewayState(sandboxName); - if (retry.state === "present") { - return { ...retry, recoveredGateway: true, recoveryVia: "select" }; - } - if (retry.state === "missing") { - const after = getNamedGatewayLifecycleState(); - if (after.state === "healthy_named") { - return retry; - } - } - return { - state: "wrong_gateway_active", - activeGateway: lifecycle.activeGateway, - output: lifecycle.status, - }; - } - if (lifecycle.state === "missing_named") { - return { state: "gateway_missing_after_restart", output: lifecycle.status }; - } - if (lifecycle.state === "named_unreachable" || lifecycle.state === "named_unhealthy") { - return { state: "gateway_unreachable_after_restart", output: lifecycle.status }; - } - return missingLookup; -} - -/** - * Print actionable guidance when the nemoclaw gateway exists but another - * OpenShell gateway is currently active. Emphasizes that the sandbox has NOT - * been removed and how to switch gateways before retrying. (#2276) - */ -function printWrongGatewayActiveGuidance( - sandboxName: string, - activeGateway: string | null | undefined, - writer: (message: string) => void = console.error, -) { - const other = activeGateway && activeGateway !== "nemoclaw" ? activeGateway : "another gateway"; - writer( - ` Sandbox '${sandboxName}' is registered against the ${CLI_DISPLAY_NAME} gateway, but the currently active OpenShell gateway is '${other}'. Your sandbox has NOT been removed.`, - ); - writer(" Switch gateways and retry:"); - writer(" openshell gateway select nemoclaw"); - writer(` Then re-run: ${CLI_NAME} ${sandboxName} connect`); -} - -/** Print troubleshooting hints based on gateway lifecycle state in the output. */ -function printGatewayLifecycleHint(output = "", sandboxName = "", writer = console.error) { - const cleanOutput = stripAnsi(output); - if (/No gateway configured/i.test(cleanOutput)) { - writer( - ` The selected ${CLI_DISPLAY_NAME} gateway is no longer configured or its metadata/runtime has been lost.`, - ); - writer( - " Start the gateway again with `openshell gateway start --name nemoclaw` before expecting existing sandboxes to reconnect.", - ); - writer( - " If the gateway has to be rebuilt from scratch, recreate the affected sandbox afterward.", - ); - return; - } - if ( - /Connection refused|client error \(Connect\)|tcp connect error/i.test(cleanOutput) && - /Gateway:\s+nemoclaw/i.test(cleanOutput) - ) { - writer( - " The selected NemoClaw gateway exists in metadata, but its API is refusing connections after restart.", - ); - writer(" This usually means the gateway runtime did not come back cleanly after the restart."); - writer( - " Retry `openshell gateway start --name nemoclaw`; if it stays in this state, rebuild the gateway before expecting existing sandboxes to reconnect.", - ); - return; - } - if (/handshake verification failed/i.test(cleanOutput)) { - writer(" This looks like gateway identity drift after restart."); - writer( - " Existing sandboxes may still be recorded locally, but the current gateway no longer trusts their prior connection state.", - ); - writer( - ` Try re-establishing the ${CLI_DISPLAY_NAME} gateway/runtime first. If the sandbox is still unreachable, recreate just that sandbox with \`${CLI_NAME} onboard\`.`, - ); - return; - } - if (/Connection refused|transport error/i.test(cleanOutput)) { - writer( - ` The sandbox '${sandboxName}' may still exist, but the current gateway/runtime is not reachable.`, - ); - writer(" Check `openshell status`, verify the active gateway, and retry."); - return; - } - if (/Missing gateway auth token|device identity required/i.test(cleanOutput)) { - writer( - " The gateway is reachable, but the current auth or device identity state is not usable.", - ); - writer(" Verify the active gateway and retry after re-establishing the runtime."); - } -} - -async function getReconciledSandboxGatewayState( - sandboxName: string, - opts: { getState?: SandboxGatewayStateLookup } = {}, -) { - const getState = opts.getState ?? getSandboxGatewayState; - let lookup = await getState(sandboxName); - if (lookup.state === "present") { - return lookup; - } - if (lookup.state === "missing") { - return reconcileMissingAgainstNamedGateway(sandboxName, lookup); - } - - if (lookup.state === "gateway_error") { - const recovery = await recoverNamedGatewayRuntime(); - if (recovery.recovered) { - const retried = await getState(sandboxName); - if (retried.state === "present" || retried.state === "missing") { - return { ...retried, recoveredGateway: true, recoveryVia: recovery.via || null }; - } - if (/handshake verification failed/i.test(retried.output)) { - return { - state: "identity_drift", - output: retried.output, - recoveredGateway: true, - recoveryVia: recovery.via || null, - }; - } - return { ...retried, recoveredGateway: true, recoveryVia: recovery.via || null }; - } - const latestLifecycle = getNamedGatewayLifecycleState(); - const latestStatus = stripAnsi(latestLifecycle.status || ""); - if (/No gateway configured/i.test(latestStatus)) { - return { - state: "gateway_missing_after_restart", - output: latestLifecycle.status || lookup.output, - }; - } - if ( - /Connection refused|client error \(Connect\)|tcp connect error/i.test(latestStatus) && - /Gateway:\s+nemoclaw/i.test(latestStatus) - ) { - return { - state: "gateway_unreachable_after_restart", - output: latestLifecycle.status || lookup.output, - }; - } - if ( - recovery.after?.state === "named_unreachable" || - recovery.before?.state === "named_unreachable" - ) { - return { - state: "gateway_unreachable_after_restart", - output: recovery.after?.status || recovery.before?.status || lookup.output, - }; - } - return { ...lookup, gatewayRecoveryFailed: true }; - } - - return lookup; -} - -async function ensureLiveSandboxOrExit( - sandboxName: string, - { allowNonReadyPhase = false }: { allowNonReadyPhase?: boolean } = {}, -) { - const lookup = await getReconciledSandboxGatewayState(sandboxName); - if (lookup.state === "present") { - const phase = parseSandboxPhase(lookup.output || ""); - if (!allowNonReadyPhase && phase && phase !== "Ready") { - console.error(` Sandbox '${sandboxName}' is stuck in '${phase}' phase.`); - console.error( - " This usually happens when a process crash inside the sandbox prevented clean startup.", - ); - console.error(""); - console.error( - ` Run \`${CLI_NAME} ${sandboxName} rebuild --yes\` to recreate the sandbox (--yes skips the confirmation prompt; workspace state will be preserved).`, - ); - process.exit(1); - } - return lookup; - } - if (lookup.state === "missing") { - // Belt-and-suspenders: only destroy registry state if the nemoclaw gateway - // is demonstrably the healthy active gateway. The reconciler should have - // already routed drift cases to `wrong_gateway_active`, but this guards - // against future regressions. - const guard = getNamedGatewayLifecycleState(); - if (guard.state !== "healthy_named") { - if (guard.state === "connected_other") { - printWrongGatewayActiveGuidance(sandboxName, guard.activeGateway, console.error); - } else { - printGatewayLifecycleHint(guard.status || "", sandboxName, console.error); - } - process.exit(1); - } - registry.removeSandbox(sandboxName); - const session = onboardSession.loadSession(); - if (session && session.sandboxName === sandboxName) { - onboardSession.updateSession((s: Session) => { - s.sandboxName = null; - return s; - }); - } - console.error(` Sandbox '${sandboxName}' is not present in the live OpenShell gateway.`); - console.error(" Removed stale local registry entry."); - console.error( - ` Run \`${CLI_NAME} list\` to confirm the remaining sandboxes, or \`${CLI_NAME} onboard\` to create a new one.`, - ); - process.exit(1); - } - if (lookup.state === "wrong_gateway_active") { - const activeGateway = - "activeGateway" in lookup && typeof lookup.activeGateway === "string" - ? lookup.activeGateway - : undefined; - printWrongGatewayActiveGuidance(sandboxName, activeGateway, console.error); - process.exit(1); - } - if (lookup.state === "identity_drift") { - // Gateway SSH keys rotated after restart — clear stale known_hosts and retry. - console.error(" Gateway SSH identity changed after restart — clearing stale host keys..."); - const knownHostsPath = path.join(os.homedir(), ".ssh", "known_hosts"); - if (fs.existsSync(knownHostsPath)) { - try { - const kh = fs.readFileSync(knownHostsPath, "utf8"); - const cleaned = pruneKnownHostsEntries(kh); - if (cleaned !== kh) fs.writeFileSync(knownHostsPath, cleaned); - } catch { - /* best-effort cleanup */ - } - } - const retry = await getReconciledSandboxGatewayState(sandboxName); - if (retry.state === "present") { - console.error(" ✓ Reconnected after clearing stale SSH host keys."); - return retry; - } - // Retry failed — fall through to error - console.error( - ` Could not reconnect to sandbox '${sandboxName}' after clearing stale host keys.`, - ); - if (retry.output) { - console.error(retry.output); - } - console.error( - ` Recreate this sandbox with \`${CLI_NAME} onboard\` once the gateway runtime is stable.`, - ); - process.exit(1); - } - if (lookup.state === "gateway_unreachable_after_restart") { - console.error( - ` Sandbox '${sandboxName}' may still exist, but the selected ${CLI_DISPLAY_NAME} gateway is still refusing connections after restart.`, - ); - if (lookup.output) { - console.error(lookup.output); - } - console.error( - " Retry `openshell gateway start --name nemoclaw` and verify `openshell status` is healthy before reconnecting.", - ); - console.error( - " If the gateway never becomes healthy, rebuild the gateway and then recreate the affected sandbox.", - ); - process.exit(1); - } - if (lookup.state === "gateway_missing_after_restart") { - console.error( - ` Sandbox '${sandboxName}' may still exist locally, but the ${CLI_DISPLAY_NAME} gateway is no longer configured after restart/rebuild.`, - ); - if (lookup.output) { - console.error(lookup.output); - } - console.error( - " Start the gateway again with `openshell gateway start --name nemoclaw` before retrying.", - ); - console.error( - " If the gateway had to be rebuilt from scratch, recreate the affected sandbox afterward.", - ); - process.exit(1); - } - console.error(` Unable to verify sandbox '${sandboxName}' against the live OpenShell gateway.`); - if (lookup.output) { - console.error(lookup.output); - } - printGatewayLifecycleHint(lookup.output, sandboxName); - console.error(" Check `openshell status` and the active gateway, then retry."); - process.exit(1); -} - /** Print user-facing guidance when OpenShell is too old to support `openshell logs`. */ function printOldLogsCompatibilityGuidance(installedVersion = null) { const versionText = installedVersion ? ` (${installedVersion})` : ""; From 2ce87ddbf6c085ff9beecebea65c036afce9093d Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 13:52:14 -0700 Subject: [PATCH 02/65] refactor(cli): extract sandbox skill install action --- src/lib/nemoclaw-runtime-bridge.ts | 1 - src/lib/sandbox-runtime-actions.ts | 5 +- src/lib/sandbox-skill-install-action.ts | 224 ++++++++++++++++++++++++ src/nemoclaw.ts | 215 +---------------------- 4 files changed, 234 insertions(+), 211 deletions(-) create mode 100644 src/lib/sandbox-skill-install-action.ts diff --git a/src/lib/nemoclaw-runtime-bridge.ts b/src/lib/nemoclaw-runtime-bridge.ts index ff51cd553c..3f72b20ff5 100644 --- a/src/lib/nemoclaw-runtime-bridge.ts +++ b/src/lib/nemoclaw-runtime-bridge.ts @@ -11,7 +11,6 @@ export interface NemoClawRuntimeBridge { sandboxConnect: (sandboxName: string, options?: SandboxConnectOptions) => Promise; sandboxDestroy: (sandboxName: string, args?: string[]) => Promise; sandboxRebuild: (sandboxName: string, args?: string[]) => Promise; - sandboxSkillInstall: (sandboxName: string, args?: string[]) => Promise; sandboxStatus: (sandboxName: string) => Promise; upgradeSandboxes: (args?: string[]) => Promise; } diff --git a/src/lib/sandbox-runtime-actions.ts b/src/lib/sandbox-runtime-actions.ts index 2952598a3e..45ef0382f8 100644 --- a/src/lib/sandbox-runtime-actions.ts +++ b/src/lib/sandbox-runtime-actions.ts @@ -36,7 +36,10 @@ export async function installSandboxSkill( sandboxName: string, args: string[] = [], ): Promise { - await getNemoClawRuntimeBridge().sandboxSkillInstall(sandboxName, args); + const { installSandboxSkill: installExtractedSandboxSkill } = require("./sandbox-skill-install-action") as { + installSandboxSkill: (sandboxName: string, args?: string[]) => Promise; + }; + await installExtractedSandboxSkill(sandboxName, args); } export async function runSandboxSnapshot(sandboxName: string, args: string[]): Promise { diff --git a/src/lib/sandbox-skill-install-action.ts b/src/lib/sandbox-skill-install-action.ts new file mode 100644 index 0000000000..0704881fa4 --- /dev/null +++ b/src/lib/sandbox-skill-install-action.ts @@ -0,0 +1,224 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/* v8 ignore start -- exercised through CLI subprocess skill install tests. */ + +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; + +import { CLI_NAME } from "./branding"; +import { captureOpenshell } from "./openshell-runtime"; +import { ensureLiveSandboxOrExit } from "./sandbox-gateway-state-action"; +import * as skillInstall from "./skill-install"; +import { D, G, R, YW } from "./terminal-style"; + +const agentRuntime = require("../../bin/lib/agent-runtime"); + +export function printSkillInstallUsage(): void { + console.log(""); + console.log(` Usage: ${CLI_NAME} skill install `); + console.log(""); + console.log(" Deploy a skill directory to a running sandbox."); + console.log( + " must be a skill directory containing a SKILL.md (with 'name:' frontmatter),", + ); + console.log( + " or a direct path to a SKILL.md file. All non-dot files in the directory are uploaded.", + ); + console.log(""); +} + +export function looksLikeOpenClawPlugin(candidatePath: string): boolean { + const dir = + fs.existsSync(candidatePath) && fs.statSync(candidatePath).isDirectory() + ? candidatePath + : path.dirname(candidatePath); + if (!fs.existsSync(dir)) return false; + if (fs.existsSync(path.join(dir, "openclaw.plugin.json"))) return true; + + const packageJsonPath = path.join(dir, "package.json"); + if (!fs.existsSync(packageJsonPath)) return false; + try { + const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, "utf-8")); + const openclawBlock = packageJson?.openclaw; + return Boolean( + packageJson?.["openclaw.plugin"] === true || + openclawBlock === true || + (typeof openclawBlock === "object" && + openclawBlock !== null && + (openclawBlock.plugin === true || + typeof openclawBlock.entry === "string" || + typeof openclawBlock.main === "string" || + (Array.isArray(openclawBlock.extensions) && openclawBlock.extensions.length > 0))), + ); + } catch { + return false; + } +} + +export function printPluginInstallHint(): void { + console.error(" This looks like an OpenClaw plugin, not a SKILL.md agent skill."); + console.error(" `skill install` only accepts skill directories or direct SKILL.md paths."); + console.error( + " To use an OpenClaw plugin today, bake it into a custom sandbox image with `nemoclaw onboard --from `.", + ); +} + +/** + * Install or update a local skill directory into a live sandbox and perform + * any agent-specific post-install refresh needed for the new content to load. + */ +export async function installSandboxSkill( + sandboxName: string, + args: string[] = [], +): Promise { + const sub = args[0]; + if (!sub || sub === "help" || sub === "--help" || sub === "-h") { + printSkillInstallUsage(); + return; + } + + if (sub !== "install") { + console.error(` Unknown skill subcommand: ${sub}`); + console.error(" Valid subcommands: install"); + process.exit(1); + } + + const skillPath = args[1]; + const extraArgs = args.slice(2); + if (skillPath === "--help" || skillPath === "-h" || skillPath === "help") { + printSkillInstallUsage(); + return; + } + if (extraArgs.length > 0) { + console.error(` Unknown argument(s) for skill install: ${extraArgs.join(", ")}`); + console.error(` Usage: ${CLI_NAME} skill install `); + process.exit(1); + } + if (!skillPath) { + console.error(` Usage: ${CLI_NAME} skill install `); + console.error(" must be a directory containing a SKILL.md file."); + process.exit(1); + } + + const resolvedPath = path.resolve(skillPath); + + // Accept a directory containing SKILL.md, or a direct path to SKILL.md. + let skillDir: string; + let skillMdPath: string; + if (fs.existsSync(resolvedPath) && fs.statSync(resolvedPath).isDirectory()) { + skillDir = resolvedPath; + skillMdPath = path.join(resolvedPath, "SKILL.md"); + } else if (fs.existsSync(resolvedPath) && resolvedPath.endsWith("SKILL.md")) { + skillDir = path.dirname(resolvedPath); + skillMdPath = resolvedPath; + } else { + console.error(` No SKILL.md found at '${resolvedPath}'.`); + console.error(" must be a skill directory or a direct path to SKILL.md."); + if (looksLikeOpenClawPlugin(resolvedPath)) { + printPluginInstallHint(); + } + process.exit(1); + } + + if (!fs.existsSync(skillMdPath)) { + console.error(` No SKILL.md found in '${skillDir}'.`); + console.error(" The skill directory must contain a SKILL.md file."); + if (looksLikeOpenClawPlugin(skillDir)) { + printPluginInstallHint(); + } + process.exit(1); + } + + // 1. Validate frontmatter + let frontmatter; + try { + const content = fs.readFileSync(skillMdPath, "utf-8"); + frontmatter = skillInstall.parseFrontmatter(content); + } catch (err) { + const errorMessage = err instanceof Error ? err.message : String(err); + console.error(` ${errorMessage}`); + process.exit(1); + } + + const collected = skillInstall.collectFiles(skillDir); + if (collected.unsafePaths.length > 0) { + console.error(" Skill directory contains files with unsafe characters:"); + for (const p of collected.unsafePaths) console.error(` ${p}`); + console.error(" File names must match [A-Za-z0-9._-/]. Rename or remove them."); + process.exit(1); + } + if (collected.skippedDotfiles.length > 0) { + console.log( + ` ${D}Skipping ${collected.skippedDotfiles.length} hidden path(s): ${collected.skippedDotfiles.join(", ")}${R}`, + ); + } + const fileLabel = collected.files.length === 1 ? "1 file" : `${collected.files.length} files`; + console.log(` ${G}✓${R} Validated SKILL.md (name: ${frontmatter.name}, ${fileLabel})`); + + // 2. Ensure sandbox is live + await ensureLiveSandboxOrExit(sandboxName); + + // 3. Resolve agent and paths + const agent = agentRuntime.getSessionAgent(sandboxName); + const paths = skillInstall.resolveSkillPaths(agent, frontmatter.name); + + // 4. Get SSH config + const sshConfigResult = captureOpenshell(["sandbox", "ssh-config", sandboxName], { + ignoreError: true, + }); + if (sshConfigResult.status !== 0) { + console.error(" Failed to obtain SSH configuration for the sandbox."); + process.exit(1); + } + + const tmpSshConfig = path.join( + os.tmpdir(), + `nemoclaw-ssh-skill-${process.pid}-${Date.now()}.conf`, + ); + fs.writeFileSync(tmpSshConfig, sshConfigResult.output, { mode: 0o600 }); + + try { + const ctx = { configFile: tmpSshConfig, sandboxName }; + + // 5. Check if skill already exists (update vs fresh install) + const isUpdate = skillInstall.checkExisting(ctx, paths); + + // 6. Upload skill directory + const { uploaded, failed } = skillInstall.uploadDirectory(ctx, skillDir, paths.uploadDir); + if (failed.length > 0) { + console.error(` Failed to upload ${failed.length} file(s): ${failed.join(", ")}`); + process.exit(1); + } + console.log(` ${G}✓${R} Uploaded ${uploaded} file(s) to sandbox`); + + // 7. Post-install (OpenClaw mirror + refresh, or restart hint). + // OpenClaw caches skill content per session, so always refresh the + // session index after an install/update to avoid stale SKILL.md data. + const post = skillInstall.postInstall(ctx, paths, skillDir); + for (const msg of post.messages) { + if (msg.startsWith("Warning:")) { + console.error(` ${YW}${msg}${R}`); + } else { + console.log(` ${D}${msg}${R}`); + } + } + + // 8. Verify + const verified = skillInstall.verifyInstall(ctx, paths); + if (verified) { + const verb = isUpdate ? "updated" : "installed"; + console.log(` ${G}✓${R} Skill '${frontmatter.name}' ${verb}`); + } else { + console.error(` Skill uploaded but verification failed at ${paths.uploadDir}/SKILL.md`); + process.exit(1); + } + } finally { + try { + fs.unlinkSync(tmpSshConfig); + } catch { + /* ignore */ + } + } +} diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index 36e3b22d90..ea024240ea 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -78,7 +78,6 @@ const agentRuntime = require("../bin/lib/agent-runtime"); const sandboxVersion = require("./lib/sandbox-version"); const sandboxState = require("./lib/sandbox-state"); const { parseRestoreArgs } = sandboxState; -const skillInstall = require("./lib/skill-install"); const { sleepSeconds } = require("./lib/wait"); const { parseSandboxPhase } = require("./lib/gateway-state"); const { @@ -457,7 +456,6 @@ exports.runtimeBridge = { sandboxConnect, sandboxDestroy, sandboxRebuild, - sandboxSkillInstall, sandboxStatus, upgradeSandboxes, }; @@ -1567,211 +1565,6 @@ async function sandboxStatus(sandboxName: string) { console.log(""); } -function printSkillInstallUsage(): void { - console.log(""); - console.log(` Usage: ${CLI_NAME} skill install `); - console.log(""); - console.log(" Deploy a skill directory to a running sandbox."); - console.log( - " must be a skill directory containing a SKILL.md (with 'name:' frontmatter),", - ); - console.log( - " or a direct path to a SKILL.md file. All non-dot files in the directory are uploaded.", - ); - console.log(""); -} - -function looksLikeOpenClawPlugin(candidatePath: string): boolean { - const dir = - fs.existsSync(candidatePath) && fs.statSync(candidatePath).isDirectory() - ? candidatePath - : path.dirname(candidatePath); - if (!fs.existsSync(dir)) return false; - if (fs.existsSync(path.join(dir, "openclaw.plugin.json"))) return true; - - const packageJsonPath = path.join(dir, "package.json"); - if (!fs.existsSync(packageJsonPath)) return false; - try { - const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, "utf-8")); - const openclawBlock = packageJson?.openclaw; - return Boolean( - packageJson?.["openclaw.plugin"] === true || - openclawBlock === true || - (typeof openclawBlock === "object" && - openclawBlock !== null && - (openclawBlock.plugin === true || - typeof openclawBlock.entry === "string" || - typeof openclawBlock.main === "string" || - (Array.isArray(openclawBlock.extensions) && openclawBlock.extensions.length > 0))), - ); - } catch { - return false; - } -} - -function printPluginInstallHint(): void { - console.error(" This looks like an OpenClaw plugin, not a SKILL.md agent skill."); - console.error(" `skill install` only accepts skill directories or direct SKILL.md paths."); - console.error( - " To use an OpenClaw plugin today, bake it into a custom sandbox image with `nemoclaw onboard --from `.", - ); -} - -/** - * Install or update a local skill directory into a live sandbox and perform - * any agent-specific post-install refresh needed for the new content to load. - */ -async function sandboxSkillInstall(sandboxName: string, args: string[] = []): Promise { - const sub = args[0]; - if (!sub || sub === "help" || sub === "--help" || sub === "-h") { - printSkillInstallUsage(); - return; - } - - if (sub !== "install") { - console.error(` Unknown skill subcommand: ${sub}`); - console.error(" Valid subcommands: install"); - process.exit(1); - } - - const skillPath = args[1]; - const extraArgs = args.slice(2); - if (skillPath === "--help" || skillPath === "-h" || skillPath === "help") { - printSkillInstallUsage(); - return; - } - if (extraArgs.length > 0) { - console.error(` Unknown argument(s) for skill install: ${extraArgs.join(", ")}`); - console.error(` Usage: ${CLI_NAME} skill install `); - process.exit(1); - } - if (!skillPath) { - console.error(` Usage: ${CLI_NAME} skill install `); - console.error(" must be a directory containing a SKILL.md file."); - process.exit(1); - } - - const resolvedPath = path.resolve(skillPath); - - // Accept a directory containing SKILL.md, or a direct path to SKILL.md. - let skillDir: string; - let skillMdPath: string; - if (fs.existsSync(resolvedPath) && fs.statSync(resolvedPath).isDirectory()) { - skillDir = resolvedPath; - skillMdPath = path.join(resolvedPath, "SKILL.md"); - } else if (fs.existsSync(resolvedPath) && resolvedPath.endsWith("SKILL.md")) { - skillDir = path.dirname(resolvedPath); - skillMdPath = resolvedPath; - } else { - console.error(` No SKILL.md found at '${resolvedPath}'.`); - console.error(" must be a skill directory or a direct path to SKILL.md."); - if (looksLikeOpenClawPlugin(resolvedPath)) { - printPluginInstallHint(); - } - process.exit(1); - } - - if (!fs.existsSync(skillMdPath)) { - console.error(` No SKILL.md found in '${skillDir}'.`); - console.error(" The skill directory must contain a SKILL.md file."); - if (looksLikeOpenClawPlugin(skillDir)) { - printPluginInstallHint(); - } - process.exit(1); - } - - // 1. Validate frontmatter - let frontmatter; - try { - const content = fs.readFileSync(skillMdPath, "utf-8"); - frontmatter = skillInstall.parseFrontmatter(content); - } catch (err) { - const errorMessage = err instanceof Error ? err.message : String(err); - console.error(` ${errorMessage}`); - process.exit(1); - } - - const collected = skillInstall.collectFiles(skillDir); - if (collected.unsafePaths.length > 0) { - console.error(` Skill directory contains files with unsafe characters:`); - for (const p of collected.unsafePaths) console.error(` ${p}`); - console.error(" File names must match [A-Za-z0-9._-/]. Rename or remove them."); - process.exit(1); - } - if (collected.skippedDotfiles.length > 0) { - console.log( - ` ${D}Skipping ${collected.skippedDotfiles.length} hidden path(s): ${collected.skippedDotfiles.join(", ")}${R}`, - ); - } - const fileLabel = collected.files.length === 1 ? "1 file" : `${collected.files.length} files`; - console.log(` ${G}✓${R} Validated SKILL.md (name: ${frontmatter.name}, ${fileLabel})`); - - // 2. Ensure sandbox is live - await ensureLiveSandboxOrExit(sandboxName); - - // 3. Resolve agent and paths - const agent = agentRuntime.getSessionAgent(sandboxName); - const paths = skillInstall.resolveSkillPaths(agent, frontmatter.name); - - // 4. Get SSH config - const sshConfigResult = captureOpenshell(["sandbox", "ssh-config", sandboxName], { - ignoreError: true, - }); - if (sshConfigResult.status !== 0) { - console.error(" Failed to obtain SSH configuration for the sandbox."); - process.exit(1); - } - - const tmpSshConfig = path.join( - os.tmpdir(), - `nemoclaw-ssh-skill-${process.pid}-${Date.now()}.conf`, - ); - fs.writeFileSync(tmpSshConfig, sshConfigResult.output, { mode: 0o600 }); - - try { - const ctx = { configFile: tmpSshConfig, sandboxName }; - - // 5. Check if skill already exists (update vs fresh install) - const isUpdate = skillInstall.checkExisting(ctx, paths); - - // 6. Upload skill directory - const { uploaded, failed } = skillInstall.uploadDirectory(ctx, skillDir, paths.uploadDir); - if (failed.length > 0) { - console.error(` Failed to upload ${failed.length} file(s): ${failed.join(", ")}`); - process.exit(1); - } - console.log(` ${G}✓${R} Uploaded ${uploaded} file(s) to sandbox`); - - // 7. Post-install (OpenClaw mirror + refresh, or restart hint). - // OpenClaw caches skill content per session, so always refresh the - // session index after an install/update to avoid stale SKILL.md data. - const post = skillInstall.postInstall(ctx, paths, skillDir); - for (const msg of post.messages) { - if (msg.startsWith("Warning:")) { - console.error(` ${YW}${msg}${R}`); - } else { - console.log(` ${D}${msg}${R}`); - } - } - - // 8. Verify - const verified = skillInstall.verifyInstall(ctx, paths); - if (verified) { - const verb = isUpdate ? "updated" : "installed"; - console.log(` ${G}✓${R} Skill '${frontmatter.name}' ${verb}`); - } else { - console.error(` Skill uploaded but verification failed at ${paths.uploadDir}/SKILL.md`); - process.exit(1); - } - } finally { - try { - fs.unlinkSync(tmpSshConfig); - } catch { - /* ignore */ - } - } -} - function cleanupSandboxServices( sandboxName: string, { stopHostServices = false }: { stopHostServices?: boolean } = {}, @@ -2628,9 +2421,13 @@ async function runDispatchResult( await addSandboxPolicy(sandboxName, actionArgs); return; } - case "skill": - await sandboxSkillInstall(sandboxName, actionArgs); + case "skill": { + const { installSandboxSkill } = require("./lib/sandbox-skill-install-action") as { + installSandboxSkill: (sandboxName: string, args?: string[]) => Promise; + }; + await installSandboxSkill(sandboxName, actionArgs); return; + } case "snapshot": { const { runSandboxSnapshot } = require("./lib/snapshot-action") as { runSandboxSnapshot: (sandboxName: string, args: string[]) => Promise; From 4cd3cf3d21505838e2621dd503e8e7d8dc62701f Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 18:53:14 -0700 Subject: [PATCH 03/65] refactor(cli): extract sandbox connect action --- src/lib/nemoclaw-runtime-bridge.ts | 5 - src/lib/sandbox-connect-action.ts | 321 ++++++++++++ src/lib/sandbox-process-recovery-action.ts | 301 +++++++++++ src/lib/sandbox-runtime-actions.ts | 7 +- src/lib/skill-install.ts | 2 +- src/nemoclaw.ts | 573 +-------------------- 6 files changed, 638 insertions(+), 571 deletions(-) create mode 100644 src/lib/sandbox-connect-action.ts create mode 100644 src/lib/sandbox-process-recovery-action.ts diff --git a/src/lib/nemoclaw-runtime-bridge.ts b/src/lib/nemoclaw-runtime-bridge.ts index 3f72b20ff5..e5ef72db18 100644 --- a/src/lib/nemoclaw-runtime-bridge.ts +++ b/src/lib/nemoclaw-runtime-bridge.ts @@ -3,12 +3,7 @@ /* v8 ignore start -- transitional bridge until command actions are extracted from src/nemoclaw.ts. */ -export interface SandboxConnectOptions { - probeOnly?: boolean; -} - export interface NemoClawRuntimeBridge { - sandboxConnect: (sandboxName: string, options?: SandboxConnectOptions) => Promise; sandboxDestroy: (sandboxName: string, args?: string[]) => Promise; sandboxRebuild: (sandboxName: string, args?: string[]) => Promise; sandboxStatus: (sandboxName: string) => Promise; diff --git a/src/lib/sandbox-connect-action.ts b/src/lib/sandbox-connect-action.ts new file mode 100644 index 0000000000..79261337d5 --- /dev/null +++ b/src/lib/sandbox-connect-action.ts @@ -0,0 +1,321 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/* v8 ignore start -- exercised through CLI subprocess connect tests. */ + +import { spawnSync } from "node:child_process"; +import os from "node:os"; + +import { CLI_NAME } from "./branding"; +import { parseGatewayInference } from "./inference-config"; +import { ensureOllamaAuthProxy } from "./onboard-ollama-proxy"; +import { + captureOpenshell, + getOpenshellBinary, + runOpenshell, +} from "./openshell-runtime"; +import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts"; +import * as registry from "./registry"; +import { ROOT } from "./runner"; +import { ensureLiveSandboxOrExit } from "./sandbox-gateway-state-action"; +import { + createSystemDeps as createSessionDeps, + getActiveSandboxSessions, +} from "./sandbox-session-state"; +import { checkAndRecoverSandboxProcesses } from "./sandbox-process-recovery-action"; +import * as sandboxVersion from "./sandbox-version"; +import { D, G, R, YW } from "./terminal-style"; +import { resolveOpenshell } from "./resolve-openshell"; + +const agentRuntime = require("../../bin/lib/agent-runtime"); + +export type SandboxConnectOptions = { + probeOnly?: boolean; +}; + +type SpawnLikeResult = { + status: number | null; + signal?: NodeJS.Signals | null; +}; + +const SANDBOX_CONNECT_FLAGS = new Set([ + "--dangerously-skip-permissions", + "--probe-only", + "--help", + "-h", +]); + +export function isSandboxConnectFlag(arg: string | undefined): boolean { + return typeof arg === "string" && SANDBOX_CONNECT_FLAGS.has(arg); +} + +export function printSandboxConnectHelp(sandboxName = ""): void { + console.log(""); + console.log(` Usage: ${CLI_NAME} ${sandboxName} connect [--probe-only]`); + console.log(""); + console.log(" Options:"); + console.log( + " --probe-only Run recovery checks and exit without opening SSH", + ); + console.log(" -h, --help Show this help"); + console.log(""); +} + +export function parseSandboxConnectArgs( + sandboxName: string, + actionArgs: string[], +): SandboxConnectOptions { + const options: SandboxConnectOptions = {}; + for (const arg of actionArgs) { + if (!isSandboxConnectFlag(arg)) { + console.error(` Unknown flag for connect: ${arg}`); + printSandboxConnectHelp(sandboxName); + process.exit(1); + } + switch (arg) { + case "--dangerously-skip-permissions": + console.error(" --dangerously-skip-permissions was removed; use shields commands instead."); + printSandboxConnectHelp(sandboxName); + process.exit(1); + break; + case "--probe-only": + options.probeOnly = true; + break; + case "--help": + case "-h": + printSandboxConnectHelp(sandboxName); + process.exit(0); + break; + } + } + return options; +} + +function runSandboxConnectProbe(sandboxName: string): void { + const processCheck = checkAndRecoverSandboxProcesses(sandboxName, { quiet: true }); + const agent = agentRuntime.getSessionAgent(sandboxName); + const agentName = agentRuntime.getAgentDisplayName(agent); + if (!processCheck.checked) { + console.error( + ` Probe failed: could not inspect the ${agentName} gateway inside sandbox '${sandboxName}'.`, + ); + process.exit(1); + } + if (processCheck.wasRunning) { + console.log(` Probe complete: ${agentName} gateway is running in '${sandboxName}'.`); + return; + } + if (processCheck.recovered) { + console.log(` Probe complete: recovered ${agentName} gateway in '${sandboxName}'.`); + return; + } + console.error( + ` Probe failed: ${agentName} gateway is not running in '${sandboxName}' and automatic recovery failed.`, + ); + console.error(" Check /tmp/gateway.log inside the sandbox for details."); + process.exit(1); +} + +function exitWithSpawnResult(result: SpawnLikeResult): void { + if (result.status !== null) { + process.exit(result.status); + } + + if (result.signal) { + const signalNumber = os.constants.signals[result.signal]; + process.exit(signalNumber ? 128 + signalNumber : 1); + } + + process.exit(1); +} + +export async function connectSandbox( + sandboxName: string, + { probeOnly = false }: SandboxConnectOptions = {}, +): Promise { + const { isSandboxReady, parseSandboxStatus } = require("./onboard"); + await ensureLiveSandboxOrExit(sandboxName, { allowNonReadyPhase: true }); + + if (probeOnly) { + return runSandboxConnectProbe(sandboxName); + } + + // Version staleness check — warn but don't block + try { + const versionCheck = sandboxVersion.checkAgentVersion(sandboxName); + if (versionCheck.isStale) { + for (const line of sandboxVersion.formatStalenessWarning(sandboxName, versionCheck)) { + console.error(line); + } + } + } catch { + /* non-fatal — don't block connect on version check failure */ + } + + // Active session hint — inform if already connected in another terminal + try { + const opsBinConnect = resolveOpenshell(); + if (opsBinConnect) { + const sessionResult = getActiveSandboxSessions(sandboxName, createSessionDeps(opsBinConnect)); + if (sessionResult.detected && sessionResult.sessions.length > 0) { + const count = sessionResult.sessions.length; + console.log( + ` ${D}Note: ${count} existing SSH session${count > 1 ? "s" : ""} to '${sandboxName}' detected (another terminal).${R}`, + ); + } + } + } catch { + /* non-fatal — don't block connect on session detection failure */ + } + + checkAndRecoverSandboxProcesses(sandboxName); + // Ensure Ollama auth proxy is running (recovers from host reboots) + ensureOllamaAuthProxy(); + + // ── Inference route swap (#1248) ────────────────────────────────── + // When the user has multiple sandboxes with different providers, the + // cluster-wide inference.local route may still point at the *other* + // provider. Re-set it to match this sandbox's persisted config. + let sb; + try { + sb = registry.getSandbox(sandboxName); + if (sb && sb.provider && sb.model) { + const live = parseGatewayInference( + captureOpenshell(["inference", "get"], { + ignoreError: true, + timeout: OPENSHELL_PROBE_TIMEOUT_MS, + }).output, + ); + if (!live || live.provider !== sb.provider || live.model !== sb.model) { + console.log( + ` Switching inference route to ${sb.provider}/${sb.model} for sandbox '${sandboxName}'`, + ); + const swapResult = runOpenshell( + ["inference", "set", "--provider", sb.provider, "--model", sb.model, "--no-verify"], + { ignoreError: true }, + ); + if (swapResult.status !== 0) { + console.error( + ` ${YW}Warning: failed to switch inference route — connect will proceed anyway.${R}`, + ); + } + } + } + } catch { + /* non-fatal — don't block connect on inference route swap failure */ + } + + const rawTimeout = process.env.NEMOCLAW_CONNECT_TIMEOUT; + let timeout = 120; + if (rawTimeout !== undefined) { + const parsed = parseInt(rawTimeout, 10); + if (Number.isNaN(parsed) || parsed <= 0) { + console.warn( + ` Warning: invalid NEMOCLAW_CONNECT_TIMEOUT="${rawTimeout}", using default 120s`, + ); + } else { + timeout = parsed; + } + } + const interval = 3; + const startedAt = Date.now(); + const deadline = startedAt + timeout * 1000; + const elapsedSec = () => Math.floor((Date.now() - startedAt) / 1000); + const remainingMs = () => Math.max(1, deadline - Date.now()); + const runSandboxList = () => + captureOpenshell(["sandbox", "list"], { + ignoreError: true, + timeout: remainingMs(), + }).output; + + const list = runSandboxList(); + if (!isSandboxReady(list, sandboxName)) { + const status = parseSandboxStatus(list, sandboxName); + const TERMINAL = new Set([ + "Failed", + "Error", + "CrashLoopBackOff", + "ImagePullBackOff", + "Unknown", + "Evicted", + ]); + if (status && TERMINAL.has(status)) { + console.error(""); + console.error(` Sandbox '${sandboxName}' is in '${status}' state.`); + console.error(` Run: ${CLI_NAME} ${sandboxName} logs --follow`); + console.error(` Run: ${CLI_NAME} ${sandboxName} status`); + process.exit(1); + } + + console.log(` Waiting for sandbox '${sandboxName}' to be ready...`); + let ready = false; + let everSeen = status !== null; + while (Date.now() < deadline) { + const sleepFor = Math.min(interval, remainingMs() / 1000); + if (sleepFor <= 0) break; + spawnSync("sleep", [String(sleepFor)]); + const poll = runSandboxList(); + const elapsed = elapsedSec(); + if (isSandboxReady(poll, sandboxName)) { + ready = true; + break; + } + const cur = parseSandboxStatus(poll, sandboxName) || "unknown"; + if (cur !== "unknown") everSeen = true; + if (TERMINAL.has(cur)) { + console.error(""); + console.error(` Sandbox '${sandboxName}' entered '${cur}' state.`); + console.error(` Run: ${CLI_NAME} ${sandboxName} logs --follow`); + console.error(` Run: ${CLI_NAME} ${sandboxName} status`); + process.exit(1); + } + if (!everSeen && elapsed >= 30) { + console.error(""); + console.error(` Sandbox '${sandboxName}' not found after ${elapsed}s.`); + console.error(" Check: openshell sandbox list"); + process.exit(1); + } + process.stdout.write(`\r Status: ${cur.padEnd(20)} (${elapsed}s elapsed)`); + } + + if (!ready) { + console.error(""); + console.error(` Timed out after ${timeout}s waiting for sandbox '${sandboxName}'.`); + console.error(" Check: openshell sandbox list"); + console.error( + ` Override timeout: NEMOCLAW_CONNECT_TIMEOUT=300 ${CLI_NAME} ${sandboxName} connect`, + ); + process.exit(1); + } + console.log(`\r Status: ${"Ready".padEnd(20)} (${elapsedSec()}s elapsed)`); + console.log(" Sandbox is ready. Connecting..."); + } + + // Print a one-shot hint before dropping the user into the sandbox + // shell so a fresh user knows the first thing to type. Without this, + // `nemoclaw connect` lands on a bare bash prompt and users + // ask "now what?" — see #465. Suppress the hint when stdout isn't a + // TTY so scripted callers don't get noise in their pipelines. + if ( + process.stdout.isTTY && + !["1", "true"].includes(String(process.env.NEMOCLAW_NO_CONNECT_HINT || "")) + ) { + console.log(""); + const agentName = sb?.agent || "openclaw"; + const agentCmd = agentName === "openclaw" ? "openclaw tui" : agentName; + console.log(` ${G}✓${R} Connecting to sandbox '${sandboxName}'`); + console.log( + ` ${D}Inside the sandbox, run \`${agentCmd}\` to start chatting with the agent.${R}`, + ); + console.log( + ` ${D}Type \`/exit\` to leave the chat, then \`exit\` to return to the host shell.${R}`, + ); + console.log(""); + } + const result = spawnSync(getOpenshellBinary(), ["sandbox", "connect", sandboxName], { + stdio: "inherit", + cwd: ROOT, + env: process.env, + }); + exitWithSpawnResult(result); +} diff --git a/src/lib/sandbox-process-recovery-action.ts b/src/lib/sandbox-process-recovery-action.ts new file mode 100644 index 0000000000..a29a7c2595 --- /dev/null +++ b/src/lib/sandbox-process-recovery-action.ts @@ -0,0 +1,301 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/* v8 ignore start -- exercised through CLI subprocess connect/status/rebuild tests. */ + +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { spawnSync } from "node:child_process"; + +import { DASHBOARD_PORT } from "./ports"; +import { ROOT, shellQuote } from "./runner"; +import { + captureOpenshell, + captureOpenshellForStatus, + getOpenshellBinary, + isCommandTimeout, + runOpenshell, +} from "./openshell-runtime"; +import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts"; +import { G, R } from "./terminal-style"; +import { sleepSeconds } from "./wait"; + +const agentRuntime = require("../../bin/lib/agent-runtime"); + +export type SandboxCommandResult = { + status: number; + stdout: string; + stderr: string; +}; + +const SANDBOX_EXEC_STARTED_MARKER = "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; +const DASHBOARD_FORWARD_PORT = String(DASHBOARD_PORT); + +/** + * Run a command inside the sandbox via SSH and return { status, stdout, stderr }. + * Returns null if SSH config cannot be obtained. + */ +export function executeSandboxCommand( + sandboxName: string, + command: string, +): SandboxCommandResult | null { + const sshConfigResult = captureOpenshell(["sandbox", "ssh-config", sandboxName], { + ignoreError: true, + timeout: OPENSHELL_PROBE_TIMEOUT_MS, + }); + if (sshConfigResult.status !== 0) return null; + if (!sshConfigResult.output.trim()) return null; + + const tmpFile = path.join(os.tmpdir(), `nemoclaw-ssh-${process.pid}-${Date.now()}.conf`); + fs.writeFileSync(tmpFile, sshConfigResult.output, { mode: 0o600 }); + try { + const result = spawnSync( + "ssh", + [ + "-F", + tmpFile, + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + "-o", + "ConnectTimeout=5", + "-o", + "LogLevel=ERROR", + `openshell-${sandboxName}`, + command, + ], + { encoding: "utf-8", stdio: ["ignore", "pipe", "pipe"], timeout: 15000 }, + ); + return { + status: result.status ?? 1, + stdout: (result.stdout || "").trim(), + stderr: (result.stderr || "").trim(), + }; + } finally { + try { + fs.unlinkSync(tmpFile); + } catch { + /* ignore */ + } + } +} + +export function executeSandboxExecCommand( + sandboxName: string, + command: string, + timeout = 15000, +): SandboxCommandResult | null { + const markedCommand = `printf '%s\n' '${SANDBOX_EXEC_STARTED_MARKER}'; ${command}`; + try { + const timeoutOverride = Number(process.env.NEMOCLAW_SANDBOX_EXEC_TIMEOUT_MS || ""); + const effectiveTimeout = + Number.isFinite(timeoutOverride) && timeoutOverride > 0 ? timeoutOverride : timeout; + const result = spawnSync( + getOpenshellBinary(), + ["sandbox", "exec", "--name", sandboxName, "--", "sh", "-c", markedCommand], + { + cwd: ROOT, + encoding: "utf-8", + env: process.env, + stdio: ["ignore", "pipe", "pipe"], + timeout: effectiveTimeout, + }, + ); + if (result.error) return null; + const stdout = (result.stdout || "").trim(); + const stdoutLines = stdout.split(/\r?\n/); + const markerIndex = stdoutLines.indexOf(SANDBOX_EXEC_STARTED_MARKER); + if (markerIndex === -1) return null; + const commandStdoutLines = stdoutLines.slice(markerIndex + 1); + return { + status: result.status ?? 1, + stdout: commandStdoutLines.join("\n").trim(), + stderr: (result.stderr || "").trim(), + }; + } catch { + return null; + } +} + +async function executeSandboxExecCommandForStatus( + sandboxName: string, + command: string, +): Promise { + const markedCommand = `printf '%s\n' '${SANDBOX_EXEC_STARTED_MARKER}'; ${command}`; + const result = await captureOpenshellForStatus( + ["sandbox", "exec", "--name", sandboxName, "--", "sh", "-c", markedCommand], + { ignoreError: true }, + ); + if (isCommandTimeout(result) || result.error) return null; + const stdout = (result.output || "").trim(); + const stdoutLines = stdout.split(/\r?\n/); + const markerIndex = stdoutLines.indexOf(SANDBOX_EXEC_STARTED_MARKER); + if (markerIndex === -1) return null; + const commandStdoutLines = stdoutLines.slice(markerIndex + 1); + return { + status: result.status ?? 1, + stdout: commandStdoutLines.join("\n").trim(), + stderr: "", + }; +} + +function parseSandboxGatewayProbe(result: SandboxCommandResult | null): boolean | null { + if (!result) return null; + if (result.stdout === "RUNNING") return true; + if (result.stdout === "STOPPED") return false; + return null; +} + +/** + * Check whether the OpenClaw gateway process is running inside the sandbox. + * Uses the gateway's HTTP endpoint (dashboard port) as the source of truth, + * since the gateway runs as a separate user and pgrep may not see it. + * Returns true (running), false (stopped), or null (cannot determine). + */ +function isSandboxGatewayRunning(sandboxName: string): boolean | null { + const agent = agentRuntime.getSessionAgent(sandboxName); + const probeUrl = agentRuntime.getHealthProbeUrl(agent); + const command = `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`; + const execProbe = parseSandboxGatewayProbe(executeSandboxExecCommand(sandboxName, command)); + if (execProbe !== null) return execProbe; + return parseSandboxGatewayProbe(executeSandboxCommand(sandboxName, command)); +} + +export async function isSandboxGatewayRunningForStatus( + sandboxName: string, +): Promise { + const agent = agentRuntime.getSessionAgent(sandboxName); + const probeUrl = agentRuntime.getHealthProbeUrl(agent); + const command = `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`; + return parseSandboxGatewayProbe(await executeSandboxExecCommandForStatus(sandboxName, command)); +} + +/** + * Restart the gateway process inside the sandbox after a pod restart. + * Cleans stale lock/temp files, sources proxy config, and launches the gateway + * in the background. Returns true on success. + */ +function recoverSandboxProcesses(sandboxName: string): boolean { + const agent = agentRuntime.getSessionAgent(sandboxName); + const agentScript = agentRuntime.buildRecoveryScript(agent, agent?.forwardPort ?? DASHBOARD_PORT); + const hasRecoveryMarker = (result: SandboxCommandResult | null) => + !!( + result && + (result.stdout.includes("GATEWAY_PID=") || result.stdout.includes("ALREADY_RUNNING")) + ); + const recoveredSsh = (result: SandboxCommandResult | null) => + !!(result && result.status === 0 && hasRecoveryMarker(result)); + + if (agentScript) { + // Non-OpenClaw manifests do not yet declare a runtime user for root + // sandbox exec. Recover them over SSH so the launch inherits the sandbox + // login user instead of creating root-owned agent state under /sandbox. + return recoveredSsh(executeSandboxCommand(sandboxName, agentScript)); + } + + const script = agentRuntime.buildOpenClawRecoveryScript(DASHBOARD_PORT); + const execResult = executeSandboxExecCommand(sandboxName, script, 30000); + if (hasRecoveryMarker(execResult)) return true; + if (execResult !== null) return false; + return recoveredSsh(executeSandboxCommand(sandboxName, script)); +} + +function readNonNegativeNumberEnv(name: string, fallback: number): number { + const raw = process.env[name]; + if (raw === undefined || raw.trim() === "") return fallback; + const parsed = Number(raw); + return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback; +} + +function waitForRecoveredSandboxGateway(sandboxName: string): boolean { + const timeoutSeconds = readNonNegativeNumberEnv("NEMOCLAW_GATEWAY_RECOVERY_WAIT_SECONDS", 30); + const intervalSeconds = readNonNegativeNumberEnv( + "NEMOCLAW_GATEWAY_RECOVERY_POLL_INTERVAL_SECONDS", + 3, + ); + const attempts = + intervalSeconds > 0 + ? Math.max(1, Math.floor(timeoutSeconds / intervalSeconds) + 1) + : Math.max(1, Math.floor(timeoutSeconds) + 1); + + for (let attempt = 0; attempt < attempts; attempt += 1) { + if (isSandboxGatewayRunning(sandboxName) === true) { + return true; + } + if (attempt < attempts - 1) { + sleepSeconds(intervalSeconds); + } + } + return false; +} + +/** + * Re-establish the dashboard port forward to the sandbox. + * Uses the agent's forward port when a non-OpenClaw agent is active. + */ +function ensureSandboxPortForward(sandboxName: string): void { + const agent = agentRuntime.getSessionAgent(sandboxName); + const port = agent ? String(agent.forwardPort) : DASHBOARD_FORWARD_PORT; + runOpenshell(["forward", "stop", port], { ignoreError: true }); + runOpenshell(["forward", "start", "--background", port, sandboxName], { + ignoreError: true, + }); +} + +/** + * Detect and recover from a sandbox that survived a gateway restart but + * whose OpenClaw processes are not running. Returns an object describing + * the outcome: { checked, wasRunning, recovered }. + */ +export function checkAndRecoverSandboxProcesses( + sandboxName: string, + { quiet = false }: { quiet?: boolean } = {}, +) { + const running = isSandboxGatewayRunning(sandboxName); + if (running === null) { + return { checked: false, wasRunning: null, recovered: false }; + } + if (running) { + return { checked: true, wasRunning: true, recovered: false }; + } + + // Gateway not running — attempt recovery + const recoveryAgent = agentRuntime.getSessionAgent(sandboxName); + if (!quiet) { + console.log(""); + console.log( + ` ${agentRuntime.getAgentDisplayName(recoveryAgent)} gateway is not running inside the sandbox (sandbox likely restarted).`, + ); + console.log(" Recovering..."); + } + + const recovered = recoverSandboxProcesses(sandboxName); + if (recovered) { + // Wait for gateway to bind its HTTP port before declaring success. The + // recovered process can be alive before the OpenAI-compatible API is ready. + if (!waitForRecoveredSandboxGateway(sandboxName)) { + if (!quiet) { + console.error(" Gateway process started but is not responding."); + console.error(" Check /tmp/gateway.log inside the sandbox for details."); + } + return { checked: true, wasRunning: false, recovered: false }; + } + ensureSandboxPortForward(sandboxName); + if (!quiet) { + console.log( + ` ${G}✓${R} ${agentRuntime.getAgentDisplayName(recoveryAgent)} gateway restarted inside sandbox.`, + ); + console.log(` ${G}✓${R} Dashboard port forward re-established.`); + } + } else if (!quiet) { + console.error( + ` Could not restart ${agentRuntime.getAgentDisplayName(recoveryAgent)} gateway automatically.`, + ); + console.error(" Connect to the sandbox and run manually:"); + console.error(` ${agentRuntime.getGatewayCommand(recoveryAgent)}`); + } + + return { checked: true, wasRunning: false, recovered }; +} diff --git a/src/lib/sandbox-runtime-actions.ts b/src/lib/sandbox-runtime-actions.ts index 45ef0382f8..ea46d6e3be 100644 --- a/src/lib/sandbox-runtime-actions.ts +++ b/src/lib/sandbox-runtime-actions.ts @@ -3,14 +3,17 @@ /* v8 ignore start -- transitional action facade until implementations leave src/nemoclaw.ts. */ -import type { SandboxConnectOptions } from "./nemoclaw-runtime-bridge"; +import type { SandboxConnectOptions } from "./sandbox-connect-action"; import { getNemoClawRuntimeBridge } from "./nemoclaw-runtime-bridge"; export async function connectSandbox( sandboxName: string, options?: SandboxConnectOptions, ): Promise { - await getNemoClawRuntimeBridge().sandboxConnect(sandboxName, options); + const { connectSandbox: connectExtractedSandbox } = require("./sandbox-connect-action") as { + connectSandbox: (sandboxName: string, options?: SandboxConnectOptions) => Promise; + }; + await connectExtractedSandbox(sandboxName, options); } export async function showSandboxStatus(sandboxName: string): Promise { diff --git a/src/lib/skill-install.ts b/src/lib/skill-install.ts index e413f64dab..49c358e3ad 100644 --- a/src/lib/skill-install.ts +++ b/src/lib/skill-install.ts @@ -147,7 +147,7 @@ export interface SshResult { /** * Run a command on the sandbox via SSH with optional stdin content. - * Uses the same SSH flags as executeSandboxCommand in nemoclaw.ts. + * Uses the same SSH flags as executeSandboxCommand in sandbox-process-recovery-action.ts. */ export function sshExec( ctx: SshContext, diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index ea024240ea..4448b603f1 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -4,7 +4,6 @@ const { execFileSync, spawn, spawnSync } = require("child_process"); const path = require("path"); const fs = require("fs"); -const os = require("os"); const { DASHBOARD_PORT, GATEWAY_PORT, OLLAMA_PORT } = require("./lib/ports"); // --------------------------------------------------------------------------- @@ -21,7 +20,7 @@ const R = _useColor ? "\x1b[0m" : ""; const _RD = _useColor ? "\x1b[1;31m" : ""; const YW = _useColor ? "\x1b[1;33m" : ""; -const { ROOT, run, runInteractive, shellQuote, validateName } = require("./lib/runner"); +const { ROOT, run, runInteractive, validateName } = require("./lib/runner"); // --------------------------------------------------------------------------- // Agent branding — derived from NEMOCLAW_AGENT when an alias launcher sets it; @@ -37,7 +36,6 @@ const { } = require("./lib/docker"); const { resolveOpenshell } = require("./lib/resolve-openshell"); const { hydrateCredentialEnv, isNonInteractive } = require("./lib/onboard"); -const { ensureOllamaAuthProxy } = require("./lib/onboard-ollama-proxy"); const { prompt: askPrompt } = require("./lib/credentials"); const registry = require("./lib/registry"); import type { SandboxEntry } from "./lib/registry"; @@ -56,7 +54,6 @@ const { captureOpenshell, captureOpenshellForStatus, getInstalledOpenshellVersionOrNull, - getOpenshellBinary, isCommandTimeout, runOpenshell, } = require("./lib/openshell-runtime"); @@ -72,13 +69,21 @@ const { printGatewayLifecycleHint, printWrongGatewayActiveGuidance, } = require("./lib/sandbox-gateway-state-action"); +const { + isSandboxConnectFlag, + parseSandboxConnectArgs, + printSandboxConnectHelp, +} = require("./lib/sandbox-connect-action"); +const { + executeSandboxCommand, + isSandboxGatewayRunningForStatus, +} = require("./lib/sandbox-process-recovery-action"); const { runRegisteredOclifCommand } = require("./lib/oclif-runner"); const { isErrnoException }: typeof import("./lib/errno") = require("./lib/errno"); const agentRuntime = require("../bin/lib/agent-runtime"); const sandboxVersion = require("./lib/sandbox-version"); const sandboxState = require("./lib/sandbox-state"); const { parseRestoreArgs } = sandboxState; -const { sleepSeconds } = require("./lib/wait"); const { parseSandboxPhase } = require("./lib/gateway-state"); const { getActiveSandboxSessions, @@ -110,14 +115,6 @@ type SpawnLikeResult = { signal?: NodeJS.Signals | null; }; -type SandboxCommandResult = { - status: number; - stdout: string; - stderr: string; -}; - -const SANDBOX_EXEC_STARTED_MARKER = "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; - type RecoveredSandboxMetadata = Partial< Pick > & { @@ -182,278 +179,7 @@ function getSandboxDeleteOutcome(deleteResult: SpawnLikeResult) { }; } -// ── Sandbox process health (OpenClaw gateway inside the sandbox) ───────── - -/** - * Run a command inside the sandbox via SSH and return { status, stdout, stderr }. - * Returns null if SSH config cannot be obtained. - */ -function executeSandboxCommand(sandboxName: string, command: string): SandboxCommandResult | null { - const sshConfigResult = captureOpenshell(["sandbox", "ssh-config", sandboxName], { - ignoreError: true, - timeout: OPENSHELL_PROBE_TIMEOUT_MS, - }); - if (sshConfigResult.status !== 0) return null; - if (!sshConfigResult.output.trim()) return null; - - const tmpFile = path.join(os.tmpdir(), `nemoclaw-ssh-${process.pid}-${Date.now()}.conf`); - fs.writeFileSync(tmpFile, sshConfigResult.output, { mode: 0o600 }); - try { - const result = spawnSync( - "ssh", - [ - "-F", - tmpFile, - "-o", - "StrictHostKeyChecking=no", - "-o", - "UserKnownHostsFile=/dev/null", - "-o", - "ConnectTimeout=5", - "-o", - "LogLevel=ERROR", - `openshell-${sandboxName}`, - command, - ], - { encoding: "utf-8", stdio: ["ignore", "pipe", "pipe"], timeout: 15000 }, - ); - return { - status: result.status ?? 1, - stdout: (result.stdout || "").trim(), - stderr: (result.stderr || "").trim(), - }; - } catch { - return null; - } finally { - try { - fs.unlinkSync(tmpFile); - } catch { - /* ignore */ - } - } -} - -function executeSandboxExecCommand( - sandboxName: string, - command: string, - timeout = 15000, -): SandboxCommandResult | null { - const markedCommand = `printf '%s\\n' '${SANDBOX_EXEC_STARTED_MARKER}'; ${command}`; - const timeoutOverride = Number(process.env.NEMOCLAW_SANDBOX_EXEC_TIMEOUT_MS || ""); - const effectiveTimeout = - Number.isFinite(timeoutOverride) && timeoutOverride > 0 ? timeoutOverride : timeout; - try { - const result = spawnSync( - getOpenshellBinary(), - ["sandbox", "exec", "--name", sandboxName, "--", "sh", "-c", markedCommand], - { - cwd: ROOT, - encoding: "utf-8", - env: process.env, - stdio: ["ignore", "pipe", "pipe"], - timeout: effectiveTimeout, - }, - ); - if (result.error) return null; - const stdout = (result.stdout || "").trim(); - const stdoutLines = stdout.split(/\r?\n/); - const markerIndex = stdoutLines.indexOf(SANDBOX_EXEC_STARTED_MARKER); - if (markerIndex === -1) return null; - const commandStdoutLines = stdoutLines.slice(markerIndex + 1); - return { - status: result.status ?? 1, - stdout: commandStdoutLines.join("\n").trim(), - stderr: (result.stderr || "").trim(), - }; - } catch { - return null; - } -} - -async function executeSandboxExecCommandForStatus( - sandboxName: string, - command: string, -): Promise { - const markedCommand = `printf '%s\\n' '${SANDBOX_EXEC_STARTED_MARKER}'; ${command}`; - const result = await captureOpenshellForStatus( - ["sandbox", "exec", "--name", sandboxName, "--", "sh", "-c", markedCommand], - { ignoreError: true }, - ); - if (isCommandTimeout(result) || result.error) return null; - const stdout = (result.output || "").trim(); - const stdoutLines = stdout.split(/\r?\n/); - const markerIndex = stdoutLines.indexOf(SANDBOX_EXEC_STARTED_MARKER); - if (markerIndex === -1) return null; - const commandStdoutLines = stdoutLines.slice(markerIndex + 1); - return { - status: result.status ?? 1, - stdout: commandStdoutLines.join("\n").trim(), - stderr: "", - }; -} - -function parseSandboxGatewayProbe(result: SandboxCommandResult | null): boolean | null { - if (!result) return null; - if (result.stdout === "RUNNING") return true; - if (result.stdout === "STOPPED") return false; - return null; -} - -/** - * Check whether the OpenClaw gateway process is running inside the sandbox. - * Uses the gateway's HTTP endpoint (dashboard port) as the source of truth, - * since the gateway runs as a separate user and pgrep may not see it. - * Returns true (running), false (stopped), or null (cannot determine). - */ -function isSandboxGatewayRunning(sandboxName: string): boolean | null { - const agent = agentRuntime.getSessionAgent(sandboxName); - const probeUrl = agentRuntime.getHealthProbeUrl(agent); - const command = `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`; - const execProbe = parseSandboxGatewayProbe(executeSandboxExecCommand(sandboxName, command)); - if (execProbe !== null) return execProbe; - return parseSandboxGatewayProbe(executeSandboxCommand(sandboxName, command)); -} - -async function isSandboxGatewayRunningForStatus(sandboxName: string): Promise { - const agent = agentRuntime.getSessionAgent(sandboxName); - const probeUrl = agentRuntime.getHealthProbeUrl(agent); - const command = `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`; - return parseSandboxGatewayProbe(await executeSandboxExecCommandForStatus(sandboxName, command)); -} - -/** - * Restart the gateway process inside the sandbox after a pod restart. - * Cleans stale lock/temp files, sources proxy config, and launches the gateway - * in the background. Returns true on success. - */ -function recoverSandboxProcesses(sandboxName: string): boolean { - const agent = agentRuntime.getSessionAgent(sandboxName); - const agentScript = agentRuntime.buildRecoveryScript(agent, agent?.forwardPort ?? DASHBOARD_PORT); - const hasRecoveryMarker = (result: SandboxCommandResult | null) => - !!( - result && - (result.stdout.includes("GATEWAY_PID=") || result.stdout.includes("ALREADY_RUNNING")) - ); - const recoveredSsh = (result: SandboxCommandResult | null) => - !!(result && result.status === 0 && hasRecoveryMarker(result)); - - if (agentScript) { - // Non-OpenClaw manifests do not yet declare a runtime user for root - // sandbox exec. Recover them over SSH so the launch inherits the sandbox - // login user instead of creating root-owned agent state under /sandbox. - return recoveredSsh(executeSandboxCommand(sandboxName, agentScript)); - } - - const script = agentRuntime.buildOpenClawRecoveryScript(DASHBOARD_PORT); - const execResult = executeSandboxExecCommand(sandboxName, script, 30000); - if (hasRecoveryMarker(execResult)) return true; - if (execResult !== null) return false; - return recoveredSsh(executeSandboxCommand(sandboxName, script)); -} - -function readNonNegativeNumberEnv(name: string, fallback: number): number { - const raw = process.env[name]; - if (raw === undefined || raw.trim() === "") return fallback; - const parsed = Number(raw); - return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback; -} - -function waitForRecoveredSandboxGateway(sandboxName: string): boolean { - const timeoutSeconds = readNonNegativeNumberEnv( - "NEMOCLAW_GATEWAY_RECOVERY_WAIT_SECONDS", - 30, - ); - const intervalSeconds = readNonNegativeNumberEnv( - "NEMOCLAW_GATEWAY_RECOVERY_POLL_INTERVAL_SECONDS", - 3, - ); - const attempts = - intervalSeconds > 0 - ? Math.max(1, Math.floor(timeoutSeconds / intervalSeconds) + 1) - : Math.max(1, Math.floor(timeoutSeconds) + 1); - - for (let attempt = 0; attempt < attempts; attempt += 1) { - if (isSandboxGatewayRunning(sandboxName) === true) { - return true; - } - if (attempt < attempts - 1) { - sleepSeconds(intervalSeconds); - } - } - return false; -} - -/** - * Re-establish the dashboard port forward to the sandbox. - * Uses the agent's forward port when a non-OpenClaw agent is active. - */ -function ensureSandboxPortForward(sandboxName: string): void { - const agent = agentRuntime.getSessionAgent(sandboxName); - const port = agent ? String(agent.forwardPort) : DASHBOARD_FORWARD_PORT; - runOpenshell(["forward", "stop", port], { ignoreError: true }); - runOpenshell(["forward", "start", "--background", port, sandboxName], { - ignoreError: true, - }); -} - -/** - * Detect and recover from a sandbox that survived a gateway restart but - * whose OpenClaw processes are not running. Returns an object describing - * the outcome: { checked, wasRunning, recovered }. - */ -function checkAndRecoverSandboxProcesses( - sandboxName: string, - { quiet = false }: { quiet?: boolean } = {}, -) { - const running = isSandboxGatewayRunning(sandboxName); - if (running === null) { - return { checked: false, wasRunning: null, recovered: false }; - } - if (running) { - return { checked: true, wasRunning: true, recovered: false }; - } - - // Gateway not running — attempt recovery - const _recoveryAgent = agentRuntime.getSessionAgent(sandboxName); - if (!quiet) { - console.log(""); - console.log( - ` ${agentRuntime.getAgentDisplayName(_recoveryAgent)} gateway is not running inside the sandbox (sandbox likely restarted).`, - ); - console.log(" Recovering..."); - } - - const recovered = recoverSandboxProcesses(sandboxName); - if (recovered) { - // Wait for gateway to bind its HTTP port before declaring success. The - // recovered process can be alive before the OpenAI-compatible API is ready. - if (!waitForRecoveredSandboxGateway(sandboxName)) { - if (!quiet) { - console.error(" Gateway process started but is not responding."); - console.error(" Check /tmp/gateway.log inside the sandbox for details."); - } - return { checked: true, wasRunning: false, recovered: false }; - } - ensureSandboxPortForward(sandboxName); - if (!quiet) { - console.log( - ` ${G}✓${R} ${agentRuntime.getAgentDisplayName(_recoveryAgent)} gateway restarted inside sandbox.`, - ); - console.log(` ${G}✓${R} Dashboard port forward re-established.`); - } - } else if (!quiet) { - console.error( - ` Could not restart ${agentRuntime.getAgentDisplayName(_recoveryAgent)} gateway automatically.`, - ); - console.error(" Connect to the sandbox and run manually:"); - console.error(` ${agentRuntime.getGatewayCommand(_recoveryAgent)}`); - } - - return { checked: true, wasRunning: false, recovered }; -} - exports.runtimeBridge = { - sandboxConnect, sandboxDestroy, sandboxRebuild, sandboxStatus, @@ -473,19 +199,6 @@ function printOldLogsCompatibilityGuidance(installedVersion = null) { ); } -function exitWithSpawnResult(result: SpawnLikeResult & { signal?: NodeJS.Signals | null }) { - if (result.status !== null) { - process.exit(result.status); - } - - if (result.signal) { - const signalNumber = os.constants.signals[result.signal]; - process.exit(signalNumber ? 128 + signalNumber : 1); - } - - process.exit(1); -} - // ── Commands ───────────────────────────────────────────────────── async function runOclif(commandId: string, args: string[] = []): Promise { @@ -500,272 +213,6 @@ function printSandboxActionUsage(action: string): void { console.log(` Usage: ${CLI_NAME} ${action}`); } -// ── Sandbox-scoped actions ─────────────────────────────────────── - -type SandboxConnectOptions = { - probeOnly?: boolean; -}; - -const SANDBOX_CONNECT_FLAGS = new Set(["--dangerously-skip-permissions", "--probe-only", "--help", "-h"]); - -function isSandboxConnectFlag(arg: string | undefined): boolean { - return typeof arg === "string" && SANDBOX_CONNECT_FLAGS.has(arg); -} - -function printSandboxConnectHelp(sandboxName = "") { - console.log(""); - console.log(` Usage: ${CLI_NAME} ${sandboxName} connect [--probe-only]`); - console.log(""); - console.log(" Options:"); - console.log( - " --probe-only Run recovery checks and exit without opening SSH", - ); - console.log(" -h, --help Show this help"); - console.log(""); -} - -function parseSandboxConnectArgs(sandboxName: string, actionArgs: string[]): SandboxConnectOptions { - const options: SandboxConnectOptions = {}; - for (const arg of actionArgs) { - if (!isSandboxConnectFlag(arg)) { - console.error(` Unknown flag for connect: ${arg}`); - printSandboxConnectHelp(sandboxName); - process.exit(1); - } - switch (arg) { - case "--dangerously-skip-permissions": - console.error(" --dangerously-skip-permissions was removed; use shields commands instead."); - printSandboxConnectHelp(sandboxName); - process.exit(1); - case "--probe-only": - options.probeOnly = true; - break; - case "--help": - case "-h": - printSandboxConnectHelp(sandboxName); - process.exit(0); - break; - } - } - return options; -} - -function runSandboxConnectProbe(sandboxName: string): void { - const processCheck = checkAndRecoverSandboxProcesses(sandboxName, { quiet: true }); - const agent = agentRuntime.getSessionAgent(sandboxName); - const agentName = agentRuntime.getAgentDisplayName(agent); - if (!processCheck.checked) { - console.error( - ` Probe failed: could not inspect the ${agentName} gateway inside sandbox '${sandboxName}'.`, - ); - process.exit(1); - } - if (processCheck.wasRunning) { - console.log(` Probe complete: ${agentName} gateway is running in '${sandboxName}'.`); - return; - } - if (processCheck.recovered) { - console.log(` Probe complete: recovered ${agentName} gateway in '${sandboxName}'.`); - return; - } - console.error( - ` Probe failed: ${agentName} gateway is not running in '${sandboxName}' and automatic recovery failed.`, - ); - console.error(" Check /tmp/gateway.log inside the sandbox for details."); - process.exit(1); -} - -async function sandboxConnect( - sandboxName: string, - { probeOnly = false }: SandboxConnectOptions = {}, -) { - const { isSandboxReady, parseSandboxStatus } = require("./lib/onboard"); - await ensureLiveSandboxOrExit(sandboxName, { allowNonReadyPhase: true }); - - if (probeOnly) { - return runSandboxConnectProbe(sandboxName); - } - - // Version staleness check — warn but don't block - try { - const versionCheck = sandboxVersion.checkAgentVersion(sandboxName); - if (versionCheck.isStale) { - for (const line of sandboxVersion.formatStalenessWarning(sandboxName, versionCheck)) { - console.error(line); - } - } - } catch { - /* non-fatal — don't block connect on version check failure */ - } - - // Active session hint — inform if already connected in another terminal - try { - const opsBinConnect = resolveOpenshell(); - if (opsBinConnect) { - const sessionResult = getActiveSandboxSessions(sandboxName, createSessionDeps(opsBinConnect)); - if (sessionResult.detected && sessionResult.sessions.length > 0) { - const count = sessionResult.sessions.length; - console.log( - ` ${D}Note: ${count} existing SSH session${count > 1 ? "s" : ""} to '${sandboxName}' detected (another terminal).${R}`, - ); - } - } - } catch { - /* non-fatal — don't block connect on session detection failure */ - } - - checkAndRecoverSandboxProcesses(sandboxName); - // Ensure Ollama auth proxy is running (recovers from host reboots) - ensureOllamaAuthProxy(); - - // ── Inference route swap (#1248) ────────────────────────────────── - // When the user has multiple sandboxes with different providers, the - // cluster-wide inference.local route may still point at the *other* - // provider. Re-set it to match this sandbox's persisted config. - let sb; - try { - sb = registry.getSandbox(sandboxName); - if (sb && sb.provider && sb.model) { - const live = parseGatewayInference( - captureOpenshell(["inference", "get"], { - ignoreError: true, - timeout: OPENSHELL_PROBE_TIMEOUT_MS, - }).output, - ); - if (!live || live.provider !== sb.provider || live.model !== sb.model) { - console.log( - ` Switching inference route to ${sb.provider}/${sb.model} for sandbox '${sandboxName}'`, - ); - const swapResult = runOpenshell( - ["inference", "set", "--provider", sb.provider, "--model", sb.model, "--no-verify"], - { ignoreError: true }, - ); - if (swapResult.status !== 0) { - console.error( - ` ${YW}Warning: failed to switch inference route — connect will proceed anyway.${R}`, - ); - } - } - } - } catch { - /* non-fatal — don't block connect on inference route swap failure */ - } - - const rawTimeout = process.env.NEMOCLAW_CONNECT_TIMEOUT; - let timeout = 120; - if (rawTimeout !== undefined) { - const parsed = parseInt(rawTimeout, 10); - if (Number.isNaN(parsed) || parsed <= 0) { - console.warn( - ` Warning: invalid NEMOCLAW_CONNECT_TIMEOUT="${rawTimeout}", using default 120s`, - ); - } else { - timeout = parsed; - } - } - const interval = 3; - const startedAt = Date.now(); - const deadline = startedAt + timeout * 1000; - const elapsedSec = () => Math.floor((Date.now() - startedAt) / 1000); - const remainingMs = () => Math.max(1, deadline - Date.now()); - const runSandboxList = () => - captureOpenshell(["sandbox", "list"], { - ignoreError: true, - timeout: remainingMs(), - }).output; - - const list = runSandboxList(); - if (!isSandboxReady(list, sandboxName)) { - const status = parseSandboxStatus(list, sandboxName); - const TERMINAL = new Set([ - "Failed", - "Error", - "CrashLoopBackOff", - "ImagePullBackOff", - "Unknown", - "Evicted", - ]); - if (status && TERMINAL.has(status)) { - console.error(""); - console.error(` Sandbox '${sandboxName}' is in '${status}' state.`); - console.error(` Run: ${CLI_NAME} ${sandboxName} logs --follow`); - console.error(` Run: ${CLI_NAME} ${sandboxName} status`); - process.exit(1); - } - - console.log(` Waiting for sandbox '${sandboxName}' to be ready...`); - let ready = false; - let everSeen = status !== null; - while (Date.now() < deadline) { - const sleepFor = Math.min(interval, remainingMs() / 1000); - if (sleepFor <= 0) break; - spawnSync("sleep", [String(sleepFor)]); - const poll = runSandboxList(); - const elapsed = elapsedSec(); - if (isSandboxReady(poll, sandboxName)) { - ready = true; - break; - } - const cur = parseSandboxStatus(poll, sandboxName) || "unknown"; - if (cur !== "unknown") everSeen = true; - if (TERMINAL.has(cur)) { - console.error(""); - console.error(` Sandbox '${sandboxName}' entered '${cur}' state.`); - console.error(` Run: ${CLI_NAME} ${sandboxName} logs --follow`); - console.error(` Run: ${CLI_NAME} ${sandboxName} status`); - process.exit(1); - } - if (!everSeen && elapsed >= 30) { - console.error(""); - console.error(` Sandbox '${sandboxName}' not found after ${elapsed}s.`); - console.error(` Check: openshell sandbox list`); - process.exit(1); - } - process.stdout.write(`\r Status: ${cur.padEnd(20)} (${elapsed}s elapsed)`); - } - - if (!ready) { - console.error(""); - console.error(` Timed out after ${timeout}s waiting for sandbox '${sandboxName}'.`); - console.error(` Check: openshell sandbox list`); - console.error( - ` Override timeout: NEMOCLAW_CONNECT_TIMEOUT=300 ${CLI_NAME} ${sandboxName} connect`, - ); - process.exit(1); - } - console.log(`\r Status: ${"Ready".padEnd(20)} (${elapsedSec()}s elapsed)`); - console.log(" Sandbox is ready. Connecting..."); - } - - // Print a one-shot hint before dropping the user into the sandbox - // shell so a fresh user knows the first thing to type. Without this, - // `nemoclaw connect` lands on a bare bash prompt and users - // ask "now what?" — see #465. Suppress the hint when stdout isn't a - // TTY so scripted callers don't get noise in their pipelines. - if ( - process.stdout.isTTY && - !["1", "true"].includes(String(process.env.NEMOCLAW_NO_CONNECT_HINT || "")) - ) { - console.log(""); - const agentName = sb?.agent || "openclaw"; - const agentCmd = agentName === "openclaw" ? "openclaw tui" : agentName; - console.log(` ${G}✓${R} Connecting to sandbox '${sandboxName}'`); - console.log( - ` ${D}Inside the sandbox, run \`${agentCmd}\` to start chatting with the agent.${R}`, - ); - console.log( - ` ${D}Type \`/exit\` to leave the chat, then \`exit\` to return to the host shell.${R}`, - ); - console.log(""); - } - const result = spawnSync(getOpenshellBinary(), ["sandbox", "connect", sandboxName], { - stdio: "inherit", - cwd: ROOT, - env: process.env, - }); - exitWithSpawnResult(result); -} - function captureHostCommand( command: string, args: string[], From e9dd46e1c47bb78d5e5ac8c8372f52411f85b533 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 19:01:15 -0700 Subject: [PATCH 04/65] refactor(cli): extract sandbox status action --- src/lib/nemoclaw-runtime-bridge.ts | 1 - src/lib/sandbox-runtime-actions.ts | 5 +- src/lib/sandbox-status-action.ts | 246 +++++++++++++++++++++++++++++ src/nemoclaw.ts | 235 +-------------------------- 4 files changed, 253 insertions(+), 234 deletions(-) create mode 100644 src/lib/sandbox-status-action.ts diff --git a/src/lib/nemoclaw-runtime-bridge.ts b/src/lib/nemoclaw-runtime-bridge.ts index e5ef72db18..6f73af71ab 100644 --- a/src/lib/nemoclaw-runtime-bridge.ts +++ b/src/lib/nemoclaw-runtime-bridge.ts @@ -6,7 +6,6 @@ export interface NemoClawRuntimeBridge { sandboxDestroy: (sandboxName: string, args?: string[]) => Promise; sandboxRebuild: (sandboxName: string, args?: string[]) => Promise; - sandboxStatus: (sandboxName: string) => Promise; upgradeSandboxes: (args?: string[]) => Promise; } diff --git a/src/lib/sandbox-runtime-actions.ts b/src/lib/sandbox-runtime-actions.ts index ea46d6e3be..e3ca1b4733 100644 --- a/src/lib/sandbox-runtime-actions.ts +++ b/src/lib/sandbox-runtime-actions.ts @@ -17,7 +17,10 @@ export async function connectSandbox( } export async function showSandboxStatus(sandboxName: string): Promise { - await getNemoClawRuntimeBridge().sandboxStatus(sandboxName); + const { showSandboxStatus: showExtractedSandboxStatus } = require("./sandbox-status-action") as { + showSandboxStatus: (sandboxName: string) => Promise; + }; + await showExtractedSandboxStatus(sandboxName); } export function showSandboxLogs(sandboxName: string, follow: boolean): void { diff --git a/src/lib/sandbox-status-action.ts b/src/lib/sandbox-status-action.ts new file mode 100644 index 0000000000..0f844264ff --- /dev/null +++ b/src/lib/sandbox-status-action.ts @@ -0,0 +1,246 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/* v8 ignore start -- exercised through CLI subprocess status tests. */ + +import { CLI_DISPLAY_NAME, CLI_NAME } from "./branding"; +import { parseSandboxPhase } from "./gateway-state"; +import { getNamedGatewayLifecycleState } from "./gateway-runtime-action"; +import { probeProviderHealth } from "./inference-health"; +import { parseGatewayInference } from "./inference-config"; +import * as nim from "./nim"; +import * as onboardSession from "./onboard-session"; +import type { Session } from "./onboard-session"; +import { + captureOpenshellForStatus, + isCommandTimeout, +} from "./openshell-runtime"; +import * as registry from "./registry"; +import { + getReconciledSandboxGatewayState, + getSandboxGatewayStateForStatus, + printGatewayLifecycleHint, + printWrongGatewayActiveGuidance, +} from "./sandbox-gateway-state-action"; +import { isSandboxGatewayRunningForStatus } from "./sandbox-process-recovery-action"; +import { + createSystemDeps as createSessionDeps, + getActiveSandboxSessions, +} from "./sandbox-session-state"; +import * as sandboxVersion from "./sandbox-version"; +import * as shields from "./shields"; +import { resolveOpenshell } from "./resolve-openshell"; +import { D, G, R, RD, YW } from "./terminal-style"; + +const agentRuntime = require("../../bin/lib/agent-runtime"); + +// eslint-disable-next-line complexity +export async function showSandboxStatus(sandboxName: string): Promise { + const sb = registry.getSandbox(sandboxName); + const liveResult = await captureOpenshellForStatus(["inference", "get"], { + ignoreError: true, + }); + const live = parseGatewayInference(isCommandTimeout(liveResult) ? "" : liveResult.output); + const currentModel = (live && live.model) || (sb && sb.model) || "unknown"; + const currentProvider = (live && live.provider) || (sb && sb.provider) || "unknown"; + const inferenceHealth = + typeof currentProvider === "string" ? probeProviderHealth(currentProvider) : null; + if (sb) { + console.log(""); + console.log(` Sandbox: ${sb.name}`); + console.log(` Model: ${currentModel}`); + console.log(` Provider: ${currentProvider}`); + if (inferenceHealth) { + if (!inferenceHealth.probed) { + console.log(` Inference: ${D}not probed${R} (${inferenceHealth.detail})`); + } else if (inferenceHealth.ok) { + console.log(` Inference: ${G}healthy${R} (${inferenceHealth.endpoint})`); + } else { + console.log(` Inference: ${RD}unreachable${R} (${inferenceHealth.endpoint})`); + console.log(` ${inferenceHealth.detail}`); + } + } + console.log(` GPU: ${sb.gpuEnabled ? "yes" : "no"}`); + console.log(` Policies: ${(sb.policies || []).join(", ") || "none"}`); + + // Active session indicator + try { + const opsBinStatus = resolveOpenshell(); + if (opsBinStatus) { + const sessionResult = getActiveSandboxSessions( + sandboxName, + createSessionDeps(opsBinStatus), + ); + if (sessionResult.detected) { + const count = sessionResult.sessions.length; + console.log( + ` Connected: ${count > 0 ? `${G}yes${R} (${count} session${count > 1 ? "s" : ""})` : "no"}`, + ); + } + } + } catch { + /* non-fatal */ + } + + if (shields.isShieldsDown(sandboxName)) { + console.log(" Permissions: shields down (check `shields status` for details)"); + } + + // Agent version check + try { + const versionCheck = sandboxVersion.checkAgentVersion(sandboxName, { skipProbe: true }); + const agent = agentRuntime.getSessionAgent(sandboxName); + const agentName = agentRuntime.getAgentDisplayName(agent); + if (versionCheck.sandboxVersion) { + console.log(` Agent: ${agentName} v${versionCheck.sandboxVersion}`); + } + if (versionCheck.isStale) { + console.log(` ${YW}Update: v${versionCheck.expectedVersion} available${R}`); + console.log(` Run \`${CLI_NAME} ${sandboxName} rebuild\` to upgrade`); + } + } catch { + /* non-fatal */ + } + } + + const lookup = await getReconciledSandboxGatewayState(sandboxName, { + getState: getSandboxGatewayStateForStatus, + }); + if (lookup.state === "present") { + console.log(""); + if ("recoveredGateway" in lookup && lookup.recoveredGateway) { + console.log( + ` Recovered ${CLI_DISPLAY_NAME} gateway runtime via ${("recoveryVia" in lookup ? lookup.recoveryVia : null) || "gateway reattach"}.`, + ); + console.log(""); + } + console.log(lookup.output); + const phase = parseSandboxPhase(lookup.output || ""); + if (phase && phase !== "Ready") { + console.log(""); + console.log(` Sandbox '${sandboxName}' is stuck in '${phase}' phase.`); + console.log( + " This usually happens when a process crash inside the sandbox prevented clean startup.", + ); + console.log(""); + console.log( + ` Run \`${CLI_NAME} ${sandboxName} rebuild --yes\` to recreate the sandbox (--yes skips the confirmation prompt; workspace state will be preserved).`, + ); + } + } else if (lookup.state === "wrong_gateway_active") { + const activeGateway = + "activeGateway" in lookup && typeof lookup.activeGateway === "string" + ? lookup.activeGateway + : undefined; + console.log(""); + printWrongGatewayActiveGuidance(sandboxName, activeGateway, console.log); + } else if (lookup.state === "missing") { + // Belt-and-suspenders: only destroy registry state if the nemoclaw gateway + // is demonstrably the healthy active gateway. Guards against regressions + // in the reconciler. + const guard = getNamedGatewayLifecycleState(); + if (guard.state !== "healthy_named") { + console.log(""); + if (guard.state === "connected_other") { + printWrongGatewayActiveGuidance(sandboxName, guard.activeGateway, console.log); + } else { + printGatewayLifecycleHint(guard.status || "", sandboxName, console.log); + } + } else { + registry.removeSandbox(sandboxName); + const session = onboardSession.loadSession(); + if (session && session.sandboxName === sandboxName) { + onboardSession.updateSession((s: Session) => { + s.sandboxName = null; + return s; + }); + } + console.log(""); + console.log(` Sandbox '${sandboxName}' is not present in the live OpenShell gateway.`); + console.log(" Removed stale local registry entry."); + } + } else if (lookup.state === "identity_drift") { + console.log(""); + console.log( + ` Sandbox '${sandboxName}' is recorded locally, but the gateway trust material rotated after restart.`, + ); + if (lookup.output) { + console.log(lookup.output); + } + console.log( + " Existing sandbox connections cannot be reattached safely after this gateway identity change.", + ); + console.log( + ` Recreate this sandbox with \`${CLI_NAME} onboard\` once the gateway runtime is stable.`, + ); + } else if (lookup.state === "gateway_unreachable_after_restart") { + console.log(""); + console.log( + ` Sandbox '${sandboxName}' may still exist, but the selected ${CLI_DISPLAY_NAME} gateway is still refusing connections after restart.`, + ); + if (lookup.output) { + console.log(lookup.output); + } + console.log( + " Retry `openshell gateway start --name nemoclaw` and verify `openshell status` is healthy before reconnecting.", + ); + console.log( + " If the gateway never becomes healthy, rebuild the gateway and then recreate the affected sandbox.", + ); + } else if (lookup.state === "gateway_missing_after_restart") { + console.log(""); + console.log( + ` Sandbox '${sandboxName}' may still exist locally, but the ${CLI_DISPLAY_NAME} gateway is no longer configured after restart/rebuild.`, + ); + if (lookup.output) { + console.log(lookup.output); + } + console.log( + " Start the gateway again with `openshell gateway start --name nemoclaw` before retrying.", + ); + console.log( + " If the gateway had to be rebuilt from scratch, recreate the affected sandbox afterward.", + ); + } else { + console.log(""); + console.log(` Could not verify sandbox '${sandboxName}' against the live OpenShell gateway.`); + if (lookup.output) { + console.log(lookup.output); + } + printGatewayLifecycleHint(lookup.output, sandboxName, console.log); + } + + // OpenClaw process health inside the sandbox + if (lookup.state === "present") { + const running = await isSandboxGatewayRunningForStatus(sandboxName); + if (running !== null) { + const sessionAgent = agentRuntime.getSessionAgent(sandboxName); + const sessionAgentName = agentRuntime.getAgentDisplayName(sessionAgent); + if (running) { + console.log(` ${sessionAgentName}: ${G}running${R}`); + } else { + console.log(` ${sessionAgentName}: ${RD}not running${R}`); + console.log(""); + console.log(` The sandbox is alive but the ${sessionAgentName} gateway process is not running.`); + console.log(" This typically happens after a gateway restart (e.g., laptop close/open)."); + console.log(""); + console.log(" To recover, run:"); + console.log(` ${D}${CLI_NAME} ${sandboxName} connect${R} (auto-recovers on connect)`); + console.log(" Or manually inside the sandbox:"); + console.log(` ${D}${agentRuntime.getGatewayCommand(sessionAgent)}${R}`); + } + } + } + + const nimStat = + sb && sb.nimContainer ? nim.nimStatusByName(sb.nimContainer) : nim.nimStatus(sandboxName); + if (nim.shouldShowNimLine(sb && sb.nimContainer, nimStat.running)) { + console.log( + ` NIM: ${nimStat.running ? `running (${nimStat.container})` : "not running"}`, + ); + if (nimStat.running) { + console.log(` Healthy: ${nimStat.healthy ? "yes" : "no"}`); + } + } + console.log(""); +} diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index 4448b603f1..012a303bd2 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -52,39 +52,24 @@ const { parseLiveSandboxNames } = require("./lib/runtime-recovery"); const { stripAnsi } = require("./lib/openshell"); const { captureOpenshell, - captureOpenshellForStatus, getInstalledOpenshellVersionOrNull, - isCommandTimeout, runOpenshell, } = require("./lib/openshell-runtime"); -const { - getNamedGatewayLifecycleState, - recoverNamedGatewayRuntime, -} = require("./lib/gateway-runtime-action"); +const { recoverNamedGatewayRuntime } = require("./lib/gateway-runtime-action"); const { recoverRegistryEntries } = require("./lib/registry-recovery-action"); -const { - ensureLiveSandboxOrExit, - getReconciledSandboxGatewayState, - getSandboxGatewayStateForStatus, - printGatewayLifecycleHint, - printWrongGatewayActiveGuidance, -} = require("./lib/sandbox-gateway-state-action"); +const { ensureLiveSandboxOrExit } = require("./lib/sandbox-gateway-state-action"); const { isSandboxConnectFlag, parseSandboxConnectArgs, printSandboxConnectHelp, } = require("./lib/sandbox-connect-action"); -const { - executeSandboxCommand, - isSandboxGatewayRunningForStatus, -} = require("./lib/sandbox-process-recovery-action"); +const { executeSandboxCommand } = require("./lib/sandbox-process-recovery-action"); const { runRegisteredOclifCommand } = require("./lib/oclif-runner"); const { isErrnoException }: typeof import("./lib/errno") = require("./lib/errno"); const agentRuntime = require("../bin/lib/agent-runtime"); const sandboxVersion = require("./lib/sandbox-version"); const sandboxState = require("./lib/sandbox-state"); const { parseRestoreArgs } = sandboxState; -const { parseSandboxPhase } = require("./lib/gateway-state"); const { getActiveSandboxSessions, createSystemDeps: createSessionDeps, @@ -182,7 +167,6 @@ function getSandboxDeleteOutcome(deleteResult: SpawnLikeResult) { exports.runtimeBridge = { sandboxDestroy, sandboxRebuild, - sandboxStatus, upgradeSandboxes, }; /** Print user-facing guidance when OpenShell is too old to support `openshell logs`. */ @@ -799,219 +783,6 @@ async function sandboxDoctor(sandboxName: string, args: string[] = []): Promise< if (exitCode !== 0) process.exit(exitCode); } -// eslint-disable-next-line complexity -async function sandboxStatus(sandboxName: string) { - const sb = registry.getSandbox(sandboxName); - const liveResult = await captureOpenshellForStatus(["inference", "get"], { - ignoreError: true, - }); - const live = parseGatewayInference( - isCommandTimeout(liveResult) ? "" : liveResult.output, - ); - const currentModel = (live && live.model) || (sb && sb.model) || "unknown"; - const currentProvider = (live && live.provider) || (sb && sb.provider) || "unknown"; - const inferenceHealth = - typeof currentProvider === "string" ? probeProviderHealth(currentProvider) : null; - if (sb) { - console.log(""); - console.log(` Sandbox: ${sb.name}`); - console.log(` Model: ${currentModel}`); - console.log(` Provider: ${currentProvider}`); - if (inferenceHealth) { - if (!inferenceHealth.probed) { - console.log(` Inference: ${D}not probed${R} (${inferenceHealth.detail})`); - } else if (inferenceHealth.ok) { - console.log(` Inference: ${G}healthy${R} (${inferenceHealth.endpoint})`); - } else { - console.log(` Inference: ${_RD}unreachable${R} (${inferenceHealth.endpoint})`); - console.log(` ${inferenceHealth.detail}`); - } - } - console.log(` GPU: ${sb.gpuEnabled ? "yes" : "no"}`); - console.log(` Policies: ${(sb.policies || []).join(", ") || "none"}`); - - // Active session indicator - try { - const opsBinStatus = resolveOpenshell(); - if (opsBinStatus) { - const sessionResult = getActiveSandboxSessions( - sandboxName, - createSessionDeps(opsBinStatus), - ); - if (sessionResult.detected) { - const count = sessionResult.sessions.length; - console.log( - ` Connected: ${count > 0 ? `${G}yes${R} (${count} session${count > 1 ? "s" : ""})` : "no"}`, - ); - } - } - } catch { - /* non-fatal */ - } - - if (shields.isShieldsDown(sandboxName)) { - console.log(` Permissions: shields down (check \`shields status\` for details)`); - } - - // Agent version check - try { - const versionCheck = sandboxVersion.checkAgentVersion(sandboxName, { skipProbe: true }); - const agent = agentRuntime.getSessionAgent(sandboxName); - const agentName = agentRuntime.getAgentDisplayName(agent); - if (versionCheck.sandboxVersion) { - console.log(` Agent: ${agentName} v${versionCheck.sandboxVersion}`); - } - if (versionCheck.isStale) { - console.log(` ${YW}Update: v${versionCheck.expectedVersion} available${R}`); - console.log(` Run \`${CLI_NAME} ${sandboxName} rebuild\` to upgrade`); - } - } catch { - /* non-fatal */ - } - } - - const lookup = await getReconciledSandboxGatewayState(sandboxName, { - getState: getSandboxGatewayStateForStatus, - }); - if (lookup.state === "present") { - console.log(""); - if ("recoveredGateway" in lookup && lookup.recoveredGateway) { - console.log( - ` Recovered ${CLI_DISPLAY_NAME} gateway runtime via ${("recoveryVia" in lookup ? lookup.recoveryVia : null) || "gateway reattach"}.`, - ); - console.log(""); - } - console.log(lookup.output); - const phase = parseSandboxPhase(lookup.output || ""); - if (phase && phase !== "Ready") { - console.log(""); - console.log(` Sandbox '${sandboxName}' is stuck in '${phase}' phase.`); - console.log( - " This usually happens when a process crash inside the sandbox prevented clean startup.", - ); - console.log(""); - console.log( - ` Run \`${CLI_NAME} ${sandboxName} rebuild --yes\` to recreate the sandbox (--yes skips the confirmation prompt; workspace state will be preserved).`, - ); - } - } else if (lookup.state === "wrong_gateway_active") { - const activeGateway = - "activeGateway" in lookup && typeof lookup.activeGateway === "string" - ? lookup.activeGateway - : undefined; - console.log(""); - printWrongGatewayActiveGuidance(sandboxName, activeGateway, console.log); - } else if (lookup.state === "missing") { - // Belt-and-suspenders: only destroy registry state if the nemoclaw gateway - // is demonstrably the healthy active gateway. Guards against regressions - // in the reconciler. - const guard = getNamedGatewayLifecycleState(); - if (guard.state !== "healthy_named") { - console.log(""); - if (guard.state === "connected_other") { - printWrongGatewayActiveGuidance(sandboxName, guard.activeGateway, console.log); - } else { - printGatewayLifecycleHint(guard.status || "", sandboxName, console.log); - } - } else { - registry.removeSandbox(sandboxName); - const session = onboardSession.loadSession(); - if (session && session.sandboxName === sandboxName) { - onboardSession.updateSession((s: Session) => { - s.sandboxName = null; - return s; - }); - } - console.log(""); - console.log(` Sandbox '${sandboxName}' is not present in the live OpenShell gateway.`); - console.log(" Removed stale local registry entry."); - } - } else if (lookup.state === "identity_drift") { - console.log(""); - console.log( - ` Sandbox '${sandboxName}' is recorded locally, but the gateway trust material rotated after restart.`, - ); - if (lookup.output) { - console.log(lookup.output); - } - console.log( - " Existing sandbox connections cannot be reattached safely after this gateway identity change.", - ); - console.log( - ` Recreate this sandbox with \`${CLI_NAME} onboard\` once the gateway runtime is stable.`, - ); - } else if (lookup.state === "gateway_unreachable_after_restart") { - console.log(""); - console.log( - ` Sandbox '${sandboxName}' may still exist, but the selected ${CLI_DISPLAY_NAME} gateway is still refusing connections after restart.`, - ); - if (lookup.output) { - console.log(lookup.output); - } - console.log( - " Retry `openshell gateway start --name nemoclaw` and verify `openshell status` is healthy before reconnecting.", - ); - console.log( - " If the gateway never becomes healthy, rebuild the gateway and then recreate the affected sandbox.", - ); - } else if (lookup.state === "gateway_missing_after_restart") { - console.log(""); - console.log( - ` Sandbox '${sandboxName}' may still exist locally, but the ${CLI_DISPLAY_NAME} gateway is no longer configured after restart/rebuild.`, - ); - if (lookup.output) { - console.log(lookup.output); - } - console.log( - " Start the gateway again with `openshell gateway start --name nemoclaw` before retrying.", - ); - console.log( - " If the gateway had to be rebuilt from scratch, recreate the affected sandbox afterward.", - ); - } else { - console.log(""); - console.log(` Could not verify sandbox '${sandboxName}' against the live OpenShell gateway.`); - if (lookup.output) { - console.log(lookup.output); - } - printGatewayLifecycleHint(lookup.output, sandboxName, console.log); - } - - // OpenClaw process health inside the sandbox - if (lookup.state === "present") { - const running = await isSandboxGatewayRunningForStatus(sandboxName); - if (running !== null) { - const _sa = agentRuntime.getSessionAgent(sandboxName); - const _saName = agentRuntime.getAgentDisplayName(_sa); - if (running) { - console.log(` ${_saName}: ${G}running${R}`); - } else { - console.log(` ${_saName}: ${_RD}not running${R}`); - console.log(""); - console.log(` The sandbox is alive but the ${_saName} gateway process is not running.`); - console.log(" This typically happens after a gateway restart (e.g., laptop close/open)."); - console.log(""); - console.log(" To recover, run:"); - console.log(` ${D}${CLI_NAME} ${sandboxName} connect${R} (auto-recovers on connect)`); - console.log(" Or manually inside the sandbox:"); - console.log(` ${D}${agentRuntime.getGatewayCommand(_sa)}${R}`); - } - } - } - - const nimStat = - sb && sb.nimContainer ? nim.nimStatusByName(sb.nimContainer) : nim.nimStatus(sandboxName); - if (nim.shouldShowNimLine(sb && sb.nimContainer, nimStat.running)) { - console.log( - ` NIM: ${nimStat.running ? `running (${nimStat.container})` : "not running"}`, - ); - if (nimStat.running) { - console.log(` Healthy: ${nimStat.healthy ? "yes" : "no"}`); - } - } - console.log(""); -} - function cleanupSandboxServices( sandboxName: string, { stopHostServices = false }: { stopHostServices?: boolean } = {}, From aab6c86329b5cd07cdd0e170ef532a7838b8b7e0 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 19:10:11 -0700 Subject: [PATCH 05/65] refactor(cli): extract sandbox doctor action --- src/lib/legacy-oclif-dispatch.test.ts | 8 + src/lib/legacy-oclif-dispatch.ts | 4 +- src/lib/oclif-commands.ts | 2 + src/lib/sandbox-doctor-action.ts | 641 ++++++++++++++++++++++++++ src/lib/sandbox-doctor-cli-command.ts | 21 + src/nemoclaw.ts | 608 +----------------------- 6 files changed, 676 insertions(+), 608 deletions(-) create mode 100644 src/lib/sandbox-doctor-action.ts create mode 100644 src/lib/sandbox-doctor-cli-command.ts diff --git a/src/lib/legacy-oclif-dispatch.test.ts b/src/lib/legacy-oclif-dispatch.test.ts index 953681162e..49321743ff 100644 --- a/src/lib/legacy-oclif-dispatch.test.ts +++ b/src/lib/legacy-oclif-dispatch.test.ts @@ -20,4 +20,12 @@ describe("resolveSandboxOclifDispatch", () => { usage: "status", }); }); + + it("routes sandbox doctor through oclif", () => { + expect(resolveSandboxOclifDispatch("alpha", "doctor", ["--json"])).toEqual({ + kind: "oclif", + commandId: "sandbox:doctor", + args: ["alpha", "--json"], + }); + }); }); diff --git a/src/lib/legacy-oclif-dispatch.ts b/src/lib/legacy-oclif-dispatch.ts index 14dfdfaa4c..bfd3c60d77 100644 --- a/src/lib/legacy-oclif-dispatch.ts +++ b/src/lib/legacy-oclif-dispatch.ts @@ -21,7 +21,7 @@ export type UsageErrorDispatch = { export type LegacyDispatch = { kind: "legacy"; - target: "doctor" | "policy-add" | "skill" | "snapshot"; + target: "policy-add" | "skill" | "snapshot"; }; export type UnknownSubcommandDispatch = { @@ -112,7 +112,7 @@ export function resolveSandboxOclifDispatch( if (hasHelpFlag(actionArgs)) return { kind: "help", usage: "logs [--follow]" }; return { kind: "oclif", commandId: "sandbox:logs", args: [sandboxName, ...actionArgs] }; case "doctor": - return { kind: "legacy", target: "doctor" }; + return { kind: "oclif", commandId: "sandbox:doctor", args: [sandboxName, ...actionArgs] }; case "policy-add": if (hasHelpFlag(actionArgs)) { return { diff --git a/src/lib/oclif-commands.ts b/src/lib/oclif-commands.ts index 33821fa82f..37c60ab4a7 100644 --- a/src/lib/oclif-commands.ts +++ b/src/lib/oclif-commands.ts @@ -31,6 +31,7 @@ import { } from "./maintenance-cli-commands"; import { PolicyAddCommand, PolicyRemoveCommand } from "./policy-mutate-cli-commands"; import RebuildCliCommand from "./rebuild-cli-command"; +import SandboxDoctorCliCommand from "./sandbox-doctor-cli-command"; import { SandboxChannelsListCommand, SandboxConfigGetCommand, @@ -78,6 +79,7 @@ export default { "sandbox:config:get": SandboxConfigGetCommand, "sandbox:connect": ConnectCliCommand, "sandbox:destroy": DestroyCliCommand, + "sandbox:doctor": SandboxDoctorCliCommand, "sandbox:logs": SandboxLogsCommand, "sandbox:policy-add": PolicyAddCommand, "sandbox:policy-list": SandboxPolicyListCommand, diff --git a/src/lib/sandbox-doctor-action.ts b/src/lib/sandbox-doctor-action.ts new file mode 100644 index 0000000000..50559b21b9 --- /dev/null +++ b/src/lib/sandbox-doctor-action.ts @@ -0,0 +1,641 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/* v8 ignore start -- exercised through CLI subprocess doctor tests. */ + +import { execFileSync, spawnSync } from "node:child_process"; +import fs from "node:fs"; +import path from "node:path"; + +import { CLI_DISPLAY_NAME, CLI_NAME } from "./branding"; +import { isErrnoException } from "./errno"; +import { recoverNamedGatewayRuntime } from "./gateway-runtime-action"; +import { probeProviderHealth } from "./inference-health"; +import { parseGatewayInference } from "./inference-config"; +import { stripAnsi } from "./openshell"; +import { captureOpenshell } from "./openshell-runtime"; +import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts"; +import { GATEWAY_PORT, OLLAMA_PORT } from "./ports"; +import * as registry from "./registry"; +import type { SandboxEntry } from "./registry"; +import { resolveOpenshell } from "./resolve-openshell"; +import { ROOT } from "./runner"; +import { parseLiveSandboxNames } from "./runtime-recovery"; +import * as sandboxVersion from "./sandbox-version"; +import * as shields from "./shields"; +import { buildStatusCommandDeps } from "./status-command-deps"; +import { B, D, G, R, RD, YW } from "./terminal-style"; + +const agentRuntime = require("../../bin/lib/agent-runtime"); + +const NEMOCLAW_GATEWAY_NAME = "nemoclaw"; + +type DoctorStatus = "ok" | "warn" | "fail" | "info"; + +type DoctorCheck = { + group: string; + label: string; + status: DoctorStatus; + detail: string; + hint?: string; +}; + +type CommandCapture = { + status: number; + stdout: string; + stderr: string; + error?: Error; +}; + +function captureHostCommand( + command: string, + args: string[], + timeout = 5000, +): CommandCapture { + const result = spawnSync(command, args, { + cwd: ROOT, + env: process.env, + encoding: "utf-8", + stdio: ["ignore", "pipe", "pipe"], + timeout, + }); + return { + status: result.status ?? (result.error ? 1 : 0), + stdout: String(result.stdout || ""), + stderr: String(result.stderr || ""), + error: result.error, + }; +} + +function oneLine(value = ""): string { + return String(value).replace(/\s+/g, " ").trim(); +} + +function doctorSummary(checks: DoctorCheck[]): { + status: DoctorStatus; + failed: number; + warned: number; +} { + const failed = checks.filter((check) => check.status === "fail").length; + const warned = checks.filter((check) => check.status === "warn").length; + if (failed > 0) return { status: "fail", failed, warned }; + if (warned > 0) return { status: "warn", failed, warned }; + return { status: "ok", failed, warned }; +} + +function doctorStatusLabel(status: DoctorStatus): string { + switch (status) { + case "ok": + return `${G}[ok]${R}`; + case "warn": + return `${YW}[warn]${R}`; + case "fail": + return `${RD}[fail]${R}`; + case "info": + return `${D}[info]${R}`; + default: + return `[${status}]`; + } +} + +function renderDoctorReport(sandboxName: string, checks: DoctorCheck[], asJson: boolean): number { + const summary = doctorSummary(checks); + if (asJson) { + console.log( + JSON.stringify( + { + schemaVersion: 1, + sandbox: sandboxName, + status: summary.status, + failed: summary.failed, + warnings: summary.warned, + checks, + }, + null, + 2, + ), + ); + return summary.failed > 0 ? 1 : 0; + } + + console.log(""); + console.log(` ${B}${CLI_DISPLAY_NAME} doctor:${R} ${sandboxName}`); + const groupOrder = ["Host", "Gateway", "Sandbox", "Inference", "Messaging", "Local services"]; + const orderedGroups = [ + ...groupOrder, + ...checks + .map((check) => check.group) + .filter((group, index, all) => !groupOrder.includes(group) && all.indexOf(group) === index), + ]; + for (const group of orderedGroups) { + const groupChecks = checks.filter((check) => check.group === group); + if (groupChecks.length === 0) continue; + console.log(""); + console.log(` ${G}${group}:${R}`); + for (const check of groupChecks) { + console.log(` ${doctorStatusLabel(check.status)} ${check.label}: ${check.detail}`); + if (check.hint) { + console.log(` ${D}hint: ${check.hint}${R}`); + } + } + } + + console.log(""); + if (summary.status === "ok") { + console.log(` Summary: ${G}healthy${R}`); + } else if (summary.status === "warn") { + console.log(` Summary: ${YW}healthy with ${summary.warned} warning(s)${R}`); + } else { + console.log( + ` Summary: ${RD}attention needed${R} (${summary.failed} failed, ${summary.warned} warning(s))`, + ); + } + console.log(""); + return summary.failed > 0 ? 1 : 0; +} + +function dockerInspectGateway(containerName: string): DoctorCheck[] { + const checks: DoctorCheck[] = []; + const inspect = captureHostCommand( + "docker", + [ + "inspect", + "--format", + "{{.State.Running}}\t{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}\t{{.Config.Image}}", + containerName, + ], + 5000, + ); + if (inspect.status !== 0) { + checks.push({ + group: "Gateway", + label: "Docker container", + status: "fail", + detail: `${containerName} not found or not inspectable`, + hint: "run `docker ps --filter name=openshell-cluster-nemoclaw`", + }); + return checks; + } + + const [runningRaw, healthRaw, imageRaw] = inspect.stdout.trim().split("\t"); + const running = runningRaw === "true"; + const health = healthRaw || "none"; + const image = imageRaw || "unknown"; + const healthOk = health === "healthy" || health === "none"; + checks.push({ + group: "Gateway", + label: "Docker container", + status: running && healthOk ? "ok" : "fail", + detail: `${containerName} ${running ? "running" : "stopped"} (${health}; ${image})`, + hint: running ? undefined : "restart the gateway with `openshell gateway start --name nemoclaw`", + }); + + const port = captureHostCommand("docker", ["port", containerName, "30051/tcp"], 5000); + if (port.status === 0 && port.stdout.trim()) { + const mapping = oneLine(port.stdout); + checks.push({ + group: "Gateway", + label: "Port mapping", + status: mapping.includes(`:${GATEWAY_PORT}`) ? "ok" : "warn", + detail: mapping, + hint: mapping.includes(`:${GATEWAY_PORT}`) + ? undefined + : `expected host port ${GATEWAY_PORT} from NEMOCLAW_GATEWAY_PORT`, + }); + } else { + checks.push({ + group: "Gateway", + label: "Port mapping", + status: "fail", + detail: "30051/tcp is not published on the host", + hint: "gateway traffic will not reach OpenShell until the container is recreated with a host port", + }); + } + return checks; +} + +function findSandboxListLine(output: string, sandboxName: string): string | null { + const lines = stripAnsi(output).split(/\r?\n/); + return ( + lines.find((line: string) => { + const columns = line.trim().split(/\s+/); + return columns.includes(sandboxName); + }) || null + ); +} + +function inferSandboxReadyFromLine(line: string | null): boolean | null { + if (!line) return null; + if (/\bReady\b/i.test(line)) return true; + if (/\b(Failed|Error|CrashLoopBackOff|ImagePullBackOff|Unknown|Evicted)\b/i.test(line)) { + return false; + } + return null; +} + +function stoppedCloudflaredCheck(): DoctorCheck { + return { + group: "Local services", + label: "cloudflared", + status: "info", + detail: "stopped", + hint: `start when needed with \`${CLI_NAME} tunnel start\``, + }; +} + +function staleCloudflaredPidFileCheck(): DoctorCheck { + return { + group: "Local services", + label: "cloudflared", + status: "warn", + detail: "stale PID file", + hint: `run \`${CLI_NAME} tunnel stop\` and start it again if you need a public tunnel`, + }; +} + +function staleCloudflaredPidCheck(pid: number): DoctorCheck { + return { + group: "Local services", + label: "cloudflared", + status: "warn", + detail: `stale PID ${pid}`, + hint: `run \`${CLI_NAME} tunnel stop\` to clean up the service state`, + }; +} + +function readCloudflaredPidFile(pidFile: string): string | null { + try { + return fs.readFileSync(pidFile, "utf-8").trim(); + } catch (error) { + if (isErrnoException(error) && error.code === "ENOENT") { + return null; + } + throw error; + } +} + +function commandLineNamesCloudflared(commandLine: string): boolean { + return commandLine + .split(/\0|\s+/) + .filter(Boolean) + .some((token) => path.basename(token) === "cloudflared"); +} + +function readProcessCommandLine(pid: number): string | null { + if (process.platform === "win32") { + return null; + } + try { + return fs.readFileSync(`/proc/${pid}/cmdline`, "utf-8"); + } catch { + try { + return execFileSync("ps", ["-p", String(pid), "-o", "comm=", "-o", "args="], { + encoding: "utf-8", + stdio: ["ignore", "pipe", "ignore"], + timeout: 1000, + }); + } catch { + return null; + } + } +} + +function isCloudflaredProcess(pid: number): boolean { + const commandLine = readProcessCommandLine(pid); + if (commandLine === null) { + return false; + } + return commandLineNamesCloudflared(commandLine); +} + +function cloudflaredDoctorCheck(sandboxName: string): DoctorCheck { + const pidFile = path.join(`/tmp/nemoclaw-services-${sandboxName}`, "cloudflared.pid"); + if (!fs.existsSync(pidFile)) { + return stoppedCloudflaredCheck(); + } + const rawPid = readCloudflaredPidFile(pidFile); + if (rawPid === null) { + return stoppedCloudflaredCheck(); + } + const pid = Number(rawPid); + if (!Number.isFinite(pid) || pid <= 0) { + return staleCloudflaredPidFileCheck(); + } + try { + process.kill(pid, 0); + if (!isCloudflaredProcess(pid)) { + return staleCloudflaredPidCheck(pid); + } + return { + group: "Local services", + label: "cloudflared", + status: "ok", + detail: `running (PID ${pid})`, + }; + } catch { + return staleCloudflaredPidCheck(pid); + } +} + +function ollamaDoctorCheck(currentProvider: string): DoctorCheck { + const endpoint = `http://127.0.0.1:${OLLAMA_PORT}/api/tags`; + const result = captureHostCommand( + "curl", + ["-sS", "--connect-timeout", "2", "--max-time", "4", endpoint], + 6000, + ); + const required = currentProvider === "ollama-local"; + if (result.status !== 0) { + return { + group: "Local services", + label: "Ollama", + status: required ? "fail" : "info", + detail: `not reachable at ${endpoint}`, + hint: required ? "start Ollama or change the sandbox inference provider" : undefined, + }; + } + + let modelCount = "unknown model count"; + try { + const parsed = JSON.parse(result.stdout); + if (Array.isArray(parsed.models)) { + modelCount = `${parsed.models.length} model(s)`; + } + } catch { + /* keep generic detail */ + } + return { + group: "Local services", + label: "Ollama", + status: "ok", + detail: `reachable at ${endpoint} (${modelCount})`, + }; +} + +function messagingDoctorCheck(sandboxName: string, sb: SandboxEntry): DoctorCheck { + const registeredChannels = Array.isArray(sb.messagingChannels) ? sb.messagingChannels : []; + const disabledChannels = new Set(Array.isArray(sb.disabledChannels) ? sb.disabledChannels : []); + const channels = registeredChannels.filter((channel: string) => !disabledChannels.has(channel)); + const pausedChannels = registeredChannels.filter((channel: string) => + disabledChannels.has(channel), + ); + if (registeredChannels.length === 0) { + return { + group: "Messaging", + label: "Channels", + status: "info", + detail: "no messaging channels registered", + }; + } + + if (channels.length === 0) { + return { + group: "Messaging", + label: "Channels", + status: "info", + detail: `all messaging channels paused (${pausedChannels.join(", ")})`, + hint: `run \`${CLI_NAME} ${sandboxName} channels start \` to re-enable one`, + }; + } + + const degraded = + buildStatusCommandDeps(ROOT).checkMessagingBridgeHealth?.(sandboxName, channels) || []; + const pausedSuffix = + pausedChannels.length > 0 ? `; paused channels skipped: ${pausedChannels.join(", ")}` : ""; + if (degraded.length === 0) { + return { + group: "Messaging", + label: "Channels", + status: "ok", + detail: `${channels.join(", ")} enabled; no recent conflict signatures${pausedSuffix}`, + }; + } + + return { + group: "Messaging", + label: "Channels", + status: "warn", + detail: + degraded + .map( + (item: { channel: string; conflicts: number }) => + `${item.channel}: ${item.conflicts} conflict(s)`, + ) + .join("; ") + pausedSuffix, + hint: `run \`${CLI_NAME} ${sandboxName} logs --follow\` for enabled bridge details`, + }; +} + +// eslint-disable-next-line complexity +export async function runSandboxDoctor(sandboxName: string, args: string[] = []): Promise { + const asJson = args.includes("--json"); + const helpRequested = args.includes("--help") || args.includes("-h"); + const unknown = args.filter((arg) => !["--json", "--help", "-h"].includes(arg)); + if (helpRequested) { + console.log(` Usage: ${CLI_NAME} doctor [--json]`); + return; + } + if (unknown.length > 0) { + console.error(` Unknown doctor argument${unknown.length === 1 ? "" : "s"}: ${unknown.join(" ")}`); + console.error(` Usage: ${CLI_NAME} doctor [--json]`); + process.exit(1); + } + + const sb = registry.getSandbox(sandboxName); + const checks: DoctorCheck[] = []; + + checks.push({ + group: "Host", + label: "CLI build", + status: fs.existsSync(path.join(ROOT, "dist", "nemoclaw.js")) ? "ok" : "fail", + detail: fs.existsSync(path.join(ROOT, "dist", "nemoclaw.js")) + ? "dist/nemoclaw.js present" + : "dist/nemoclaw.js missing", + hint: fs.existsSync(path.join(ROOT, "dist", "nemoclaw.js")) + ? undefined + : "run `npm run build:cli`", + }); + + const dockerInfo = captureHostCommand("docker", ["info", "--format", "{{.ServerVersion}}"], 8000); + checks.push({ + group: "Host", + label: "Docker daemon", + status: dockerInfo.status === 0 ? "ok" : "fail", + detail: + dockerInfo.status === 0 + ? `server ${dockerInfo.stdout.trim() || "unknown"}` + : oneLine(dockerInfo.stderr || dockerInfo.error?.message || "docker info failed"), + hint: + dockerInfo.status === 0 + ? undefined + : "start Docker and verify your user can access the daemon", + }); + + const openshellBin = resolveOpenshell(); + checks.push({ + group: "Host", + label: "OpenShell CLI", + status: openshellBin ? "ok" : "fail", + detail: openshellBin || "not found on PATH", + hint: openshellBin ? undefined : "install OpenShell before using sandbox commands", + }); + + checks.push(...dockerInspectGateway(`openshell-cluster-${NEMOCLAW_GATEWAY_NAME}`)); + + let openshellConnected = false; + if (openshellBin) { + const recovery = await recoverNamedGatewayRuntime(); + const lifecycle = recovery.after || recovery.before; + const cleanStatus = stripAnsi(lifecycle?.status || ""); + openshellConnected = lifecycle?.state === "healthy_named"; + checks.push({ + group: "Gateway", + label: "OpenShell status", + status: openshellConnected ? "ok" : "fail", + detail: openshellConnected + ? "connected to nemoclaw" + : oneLine(cleanStatus || lifecycle?.gatewayInfo || "not connected to nemoclaw"), + hint: openshellConnected ? undefined : "run `openshell gateway select nemoclaw` and retry", + }); + } + + if (openshellBin && openshellConnected) { + const list = captureOpenshell(["sandbox", "list"], { + ignoreError: true, + timeout: OPENSHELL_PROBE_TIMEOUT_MS, + }); + const liveNames = parseLiveSandboxNames(list.output || ""); + const present = list.status === 0 && liveNames.has(sandboxName); + const line = findSandboxListLine(list.output || "", sandboxName); + const ready = inferSandboxReadyFromLine(line); + checks.push({ + group: "Sandbox", + label: "Live sandbox", + status: present && ready === true ? "ok" : "fail", + detail: present + ? ready === true + ? `${sandboxName} present (Ready)` + : `${sandboxName} present${line ? ` (${oneLine(line)})` : ""}` + : `${sandboxName} not present in live OpenShell sandbox list`, + hint: present + ? ready === true + ? undefined + : `run \`${CLI_NAME} ${sandboxName} status\` or \`${CLI_NAME} ${sandboxName} logs --follow\`` + : `run \`${CLI_NAME} ${sandboxName} status\` or recreate with \`${CLI_NAME} onboard\``, + }); + } else if (openshellBin) { + checks.push({ + group: "Sandbox", + label: "Live sandbox", + status: "fail", + detail: "skipped because the nemoclaw gateway is not connected", + hint: "fix the gateway check above before trusting sandbox readiness", + }); + } + + const live = + openshellBin && openshellConnected + ? parseGatewayInference( + captureOpenshell(["inference", "get"], { + ignoreError: true, + timeout: OPENSHELL_PROBE_TIMEOUT_MS, + }).output, + ) + : null; + const currentModel = (live && live.model) || (sb && sb.model) || "unknown"; + const currentProvider = (live && live.provider) || (sb && sb.provider) || "unknown"; + checks.push({ + group: "Inference", + label: "Route", + status: currentProvider !== "unknown" || currentModel !== "unknown" ? "ok" : "warn", + detail: `${currentProvider} / ${currentModel}`, + hint: + currentProvider !== "unknown" || currentModel !== "unknown" + ? undefined + : `run \`${CLI_NAME} ${sandboxName} status\` after the gateway is healthy`, + }); + + if (typeof currentProvider === "string" && currentProvider !== "unknown") { + const inferenceHealth = probeProviderHealth(currentProvider); + if (!inferenceHealth) { + checks.push({ + group: "Inference", + label: "Provider health", + status: "info", + detail: `no health probe registered for ${currentProvider}`, + }); + } else if (!inferenceHealth.probed) { + checks.push({ + group: "Inference", + label: "Provider health", + status: "info", + detail: inferenceHealth.detail, + }); + } else { + checks.push({ + group: "Inference", + label: "Provider health", + status: inferenceHealth.ok ? "ok" : "fail", + detail: inferenceHealth.ok + ? `${inferenceHealth.endpoint} reachable` + : inferenceHealth.detail, + hint: inferenceHealth.ok ? undefined : "check network access or provider credentials", + }); + } + } + + if (sb) { + try { + const versionCheck = sandboxVersion.checkAgentVersion(sandboxName); + const agent = agentRuntime.getSessionAgent(sandboxName); + const agentName = agentRuntime.getAgentDisplayName(agent); + if (versionCheck.isStale) { + checks.push({ + group: "Sandbox", + label: "Agent version", + status: "warn", + detail: `${agentName} v${versionCheck.sandboxVersion || "unknown"}; v${versionCheck.expectedVersion} available`, + hint: `run \`${CLI_NAME} ${sandboxName} rebuild\``, + }); + } else if (versionCheck.sandboxVersion) { + checks.push({ + group: "Sandbox", + label: "Agent version", + status: "ok", + detail: `${agentName} v${versionCheck.sandboxVersion}`, + }); + } else { + checks.push({ + group: "Sandbox", + label: "Agent version", + status: "info", + detail: "could not detect version", + }); + } + } catch { + checks.push({ + group: "Sandbox", + label: "Agent version", + status: "info", + detail: "version check unavailable", + }); + } + + checks.push({ + group: "Sandbox", + label: "Shields", + status: shields.isShieldsDown(sandboxName) ? "warn" : "ok", + detail: shields.isShieldsDown(sandboxName) ? "down" : "up", + hint: shields.isShieldsDown(sandboxName) + ? `run \`${CLI_NAME} ${sandboxName} shields status\` for details` + : undefined, + }); + checks.push(messagingDoctorCheck(sandboxName, sb)); + } + + checks.push(ollamaDoctorCheck(currentProvider)); + checks.push(cloudflaredDoctorCheck(sandboxName)); + + const exitCode = renderDoctorReport(sandboxName, checks, asJson); + if (exitCode !== 0) process.exit(exitCode); +} diff --git a/src/lib/sandbox-doctor-cli-command.ts b/src/lib/sandbox-doctor-cli-command.ts new file mode 100644 index 0000000000..9b03c3defb --- /dev/null +++ b/src/lib/sandbox-doctor-cli-command.ts @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/* v8 ignore start -- thin oclif adapter covered through CLI integration tests. */ + +import { Command } from "@oclif/core"; + +import { runSandboxDoctor } from "./sandbox-doctor-action"; + +export default class SandboxDoctorCliCommand extends Command { + static id = "sandbox:doctor"; + static strict = false; + static summary = "Diagnose sandbox and gateway health"; + static description = "Run host, gateway, sandbox, inference, messaging, and local service diagnostics."; + static usage = [" doctor [--json]"]; + + public async run(): Promise { + const [sandboxName, ...actionArgs] = this.argv; + await runSandboxDoctor(sandboxName, actionArgs); + } +} diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index 012a303bd2..0a5f7360f7 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -1,10 +1,10 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -const { execFileSync, spawn, spawnSync } = require("child_process"); +const { spawn, spawnSync } = require("child_process"); const path = require("path"); const fs = require("fs"); -const { DASHBOARD_PORT, GATEWAY_PORT, OLLAMA_PORT } = require("./lib/ports"); +const { DASHBOARD_PORT } = require("./lib/ports"); // --------------------------------------------------------------------------- // Color / style — respects NO_COLOR and non-TTY environments. @@ -41,10 +41,7 @@ const registry = require("./lib/registry"); import type { SandboxEntry } from "./lib/registry"; const nim = require("./lib/nim"); const shields = require("./lib/shields"); -const { parseGatewayInference } = require("./lib/inference-config"); const policies = require("./lib/policies"); -const { probeProviderHealth } = require("./lib/inference-health"); -const { buildStatusCommandDeps } = require("./lib/status-command-deps"); const { help, version } = require("./lib/root-help-action"); const onboardSession = require("./lib/onboard-session"); import type { Session } from "./lib/onboard-session"; @@ -55,7 +52,6 @@ const { getInstalledOpenshellVersionOrNull, runOpenshell, } = require("./lib/openshell-runtime"); -const { recoverNamedGatewayRuntime } = require("./lib/gateway-runtime-action"); const { recoverRegistryEntries } = require("./lib/registry-recovery-action"); const { ensureLiveSandboxOrExit } = require("./lib/sandbox-gateway-state-action"); const { @@ -65,7 +61,6 @@ const { } = require("./lib/sandbox-connect-action"); const { executeSandboxCommand } = require("./lib/sandbox-process-recovery-action"); const { runRegisteredOclifCommand } = require("./lib/oclif-runner"); -const { isErrnoException }: typeof import("./lib/errno") = require("./lib/errno"); const agentRuntime = require("../bin/lib/agent-runtime"); const sandboxVersion = require("./lib/sandbox-version"); const sandboxState = require("./lib/sandbox-state"); @@ -111,16 +106,6 @@ const DASHBOARD_FORWARD_PORT = String(DASHBOARD_PORT); const DEFAULT_LOGS_PROBE_TIMEOUT_MS = 5000; const LOGS_PROBE_TIMEOUT_ENV = "NEMOCLAW_LOGS_PROBE_TIMEOUT_MS"; -type DoctorStatus = "ok" | "warn" | "fail" | "info"; - -type DoctorCheck = { - group: string; - label: string; - status: DoctorStatus; - detail: string; - hint?: string; -}; - type CommandCapture = { status: number; stdout: string; @@ -197,592 +182,6 @@ function printSandboxActionUsage(action: string): void { console.log(` Usage: ${CLI_NAME} ${action}`); } -function captureHostCommand( - command: string, - args: string[], - timeout = 5000, -): CommandCapture { - const result = spawnSync(command, args, { - cwd: ROOT, - env: process.env, - encoding: "utf-8", - stdio: ["ignore", "pipe", "pipe"], - timeout, - }); - return { - status: result.status ?? (result.error ? 1 : 0), - stdout: String(result.stdout || ""), - stderr: String(result.stderr || ""), - error: result.error, - }; -} - -function oneLine(value = ""): string { - return String(value).replace(/\s+/g, " ").trim(); -} - -function doctorSummary(checks: DoctorCheck[]): { status: DoctorStatus; failed: number; warned: number } { - const failed = checks.filter((check) => check.status === "fail").length; - const warned = checks.filter((check) => check.status === "warn").length; - if (failed > 0) return { status: "fail", failed, warned }; - if (warned > 0) return { status: "warn", failed, warned }; - return { status: "ok", failed, warned }; -} - -function doctorStatusLabel(status: DoctorStatus): string { - switch (status) { - case "ok": - return `${G}[ok]${R}`; - case "warn": - return `${YW}[warn]${R}`; - case "fail": - return `${_RD}[fail]${R}`; - case "info": - return `${D}[info]${R}`; - default: - return `[${status}]`; - } -} - -function renderDoctorReport(sandboxName: string, checks: DoctorCheck[], asJson: boolean): number { - const summary = doctorSummary(checks); - if (asJson) { - console.log( - JSON.stringify( - { - schemaVersion: 1, - sandbox: sandboxName, - status: summary.status, - failed: summary.failed, - warnings: summary.warned, - checks, - }, - null, - 2, - ), - ); - return summary.failed > 0 ? 1 : 0; - } - - console.log(""); - console.log(` ${B}${CLI_DISPLAY_NAME} doctor:${R} ${sandboxName}`); - const groupOrder = ["Host", "Gateway", "Sandbox", "Inference", "Messaging", "Local services"]; - const orderedGroups = [ - ...groupOrder, - ...checks - .map((check) => check.group) - .filter((group, index, all) => !groupOrder.includes(group) && all.indexOf(group) === index), - ]; - for (const group of orderedGroups) { - const groupChecks = checks.filter((check) => check.group === group); - if (groupChecks.length === 0) continue; - console.log(""); - console.log(` ${G}${group}:${R}`); - for (const check of groupChecks) { - console.log(` ${doctorStatusLabel(check.status)} ${check.label}: ${check.detail}`); - if (check.hint) { - console.log(` ${D}hint: ${check.hint}${R}`); - } - } - } - - console.log(""); - if (summary.status === "ok") { - console.log(` Summary: ${G}healthy${R}`); - } else if (summary.status === "warn") { - console.log(` Summary: ${YW}healthy with ${summary.warned} warning(s)${R}`); - } else { - console.log( - ` Summary: ${_RD}attention needed${R} (${summary.failed} failed, ${summary.warned} warning(s))`, - ); - } - console.log(""); - return summary.failed > 0 ? 1 : 0; -} - -function dockerInspectGateway(containerName: string): DoctorCheck[] { - const checks: DoctorCheck[] = []; - const inspect = captureHostCommand( - "docker", - [ - "inspect", - "--format", - "{{.State.Running}}\t{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}\t{{.Config.Image}}", - containerName, - ], - 5000, - ); - if (inspect.status !== 0) { - checks.push({ - group: "Gateway", - label: "Docker container", - status: "fail", - detail: `${containerName} not found or not inspectable`, - hint: "run `docker ps --filter name=openshell-cluster-nemoclaw`", - }); - return checks; - } - - const [runningRaw, healthRaw, imageRaw] = inspect.stdout.trim().split("\t"); - const running = runningRaw === "true"; - const health = healthRaw || "none"; - const image = imageRaw || "unknown"; - const healthOk = health === "healthy" || health === "none"; - checks.push({ - group: "Gateway", - label: "Docker container", - status: running && healthOk ? "ok" : "fail", - detail: `${containerName} ${running ? "running" : "stopped"} (${health}; ${image})`, - hint: running ? undefined : "restart the gateway with `openshell gateway start --name nemoclaw`", - }); - - const port = captureHostCommand("docker", ["port", containerName, "30051/tcp"], 5000); - if (port.status === 0 && port.stdout.trim()) { - const mapping = oneLine(port.stdout); - checks.push({ - group: "Gateway", - label: "Port mapping", - status: mapping.includes(`:${GATEWAY_PORT}`) ? "ok" : "warn", - detail: mapping, - hint: mapping.includes(`:${GATEWAY_PORT}`) - ? undefined - : `expected host port ${GATEWAY_PORT} from NEMOCLAW_GATEWAY_PORT`, - }); - } else { - checks.push({ - group: "Gateway", - label: "Port mapping", - status: "fail", - detail: "30051/tcp is not published on the host", - hint: "gateway traffic will not reach OpenShell until the container is recreated with a host port", - }); - } - return checks; -} - -function findSandboxListLine(output: string, sandboxName: string): string | null { - const lines = stripAnsi(output).split(/\r?\n/); - return ( - lines.find((line: string) => { - const columns = line.trim().split(/\s+/); - return columns.includes(sandboxName); - }) || null - ); -} - -function inferSandboxReadyFromLine(line: string | null): boolean | null { - if (!line) return null; - if (/\bReady\b/i.test(line)) return true; - if (/\b(Failed|Error|CrashLoopBackOff|ImagePullBackOff|Unknown|Evicted)\b/i.test(line)) { - return false; - } - return null; -} - -function stoppedCloudflaredCheck(): DoctorCheck { - return { - group: "Local services", - label: "cloudflared", - status: "info", - detail: "stopped", - hint: `start when needed with \`${CLI_NAME} tunnel start\``, - }; -} - -function staleCloudflaredPidFileCheck(): DoctorCheck { - return { - group: "Local services", - label: "cloudflared", - status: "warn", - detail: "stale PID file", - hint: `run \`${CLI_NAME} tunnel stop\` and start it again if you need a public tunnel`, - }; -} - -function staleCloudflaredPidCheck(pid: number): DoctorCheck { - return { - group: "Local services", - label: "cloudflared", - status: "warn", - detail: `stale PID ${pid}`, - hint: `run \`${CLI_NAME} tunnel stop\` to clean up the service state`, - }; -} - -function readCloudflaredPidFile(pidFile: string): string | null { - try { - return fs.readFileSync(pidFile, "utf-8").trim(); - } catch (error) { - if (isErrnoException(error) && error.code === "ENOENT") { - return null; - } - throw error; - } -} - -function commandLineNamesCloudflared(commandLine: string): boolean { - return commandLine - .split(/\0|\s+/) - .filter(Boolean) - .some((token) => path.basename(token) === "cloudflared"); -} - -function readProcessCommandLine(pid: number): string | null { - if (process.platform === "win32") { - return null; - } - try { - return fs.readFileSync(`/proc/${pid}/cmdline`, "utf-8"); - } catch { - try { - return execFileSync( - "ps", - ["-p", String(pid), "-o", "comm=", "-o", "args="], - { - encoding: "utf-8", - stdio: ["ignore", "pipe", "ignore"], - timeout: 1000, - }, - ); - } catch { - return null; - } - } -} - -function isCloudflaredProcess(pid: number): boolean { - const commandLine = readProcessCommandLine(pid); - if (commandLine === null) { - return false; - } - return commandLineNamesCloudflared(commandLine); -} - -function cloudflaredDoctorCheck(sandboxName: string): DoctorCheck { - const pidFile = path.join(`/tmp/nemoclaw-services-${sandboxName}`, "cloudflared.pid"); - if (!fs.existsSync(pidFile)) { - return stoppedCloudflaredCheck(); - } - const rawPid = readCloudflaredPidFile(pidFile); - if (rawPid === null) { - return stoppedCloudflaredCheck(); - } - const pid = Number(rawPid); - if (!Number.isFinite(pid) || pid <= 0) { - return staleCloudflaredPidFileCheck(); - } - try { - process.kill(pid, 0); - if (!isCloudflaredProcess(pid)) { - return staleCloudflaredPidCheck(pid); - } - return { - group: "Local services", - label: "cloudflared", - status: "ok", - detail: `running (PID ${pid})`, - }; - } catch { - return staleCloudflaredPidCheck(pid); - } -} - -function ollamaDoctorCheck(currentProvider: string): DoctorCheck { - const endpoint = `http://127.0.0.1:${OLLAMA_PORT}/api/tags`; - const result = captureHostCommand( - "curl", - ["-sS", "--connect-timeout", "2", "--max-time", "4", endpoint], - 6000, - ); - const required = currentProvider === "ollama-local"; - if (result.status !== 0) { - return { - group: "Local services", - label: "Ollama", - status: required ? "fail" : "info", - detail: `not reachable at ${endpoint}`, - hint: required ? "start Ollama or change the sandbox inference provider" : undefined, - }; - } - - let modelCount = "unknown model count"; - try { - const parsed = JSON.parse(result.stdout); - if (Array.isArray(parsed.models)) { - modelCount = `${parsed.models.length} model(s)`; - } - } catch { - /* keep generic detail */ - } - return { - group: "Local services", - label: "Ollama", - status: "ok", - detail: `reachable at ${endpoint} (${modelCount})`, - }; -} - -function messagingDoctorCheck(sandboxName: string, sb: SandboxEntry): DoctorCheck { - const registeredChannels = Array.isArray(sb.messagingChannels) ? sb.messagingChannels : []; - const disabledChannels = new Set(Array.isArray(sb.disabledChannels) ? sb.disabledChannels : []); - const channels = registeredChannels.filter((channel: string) => !disabledChannels.has(channel)); - const pausedChannels = registeredChannels.filter((channel: string) => - disabledChannels.has(channel), - ); - if (registeredChannels.length === 0) { - return { - group: "Messaging", - label: "Channels", - status: "info", - detail: "no messaging channels registered", - }; - } - - if (channels.length === 0) { - return { - group: "Messaging", - label: "Channels", - status: "info", - detail: `all messaging channels paused (${pausedChannels.join(", ")})`, - hint: `run \`${CLI_NAME} ${sandboxName} channels start \` to re-enable one`, - }; - } - - const degraded = buildStatusCommandDeps(ROOT).checkMessagingBridgeHealth?.(sandboxName, channels) || []; - const pausedSuffix = - pausedChannels.length > 0 ? `; paused channels skipped: ${pausedChannels.join(", ")}` : ""; - if (degraded.length === 0) { - return { - group: "Messaging", - label: "Channels", - status: "ok", - detail: `${channels.join(", ")} enabled; no recent conflict signatures${pausedSuffix}`, - }; - } - - return { - group: "Messaging", - label: "Channels", - status: "warn", - detail: - degraded - .map( - (item: { channel: string; conflicts: number }) => - `${item.channel}: ${item.conflicts} conflict(s)`, - ) - .join("; ") + pausedSuffix, - hint: `run \`${CLI_NAME} ${sandboxName} logs --follow\` for enabled bridge details`, - }; -} - -// eslint-disable-next-line complexity -async function sandboxDoctor(sandboxName: string, args: string[] = []): Promise { - const asJson = args.includes("--json"); - const helpRequested = args.includes("--help") || args.includes("-h"); - const unknown = args.filter((arg) => !["--json", "--help", "-h"].includes(arg)); - if (helpRequested) { - console.log(` Usage: ${CLI_NAME} doctor [--json]`); - return; - } - if (unknown.length > 0) { - console.error(` Unknown doctor argument${unknown.length === 1 ? "" : "s"}: ${unknown.join(" ")}`); - console.error(` Usage: ${CLI_NAME} doctor [--json]`); - process.exit(1); - } - - const sb = registry.getSandbox(sandboxName); - const checks: DoctorCheck[] = []; - - checks.push({ - group: "Host", - label: "CLI build", - status: fs.existsSync(path.join(ROOT, "dist", "nemoclaw.js")) ? "ok" : "fail", - detail: fs.existsSync(path.join(ROOT, "dist", "nemoclaw.js")) - ? "dist/nemoclaw.js present" - : "dist/nemoclaw.js missing", - hint: fs.existsSync(path.join(ROOT, "dist", "nemoclaw.js")) ? undefined : "run `npm run build:cli`", - }); - - const dockerInfo = captureHostCommand("docker", ["info", "--format", "{{.ServerVersion}}"], 8000); - checks.push({ - group: "Host", - label: "Docker daemon", - status: dockerInfo.status === 0 ? "ok" : "fail", - detail: - dockerInfo.status === 0 - ? `server ${dockerInfo.stdout.trim() || "unknown"}` - : oneLine(dockerInfo.stderr || dockerInfo.error?.message || "docker info failed"), - hint: dockerInfo.status === 0 ? undefined : "start Docker and verify your user can access the daemon", - }); - - const openshellBin = resolveOpenshell(); - checks.push({ - group: "Host", - label: "OpenShell CLI", - status: openshellBin ? "ok" : "fail", - detail: openshellBin || "not found on PATH", - hint: openshellBin ? undefined : "install OpenShell before using sandbox commands", - }); - - checks.push(...dockerInspectGateway(`openshell-cluster-${NEMOCLAW_GATEWAY_NAME}`)); - - let openshellConnected = false; - if (openshellBin) { - const recovery = await recoverNamedGatewayRuntime(); - const lifecycle = recovery.after || recovery.before; - const cleanStatus = stripAnsi(lifecycle?.status || ""); - openshellConnected = lifecycle?.state === "healthy_named"; - checks.push({ - group: "Gateway", - label: "OpenShell status", - status: openshellConnected ? "ok" : "fail", - detail: openshellConnected - ? "connected to nemoclaw" - : oneLine(cleanStatus || lifecycle?.gatewayInfo || "not connected to nemoclaw"), - hint: openshellConnected ? undefined : "run `openshell gateway select nemoclaw` and retry", - }); - } - - if (openshellBin && openshellConnected) { - const list = captureOpenshell(["sandbox", "list"], { - ignoreError: true, - timeout: OPENSHELL_PROBE_TIMEOUT_MS, - }); - const liveNames = parseLiveSandboxNames(list.output || ""); - const present = list.status === 0 && liveNames.has(sandboxName); - const line = findSandboxListLine(list.output || "", sandboxName); - const ready = inferSandboxReadyFromLine(line); - checks.push({ - group: "Sandbox", - label: "Live sandbox", - status: present && ready === true ? "ok" : "fail", - detail: present - ? ready === true - ? `${sandboxName} present (Ready)` - : `${sandboxName} present${line ? ` (${oneLine(line)})` : ""}` - : `${sandboxName} not present in live OpenShell sandbox list`, - hint: present - ? ready === true - ? undefined - : `run \`${CLI_NAME} ${sandboxName} status\` or \`${CLI_NAME} ${sandboxName} logs --follow\`` - : `run \`${CLI_NAME} ${sandboxName} status\` or recreate with \`${CLI_NAME} onboard\``, - }); - } else if (openshellBin) { - checks.push({ - group: "Sandbox", - label: "Live sandbox", - status: "fail", - detail: "skipped because the nemoclaw gateway is not connected", - hint: "fix the gateway check above before trusting sandbox readiness", - }); - } - - const live = openshellBin && openshellConnected - ? parseGatewayInference( - captureOpenshell(["inference", "get"], { - ignoreError: true, - timeout: OPENSHELL_PROBE_TIMEOUT_MS, - }).output, - ) - : null; - const currentModel = (live && live.model) || (sb && sb.model) || "unknown"; - const currentProvider = (live && live.provider) || (sb && sb.provider) || "unknown"; - checks.push({ - group: "Inference", - label: "Route", - status: currentProvider !== "unknown" || currentModel !== "unknown" ? "ok" : "warn", - detail: `${currentProvider} / ${currentModel}`, - hint: - currentProvider !== "unknown" || currentModel !== "unknown" - ? undefined - : `run \`${CLI_NAME} ${sandboxName} status\` after the gateway is healthy`, - }); - - if (typeof currentProvider === "string" && currentProvider !== "unknown") { - const inferenceHealth = probeProviderHealth(currentProvider); - if (!inferenceHealth) { - checks.push({ - group: "Inference", - label: "Provider health", - status: "info", - detail: `no health probe registered for ${currentProvider}`, - }); - } else if (!inferenceHealth.probed) { - checks.push({ - group: "Inference", - label: "Provider health", - status: "info", - detail: inferenceHealth.detail, - }); - } else { - checks.push({ - group: "Inference", - label: "Provider health", - status: inferenceHealth.ok ? "ok" : "fail", - detail: inferenceHealth.ok - ? `${inferenceHealth.endpoint} reachable` - : inferenceHealth.detail, - hint: inferenceHealth.ok ? undefined : "check network access or provider credentials", - }); - } - } - - if (sb) { - try { - const versionCheck = sandboxVersion.checkAgentVersion(sandboxName); - const agent = agentRuntime.getSessionAgent(sandboxName); - const agentName = agentRuntime.getAgentDisplayName(agent); - if (versionCheck.isStale) { - checks.push({ - group: "Sandbox", - label: "Agent version", - status: "warn", - detail: `${agentName} v${versionCheck.sandboxVersion || "unknown"}; v${versionCheck.expectedVersion} available`, - hint: `run \`${CLI_NAME} ${sandboxName} rebuild\``, - }); - } else if (versionCheck.sandboxVersion) { - checks.push({ - group: "Sandbox", - label: "Agent version", - status: "ok", - detail: `${agentName} v${versionCheck.sandboxVersion}`, - }); - } else { - checks.push({ - group: "Sandbox", - label: "Agent version", - status: "info", - detail: "could not detect version", - }); - } - } catch { - checks.push({ - group: "Sandbox", - label: "Agent version", - status: "info", - detail: "version check unavailable", - }); - } - - checks.push({ - group: "Sandbox", - label: "Shields", - status: shields.isShieldsDown(sandboxName) ? "warn" : "ok", - detail: shields.isShieldsDown(sandboxName) ? "down" : "up", - hint: shields.isShieldsDown(sandboxName) - ? `run \`${CLI_NAME} ${sandboxName} shields status\` for details` - : undefined, - }); - checks.push(messagingDoctorCheck(sandboxName, sb)); - } - - checks.push(ollamaDoctorCheck(currentProvider)); - checks.push(cloudflaredDoctorCheck(sandboxName)); - - const exitCode = renderDoctorReport(sandboxName, checks, asJson); - if (exitCode !== 0) process.exit(exitCode); -} - function cleanupSandboxServices( sandboxName: string, { stopHostServices = false }: { stopHostServices?: boolean } = {}, @@ -1629,9 +1028,6 @@ async function runDispatchResult( throw new Error(`Missing sandbox name for legacy dispatch target ${result.target}`); } switch (result.target) { - case "doctor": - await sandboxDoctor(sandboxName, actionArgs); - return; case "policy-add": { const { addSandboxPolicy } = require("./lib/policy-channel-actions") as { addSandboxPolicy: (sandboxName: string, args?: string[]) => Promise; From 89085215c2e629c6724155f3ddf667f753d39994 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 19:21:26 -0700 Subject: [PATCH 06/65] refactor(cli): extract sandbox destroy action --- src/lib/nemoclaw-runtime-bridge.ts | 1 - src/lib/sandbox-destroy-action.ts | 269 +++++++++++++++++++++++++++++ src/lib/sandbox-runtime-actions.ts | 5 +- src/nemoclaw.ts | 214 +---------------------- test/image-cleanup.test.ts | 92 +++++----- 5 files changed, 328 insertions(+), 253 deletions(-) create mode 100644 src/lib/sandbox-destroy-action.ts diff --git a/src/lib/nemoclaw-runtime-bridge.ts b/src/lib/nemoclaw-runtime-bridge.ts index 6f73af71ab..821632ca3c 100644 --- a/src/lib/nemoclaw-runtime-bridge.ts +++ b/src/lib/nemoclaw-runtime-bridge.ts @@ -4,7 +4,6 @@ /* v8 ignore start -- transitional bridge until command actions are extracted from src/nemoclaw.ts. */ export interface NemoClawRuntimeBridge { - sandboxDestroy: (sandboxName: string, args?: string[]) => Promise; sandboxRebuild: (sandboxName: string, args?: string[]) => Promise; upgradeSandboxes: (args?: string[]) => Promise; } diff --git a/src/lib/sandbox-destroy-action.ts b/src/lib/sandbox-destroy-action.ts new file mode 100644 index 0000000000..869cd8c19b --- /dev/null +++ b/src/lib/sandbox-destroy-action.ts @@ -0,0 +1,269 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/* v8 ignore start -- exercised through CLI subprocess destroy/rebuild tests. */ + +import fs from "node:fs"; + +import { CLI_NAME } from "./branding"; +import { prompt as askPrompt } from "./credentials"; +import * as onboardSession from "./onboard-session"; +import type { Session } from "./onboard-session"; +import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts"; +import { DASHBOARD_PORT } from "./ports"; +import * as registry from "./registry"; +import { resolveOpenshell } from "./resolve-openshell"; +import { parseLiveSandboxNames } from "./runtime-recovery"; +import { + createSystemDeps as createSessionDeps, + getActiveSandboxSessions, +} from "./sandbox-session-state"; +import { stripAnsi } from "./openshell"; +import { G, R, YW } from "./terminal-style"; + +type SpawnLikeResult = { + status: number | null; + stdout?: string; + stderr?: string; +}; + +type DockerRmi = (tag: string, opts?: { ignoreError?: boolean }) => { status: number | null }; + +type RemoveSandboxImageDeps = { + getSandbox?: typeof registry.getSandbox; + dockerRmi?: DockerRmi; +}; + +type RemoveSandboxRegistryEntryDeps = { + removeImage?: (sandboxName: string) => void; + removeSandbox?: typeof registry.removeSandbox; +}; + +const NEMOCLAW_GATEWAY_NAME = "nemoclaw"; +const DASHBOARD_FORWARD_PORT = String(DASHBOARD_PORT); + +function cleanupGatewayAfterLastSandbox(): void { + const { runOpenshell } = require("./openshell-runtime") as { + runOpenshell: (args: string[], opts?: Record) => { status: number | null }; + }; + const { dockerRemoveVolumesByPrefix } = require("./docker") as { + dockerRemoveVolumesByPrefix: (prefix: string, opts?: { ignoreError?: boolean }) => void; + }; + + runOpenshell(["forward", "stop", DASHBOARD_FORWARD_PORT], { + ignoreError: true, + stdio: ["ignore", "ignore", "ignore"], + }); + runOpenshell(["gateway", "destroy", "-g", NEMOCLAW_GATEWAY_NAME], { ignoreError: true }); + dockerRemoveVolumesByPrefix(`openshell-cluster-${NEMOCLAW_GATEWAY_NAME}`, { + ignoreError: true, + }); +} + +function hasNoLiveSandboxes(): boolean { + const { captureOpenshell } = require("./openshell-runtime") as { + captureOpenshell: ( + args: string[], + opts?: { ignoreError?: boolean; timeout?: number }, + ) => { status: number | null; output: string }; + }; + const liveList = captureOpenshell(["sandbox", "list"], { + ignoreError: true, + timeout: OPENSHELL_PROBE_TIMEOUT_MS, + }); + if (liveList.status !== 0) { + return false; + } + return parseLiveSandboxNames(liveList.output).size === 0; +} + +function isMissingSandboxDeleteResult(output = ""): boolean { + return /\bNotFound\b|\bNot Found\b|sandbox not found|sandbox .* not found|sandbox .* not present|sandbox does not exist|no such sandbox/i.test( + stripAnsi(output), + ); +} + +export function getSandboxDeleteOutcome(deleteResult: SpawnLikeResult): { + output: string; + alreadyGone: boolean; +} { + const output = `${deleteResult.stdout || ""}${deleteResult.stderr || ""}`.trim(); + return { + output, + alreadyGone: deleteResult.status !== 0 && isMissingSandboxDeleteResult(output), + }; +} + +function cleanupSandboxServices( + sandboxName: string, + { stopHostServices = false }: { stopHostServices?: boolean } = {}, +): void { + if (stopHostServices) { + const { stopAll } = require("./services"); + stopAll({ sandboxName }); + } + + const sb = registry.getSandbox(sandboxName); + if (sb?.provider?.includes("ollama")) { + const { unloadOllamaModels } = require("./onboard-ollama-proxy"); + unloadOllamaModels(); + } + + try { + fs.rmSync(`/tmp/nemoclaw-services-${sandboxName}`, { recursive: true, force: true }); + } catch { + // PID directory may not exist — ignore. + } + + // Delete messaging providers created during onboard. Suppress stderr so + // "! Provider not found" noise doesn't appear when messaging was never configured. + const { runOpenshell } = require("./openshell-runtime") as { + runOpenshell: (args: string[], opts?: Record) => { status: number | null }; + }; + for (const suffix of ["telegram-bridge", "discord-bridge", "slack-bridge"]) { + runOpenshell(["provider", "delete", `${sandboxName}-${suffix}`], { + ignoreError: true, + stdio: ["ignore", "ignore", "ignore"], + }); + } +} + +/** + * Remove the host-side Docker image that was built for a sandbox during onboard. + * Must be called before registry.removeSandbox() since the imageTag is stored there. + */ +export function removeSandboxImage( + sandboxName: string, + deps: RemoveSandboxImageDeps = {}, +): void { + const getSandbox = deps.getSandbox ?? registry.getSandbox; + const removeImage = + deps.dockerRmi ?? (require("./docker") as { dockerRmi: DockerRmi }).dockerRmi; + const sb = getSandbox(sandboxName); + if (!sb?.imageTag) return; + const result = removeImage(sb.imageTag, { ignoreError: true }); + if (result.status === 0) { + console.log(` Removed Docker image ${sb.imageTag}`); + } else { + console.warn( + ` ${YW}⚠${R} Failed to remove Docker image ${sb.imageTag}; run '${CLI_NAME} gc' to clean up.`, + ); + } +} + +export function removeSandboxRegistryEntry( + sandboxName: string, + deps: RemoveSandboxRegistryEntryDeps = {}, +): boolean { + const removeImage = deps.removeImage ?? removeSandboxImage; + const removeSandbox = deps.removeSandbox ?? registry.removeSandbox; + removeImage(sandboxName); + return removeSandbox(sandboxName); +} + +export async function destroySandbox(sandboxName: string, args: string[] = []): Promise { + const skipConfirm = args.includes("--yes") || args.includes("--force"); + + // Active session detection — enrich the confirmation prompt if sessions are active + let activeSessionCount = 0; + const opsBin = resolveOpenshell(); + if (opsBin) { + try { + const sessionResult = getActiveSandboxSessions(sandboxName, createSessionDeps(opsBin)); + if (sessionResult.detected) { + activeSessionCount = sessionResult.sessions.length; + } + } catch { + /* non-fatal */ + } + } + + if (!skipConfirm) { + console.log(` ${YW}Destroy sandbox '${sandboxName}'?${R}`); + if (activeSessionCount > 0) { + const plural = activeSessionCount > 1 ? "sessions" : "session"; + console.log( + ` ${YW}⚠ Active SSH ${plural} detected (${activeSessionCount} connection${activeSessionCount > 1 ? "s" : ""})${R}`, + ); + console.log( + ` Destroying will terminate ${activeSessionCount === 1 ? "the" : "all"} active ${plural} with a Broken pipe error.`, + ); + } + console.log(" This will permanently delete the sandbox and all workspace files inside it."); + console.log(" This cannot be undone."); + const answer = await askPrompt(" Type 'yes' to confirm, or press Enter to cancel [y/N]: "); + if (answer.trim().toLowerCase() !== "y" && answer.trim().toLowerCase() !== "yes") { + console.log(" Cancelled."); + return; + } + } + + const nim = require("./nim") as { + stopNimContainer: (sandboxName: string, opts?: { silent?: boolean }) => void; + stopNimContainerByName: (name: string) => void; + }; + const sb = registry.getSandbox(sandboxName); + if (sb && sb.nimContainer) { + console.log(` Stopping NIM for '${sandboxName}'...`); + nim.stopNimContainerByName(sb.nimContainer); + } else { + // Best-effort cleanup of convention-named NIM containers that may not + // be recorded in the registry (e.g. older sandboxes). Suppress output + // so the user doesn't see "No such container" noise when no NIM exists. + nim.stopNimContainer(sandboxName, { silent: true }); + } + + if (sb?.provider?.includes("ollama")) { + const { unloadOllamaModels, killStaleProxy } = require("./onboard-ollama-proxy"); + unloadOllamaModels(); + killStaleProxy(); + } + + console.log(` Deleting sandbox '${sandboxName}'...`); + const { runOpenshell } = require("./openshell-runtime") as { + runOpenshell: ( + args: string[], + opts?: Record, + ) => { status: number | null; stdout?: string; stderr?: string }; + }; + const deleteResult = runOpenshell(["sandbox", "delete", sandboxName], { + ignoreError: true, + stdio: ["ignore", "pipe", "pipe"], + }); + const { output: deleteOutput, alreadyGone } = getSandboxDeleteOutcome(deleteResult); + + if (deleteResult.status !== 0 && !alreadyGone) { + if (deleteOutput) { + console.error(` ${deleteOutput}`); + } + console.error(` Failed to destroy sandbox '${sandboxName}'.`); + process.exit(deleteResult.status || 1); + } + + const shouldStopHostServices = + (deleteResult.status === 0 || alreadyGone) && + registry.listSandboxes().sandboxes.length === 1 && + !!registry.getSandbox(sandboxName); + + cleanupSandboxServices(sandboxName, { stopHostServices: shouldStopHostServices }); + const removed = removeSandboxRegistryEntry(sandboxName); + const session = onboardSession.loadSession(); + if (session && session.sandboxName === sandboxName) { + onboardSession.updateSession((s: Session) => { + s.sandboxName = null; + return s; + }); + } + if ( + (deleteResult.status === 0 || alreadyGone) && + removed && + registry.listSandboxes().sandboxes.length === 0 && + hasNoLiveSandboxes() + ) { + cleanupGatewayAfterLastSandbox(); + } + if (alreadyGone) { + console.log(` Sandbox '${sandboxName}' was already absent from the live gateway.`); + } + console.log(` ${G}✓${R} Sandbox '${sandboxName}' destroyed`); +} diff --git a/src/lib/sandbox-runtime-actions.ts b/src/lib/sandbox-runtime-actions.ts index e3ca1b4733..2e659c6c6e 100644 --- a/src/lib/sandbox-runtime-actions.ts +++ b/src/lib/sandbox-runtime-actions.ts @@ -31,7 +31,10 @@ export function showSandboxLogs(sandboxName: string, follow: boolean): void { } export async function destroySandbox(sandboxName: string, args: string[] = []): Promise { - await getNemoClawRuntimeBridge().sandboxDestroy(sandboxName, args); + const { destroySandbox: destroyExtractedSandbox } = require("./sandbox-destroy-action") as { + destroySandbox: (sandboxName: string, args?: string[]) => Promise; + }; + await destroyExtractedSandbox(sandboxName, args); } export async function rebuildSandbox(sandboxName: string, args: string[] = []): Promise { diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index 0a5f7360f7..30ec557a11 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -3,8 +3,7 @@ const { spawn, spawnSync } = require("child_process"); const path = require("path"); -const fs = require("fs"); -const { DASHBOARD_PORT } = require("./lib/ports"); + // --------------------------------------------------------------------------- // Color / style — respects NO_COLOR and non-TTY environments. @@ -28,12 +27,7 @@ const { ROOT, run, runInteractive, validateName } = require("./lib/runner"); // --------------------------------------------------------------------------- const { CLI_NAME, CLI_DISPLAY_NAME } = require("./lib/branding"); -const { - dockerCapture, - dockerInspect, - dockerRemoveVolumesByPrefix, - dockerRmi, -} = require("./lib/docker"); +const { dockerCapture, dockerInspect } = require("./lib/docker"); const { resolveOpenshell } = require("./lib/resolve-openshell"); const { hydrateCredentialEnv, isNonInteractive } = require("./lib/onboard"); const { prompt: askPrompt } = require("./lib/credentials"); @@ -46,7 +40,6 @@ const { help, version } = require("./lib/root-help-action"); const onboardSession = require("./lib/onboard-session"); import type { Session } from "./lib/onboard-session"; const { parseLiveSandboxNames } = require("./lib/runtime-recovery"); -const { stripAnsi } = require("./lib/openshell"); const { captureOpenshell, getInstalledOpenshellVersionOrNull, @@ -60,6 +53,10 @@ const { printSandboxConnectHelp, } = require("./lib/sandbox-connect-action"); const { executeSandboxCommand } = require("./lib/sandbox-process-recovery-action"); +const { + getSandboxDeleteOutcome, + removeSandboxRegistryEntry, +} = require("./lib/sandbox-destroy-action"); const { runRegisteredOclifCommand } = require("./lib/oclif-runner"); const agentRuntime = require("../bin/lib/agent-runtime"); const sandboxVersion = require("./lib/sandbox-version"); @@ -101,56 +98,7 @@ type RecoveredSandboxMetadata = Partial< policyPresets?: string[] | null; }; -const NEMOCLAW_GATEWAY_NAME = "nemoclaw"; -const DASHBOARD_FORWARD_PORT = String(DASHBOARD_PORT); -const DEFAULT_LOGS_PROBE_TIMEOUT_MS = 5000; -const LOGS_PROBE_TIMEOUT_ENV = "NEMOCLAW_LOGS_PROBE_TIMEOUT_MS"; - -type CommandCapture = { - status: number; - stdout: string; - stderr: string; - error?: Error; -}; - -function cleanupGatewayAfterLastSandbox() { - runOpenshell(["forward", "stop", DASHBOARD_FORWARD_PORT], { - ignoreError: true, - stdio: ["ignore", "ignore", "ignore"], - }); - runOpenshell(["gateway", "destroy", "-g", NEMOCLAW_GATEWAY_NAME], { ignoreError: true }); - dockerRemoveVolumesByPrefix(`openshell-cluster-${NEMOCLAW_GATEWAY_NAME}`, { - ignoreError: true, - }); -} - -function hasNoLiveSandboxes() { - const liveList = captureOpenshell(["sandbox", "list"], { - ignoreError: true, - timeout: OPENSHELL_PROBE_TIMEOUT_MS, - }); - if (liveList.status !== 0) { - return false; - } - return parseLiveSandboxNames(liveList.output).size === 0; -} - -function isMissingSandboxDeleteResult(output = ""): boolean { - return /\bNotFound\b|\bNot Found\b|sandbox not found|sandbox .* not found|sandbox .* not present|sandbox does not exist|no such sandbox/i.test( - stripAnsi(output), - ); -} - -function getSandboxDeleteOutcome(deleteResult: SpawnLikeResult) { - const output = `${deleteResult.stdout || ""}${deleteResult.stderr || ""}`.trim(); - return { - output, - alreadyGone: deleteResult.status !== 0 && isMissingSandboxDeleteResult(output), - }; -} - exports.runtimeBridge = { - sandboxDestroy, sandboxRebuild, upgradeSandboxes, }; @@ -182,153 +130,6 @@ function printSandboxActionUsage(action: string): void { console.log(` Usage: ${CLI_NAME} ${action}`); } -function cleanupSandboxServices( - sandboxName: string, - { stopHostServices = false }: { stopHostServices?: boolean } = {}, -) { - if (stopHostServices) { - const { stopAll } = require("./lib/services"); - stopAll({ sandboxName }); - } - - const sb = registry.getSandbox(sandboxName); - if (sb?.provider?.includes("ollama")) { - const { unloadOllamaModels } = require("./lib/onboard-ollama-proxy"); - unloadOllamaModels(); - } - - try { - fs.rmSync(`/tmp/nemoclaw-services-${sandboxName}`, { recursive: true, force: true }); - } catch { - // PID directory may not exist — ignore. - } - - // Delete messaging providers created during onboard. Suppress stderr so - // "! Provider not found" noise doesn't appear when messaging was never configured. - for (const suffix of ["telegram-bridge", "discord-bridge", "slack-bridge"]) { - runOpenshell(["provider", "delete", `${sandboxName}-${suffix}`], { - ignoreError: true, - stdio: ["ignore", "ignore", "ignore"], - }); - } -} - -/** - * Remove the host-side Docker image that was built for a sandbox during onboard. - * Must be called before registry.removeSandbox() since the imageTag is stored there. - */ -function removeSandboxImage(sandboxName: string) { - const sb = registry.getSandbox(sandboxName); - if (!sb?.imageTag) return; - const result = dockerRmi(sb.imageTag, { ignoreError: true }); - if (result.status === 0) { - console.log(` Removed Docker image ${sb.imageTag}`); - } else { - console.warn( - ` ${YW}⚠${R} Failed to remove Docker image ${sb.imageTag}; run '${CLI_NAME} gc' to clean up.`, - ); - } -} - -async function sandboxDestroy(sandboxName: string, args: string[] = []): Promise { - const skipConfirm = args.includes("--yes") || args.includes("--force"); - - // Active session detection — enrich the confirmation prompt if sessions are active - let activeSessionCount = 0; - const opsBin = resolveOpenshell(); - if (opsBin) { - try { - const sessionResult = getActiveSandboxSessions(sandboxName, createSessionDeps(opsBin)); - if (sessionResult.detected) { - activeSessionCount = sessionResult.sessions.length; - } - } catch { - /* non-fatal */ - } - } - - if (!skipConfirm) { - console.log(` ${YW}Destroy sandbox '${sandboxName}'?${R}`); - if (activeSessionCount > 0) { - const plural = activeSessionCount > 1 ? "sessions" : "session"; - console.log( - ` ${YW}⚠ Active SSH ${plural} detected (${activeSessionCount} connection${activeSessionCount > 1 ? "s" : ""})${R}`, - ); - console.log( - ` Destroying will terminate ${activeSessionCount === 1 ? "the" : "all"} active ${plural} with a Broken pipe error.`, - ); - } - console.log(" This will permanently delete the sandbox and all workspace files inside it."); - console.log(" This cannot be undone."); - const answer = await askPrompt(" Type 'yes' to confirm, or press Enter to cancel [y/N]: "); - if (answer.trim().toLowerCase() !== "y" && answer.trim().toLowerCase() !== "yes") { - console.log(" Cancelled."); - return; - } - } - - const sb = registry.getSandbox(sandboxName); - if (sb && sb.nimContainer) { - console.log(` Stopping NIM for '${sandboxName}'...`); - nim.stopNimContainerByName(sb.nimContainer); - } else { - // Best-effort cleanup of convention-named NIM containers that may not - // be recorded in the registry (e.g. older sandboxes). Suppress output - // so the user doesn't see "No such container" noise when no NIM exists. - nim.stopNimContainer(sandboxName, { silent: true }); - } - - if (sb?.provider?.includes("ollama")) { - const { unloadOllamaModels, killStaleProxy } = require("./lib/onboard-ollama-proxy"); - unloadOllamaModels(); - killStaleProxy(); - } - - console.log(` Deleting sandbox '${sandboxName}'...`); - const deleteResult = runOpenshell(["sandbox", "delete", sandboxName], { - ignoreError: true, - stdio: ["ignore", "pipe", "pipe"], - }); - const { output: deleteOutput, alreadyGone } = getSandboxDeleteOutcome(deleteResult); - - if (deleteResult.status !== 0 && !alreadyGone) { - if (deleteOutput) { - console.error(` ${deleteOutput}`); - } - console.error(` Failed to destroy sandbox '${sandboxName}'.`); - process.exit(deleteResult.status || 1); - } - - const shouldStopHostServices = - (deleteResult.status === 0 || alreadyGone) && - registry.listSandboxes().sandboxes.length === 1 && - !!registry.getSandbox(sandboxName); - - cleanupSandboxServices(sandboxName, { stopHostServices: shouldStopHostServices }); - removeSandboxImage(sandboxName); - - const removed = registry.removeSandbox(sandboxName); - const session = onboardSession.loadSession(); - if (session && session.sandboxName === sandboxName) { - onboardSession.updateSession((s: Session) => { - s.sandboxName = null; - return s; - }); - } - if ( - (deleteResult.status === 0 || alreadyGone) && - removed && - registry.listSandboxes().sandboxes.length === 0 && - hasNoLiveSandboxes() - ) { - cleanupGatewayAfterLastSandbox(); - } - if (alreadyGone) { - console.log(` Sandbox '${sandboxName}' was already absent from the live gateway.`); - } - console.log(` ${G}✓${R} Sandbox '${sandboxName}' destroyed`); -} - // ── Rebuild ────────────────────────────────────────────────────── function _rebuildLog(msg: string) { @@ -558,8 +359,7 @@ async function sandboxRebuild( bail("Failed to delete sandbox.", deleteResult.status || 1); return; } - removeSandboxImage(sandboxName); - registry.removeSandbox(sandboxName); + removeSandboxRegistryEntry(sandboxName); log( `Registry after remove: ${JSON.stringify(registry.listSandboxes().sandboxes.map((s: { name: string }) => s.name))}`, ); diff --git a/test/image-cleanup.test.ts b/test/image-cleanup.test.ts index f40f299868..9714f9f83e 100644 --- a/test/image-cleanup.test.ts +++ b/test/image-cleanup.test.ts @@ -8,62 +8,66 @@ import { describe, it, expect } from "vitest"; import fs from "node:fs"; import path from "node:path"; +import { + getSandboxDeleteOutcome, + removeSandboxImage, + removeSandboxRegistryEntry, +} from "../src/lib/sandbox-destroy-action"; import { help as renderRootHelp } from "../src/lib/root-help-action"; const ROOT = path.resolve(import.meta.dirname, ".."); describe("image cleanup: sandbox destroy removes Docker image (#2086)", () => { - const nemoclawSrc = fs.readFileSync(path.join(ROOT, "src/nemoclaw.ts"), "utf-8"); - - it("removeSandboxImage() helper exists and calls docker rmi", () => { - const match = nemoclawSrc.match(/function removeSandboxImage[\s\S]*?^}/m); - expect(match).toBeTruthy(); - if (!match) throw new Error("Expected removeSandboxImage() in src/nemoclaw.ts"); - expect(match[0]).toMatch(/dockerRmi\(|docker.*\.rmi\(/); + it("removes sandbox images before deleting the registry entry", () => { + const calls: string[] = []; + + const removed = removeSandboxRegistryEntry("alpha", { + removeImage: (sandboxName) => calls.push(`image:${sandboxName}`), + removeSandbox: (sandboxName) => { + calls.push(`registry:${sandboxName}`); + return true; + }, + }); + + expect(removed).toBe(true); + expect(calls).toEqual(["image:alpha", "registry:alpha"]); }); - it("sandboxDestroy calls removeSandboxImage before registry.removeSandbox", () => { - // Extract the sandboxDestroy function body - const destroyMatch = nemoclawSrc.match(/async function sandboxDestroy[\s\S]*?^}/m); - expect(destroyMatch).toBeTruthy(); - if (!destroyMatch) { - throw new Error("Expected sandboxDestroy() in src/nemoclaw.ts"); - } - const destroyBody = destroyMatch[0]; - - // removeSandboxImage must appear before registry.removeSandbox - const removeImageIdx = destroyBody.indexOf("removeSandboxImage("); - const removeRegistryIdx = destroyBody.indexOf("registry.removeSandbox("); - expect(removeImageIdx).toBeGreaterThan(-1); - expect(removeRegistryIdx).toBeGreaterThan(-1); - expect(removeImageIdx).toBeLessThan(removeRegistryIdx); - }); + it("removeSandboxImage calls docker rmi for recorded image tags", () => { + const removedTags: string[] = []; - it("sandboxRebuild calls removeSandboxImage before registry.removeSandbox", () => { - const rebuildMatch = nemoclawSrc.match( - /async function sandboxRebuild[\s\S]*?^\s*console\.log\(`\s*\$\{G\}.*Sandbox.*rebuilt/m, - ); - expect(rebuildMatch).toBeTruthy(); - if (!rebuildMatch) { - throw new Error("Expected sandboxRebuild() in src/nemoclaw.ts"); - } - const rebuildBody = rebuildMatch[0]; + removeSandboxImage("alpha", { + getSandbox: () => ({ name: "alpha", imageTag: "openshell/sandbox-from:123" }) as any, + dockerRmi: (tag) => { + removedTags.push(tag); + return { status: 0 } as any; + }, + }); - const removeImageIdx = rebuildBody.indexOf("removeSandboxImage("); - const removeRegistryIdx = rebuildBody.indexOf("registry.removeSandbox("); - expect(removeImageIdx).toBeGreaterThan(-1); - expect(removeRegistryIdx).toBeGreaterThan(-1); - expect(removeImageIdx).toBeLessThan(removeRegistryIdx); + expect(removedTags).toEqual(["openshell/sandbox-from:123"]); }); it("removeSandboxImage gracefully handles missing imageTag", () => { - // The function should check for imageTag before attempting removal - const fnMatch = nemoclawSrc.match(/function removeSandboxImage[\s\S]*?^}/m); - expect(fnMatch).toBeTruthy(); - if (!fnMatch) { - throw new Error("Expected removeSandboxImage() in src/nemoclaw.ts"); - } - expect(fnMatch[0]).toContain("imageTag"); + const removedTags: string[] = []; + + removeSandboxImage("alpha", { + getSandbox: () => ({ name: "alpha", imageTag: null }) as any, + dockerRmi: (tag) => { + removedTags.push(tag); + return { status: 0 } as any; + }, + }); + + expect(removedTags).toEqual([]); + }); + + it("treats missing sandbox delete results as already gone", () => { + expect( + getSandboxDeleteOutcome({ status: 1, stderr: "Error: sandbox alpha not found" }), + ).toEqual({ + output: "Error: sandbox alpha not found", + alreadyGone: true, + }); }); }); From 56e4f05a1037ded2a65b7b39c8883f0a8412608b Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 20:21:20 -0700 Subject: [PATCH 07/65] refactor(cli): extract sandbox rebuild action --- src/lib/nemoclaw-runtime-bridge.ts | 1 - src/lib/sandbox-rebuild-action.ts | 506 +++++++++++++++++++++++++++ src/lib/sandbox-runtime-actions.ts | 5 +- src/nemoclaw.ts | 541 +---------------------------- 4 files changed, 515 insertions(+), 538 deletions(-) create mode 100644 src/lib/sandbox-rebuild-action.ts diff --git a/src/lib/nemoclaw-runtime-bridge.ts b/src/lib/nemoclaw-runtime-bridge.ts index 821632ca3c..5811dfee5a 100644 --- a/src/lib/nemoclaw-runtime-bridge.ts +++ b/src/lib/nemoclaw-runtime-bridge.ts @@ -4,7 +4,6 @@ /* v8 ignore start -- transitional bridge until command actions are extracted from src/nemoclaw.ts. */ export interface NemoClawRuntimeBridge { - sandboxRebuild: (sandboxName: string, args?: string[]) => Promise; upgradeSandboxes: (args?: string[]) => Promise; } diff --git a/src/lib/sandbox-rebuild-action.ts b/src/lib/sandbox-rebuild-action.ts new file mode 100644 index 0000000000..ce9581cc24 --- /dev/null +++ b/src/lib/sandbox-rebuild-action.ts @@ -0,0 +1,506 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/* v8 ignore start -- exercised through CLI subprocess rebuild tests. */ + +import { CLI_NAME } from "./branding"; +import { prompt as askPrompt } from "./credentials"; +const { hydrateCredentialEnv } = require("./onboard") as { + hydrateCredentialEnv: (name: string) => string | null; +}; +import * as nim from "./nim"; +import * as onboardSession from "./onboard-session"; +import type { Session } from "./onboard-session"; +import { captureOpenshell, runOpenshell } from "./openshell-runtime"; +import * as policies from "./policies"; +import * as registry from "./registry"; +import { resolveOpenshell } from "./resolve-openshell"; +import { parseLiveSandboxNames } from "./runtime-recovery"; +import { getSandboxDeleteOutcome, removeSandboxRegistryEntry } from "./sandbox-destroy-action"; +import { executeSandboxCommand } from "./sandbox-process-recovery-action"; +import { + createSystemDeps as createSessionDeps, + getActiveSandboxSessions, +} from "./sandbox-session-state"; +import * as sandboxState from "./sandbox-state"; +import * as sandboxVersion from "./sandbox-version"; +import { B, D, G, R, RD as _RD, YW } from "./terminal-style"; + +const agentRuntime = require("../../bin/lib/agent-runtime"); + +function _rebuildLog(msg: string) { + console.error(` ${D}[rebuild ${new Date().toISOString()}] ${msg}${R}`); +} + +export async function rebuildSandbox( + sandboxName: string, + args: string[] = [], + opts: { throwOnError?: boolean } = {}, +): Promise { + const verbose = + args.includes("--verbose") || + args.includes("-v") || + process.env.NEMOCLAW_REBUILD_VERBOSE === "1"; + const log: (msg: string) => void = verbose ? _rebuildLog : () => {}; + const skipConfirm = args.includes("--yes") || args.includes("--force"); + // When called from upgradeSandboxes in a loop, throwOnError prevents + // process.exit from aborting the entire batch on the first failure. + const bail = opts.throwOnError + ? (msg: string, code = 1) => { + throw new Error(msg); + } + : (_msg: string, code = 1) => process.exit(code); + + // Active session detection — enrich the confirmation prompt if sessions are active + let rebuildActiveSessionCount = 0; + const opsBinRebuild = resolveOpenshell(); + if (opsBinRebuild) { + try { + const sessionResult = getActiveSandboxSessions(sandboxName, createSessionDeps(opsBinRebuild)); + if (sessionResult.detected) { + rebuildActiveSessionCount = sessionResult.sessions.length; + } + } catch { + /* non-fatal */ + } + } + + const sb = registry.getSandbox(sandboxName) as any; + if (!sb) { + console.error(` Sandbox '${sandboxName}' not found in registry.`); + bail(`Sandbox '${sandboxName}' not found in registry.`); + return; + } + + // Multi-agent guard (temporary — until swarm lands) + if (sb.agents && sb.agents.length > 1) { + console.error(" Multi-agent sandbox rebuild is not yet supported."); + console.error(` Back up state manually and recreate with \`${CLI_NAME} onboard\`.`); + bail("Multi-agent sandbox rebuild is not yet supported."); + return; + } + + const agent = agentRuntime.getSessionAgent(sandboxName); + const agentName = agentRuntime.getAgentDisplayName(agent); + + // Version check — show what's changing + const versionCheck = sandboxVersion.checkAgentVersion(sandboxName); + console.log(""); + console.log(` ${B}Rebuild sandbox '${sandboxName}'${R}`); + if (versionCheck.sandboxVersion) { + console.log(` Current: ${agentName} v${versionCheck.sandboxVersion}`); + } + if (versionCheck.expectedVersion) { + console.log(` Target: ${agentName} v${versionCheck.expectedVersion}`); + } + console.log(""); + + if (!skipConfirm) { + if (rebuildActiveSessionCount > 0) { + const plural = rebuildActiveSessionCount > 1 ? "sessions" : "session"; + console.log( + ` ${YW}⚠ Active SSH ${plural} detected (${rebuildActiveSessionCount} connection${rebuildActiveSessionCount > 1 ? "s" : ""})${R}`, + ); + console.log( + ` Rebuilding will terminate ${rebuildActiveSessionCount === 1 ? "the" : "all"} active ${plural} with a Broken pipe error.`, + ); + console.log(""); + } + console.log(" This will:"); + console.log(" 1. Back up workspace state"); + console.log(" 2. Destroy and recreate the sandbox with the current image"); + console.log(" 3. Restore workspace state into the new sandbox"); + console.log(""); + const answer = await askPrompt(" Proceed? [y/N]: "); + if (answer.trim().toLowerCase() !== "y" && answer.trim().toLowerCase() !== "yes") { + console.log(" Cancelled."); + return; + } + } + + // Step 0: Preflight — verify recreate preconditions BEFORE destroying + // anything. The most common rebuild failure is a missing provider + // credential when onboard runs in non-interactive mode. Checking now + // lets us abort with the sandbox still intact. See #2273. + const session = onboardSession.loadSession(); + let rebuildCredentialEnv: string | null = null; + if (session && session.sandboxName && session.sandboxName !== sandboxName) { + // Session belongs to a different sandbox — its credentialEnv may be + // wrong (e.g. hermes session while rebuilding openclaw). Skip the + // credential preflight; the agent sync from the registry (#2201) + // and onboard itself will handle provider selection. + log( + `Preflight warning: session belongs to '${session.sandboxName}', not '${sandboxName}' — skipping credential preflight`, + ); + console.log( + ` ${D}Note: onboard session belongs to '${session.sandboxName}', not '${sandboxName}'. ` + + `Skipping credential preflight.${R}`, + ); + } else { + rebuildCredentialEnv = session?.credentialEnv || null; + } + // Legacy migration: pre-fix local-inference sandboxes (GH #2519) recorded + // credentialEnv="OPENAI_API_KEY" in onboard-session.json even though the + // sandbox does not actually need a host OpenAI key (ollama-local uses an + // auth proxy with an internal token; vllm-local accepts a static dummy + // bearer). Treat the legacy value as null so rebuild does not demand a + // credential that was never actually used. + if ( + (session?.provider === "ollama-local" || session?.provider === "vllm-local") && + rebuildCredentialEnv === "OPENAI_API_KEY" + ) { + console.log( + ` ${D}Note: migrating ${session.provider} sandbox off OPENAI_API_KEY (GH #2519). ` + + `Local inference does not require a host API key.${R}`, + ); + log( + `Preflight: legacy ${session.provider} sandbox detected (credentialEnv=OPENAI_API_KEY) — clearing for rebuild`, + ); + rebuildCredentialEnv = null; + } + if (rebuildCredentialEnv) { + // hydrateCredentialEnv migrates any pre-fix legacy credentials.json + // into process.env once, so users upgrading from a release that wrote + // the plaintext file can still rebuild without re-entering keys. + const credentialValue = hydrateCredentialEnv(rebuildCredentialEnv); + log( + `Preflight credential check: ${rebuildCredentialEnv} → ${credentialValue ? "present" : "MISSING"}`, + ); + if (!credentialValue) { + console.error(""); + console.error(` ${_RD}Rebuild preflight failed:${R} provider credential not found.`); + console.error(` The non-interactive recreate step requires ${rebuildCredentialEnv},`); + console.error(" but it is not set in the environment."); + console.error(""); + console.error(" To fix, do one of:"); + console.error(` export ${rebuildCredentialEnv}=`); + console.error(` ${CLI_NAME} onboard # re-enter the key interactively`); + console.error(""); + console.error(" Sandbox is untouched — no data was lost."); + bail(`Missing credential: ${rebuildCredentialEnv}`); + return; + } + } else { + // No credentialEnv in session — local inference (Ollama/vLLM) or + // session was lost. Either way, skip the credential preflight; + // onboard will handle it. + log( + "Preflight credential check: no credentialEnv in session (local inference or missing session)", + ); + } + + // Step 1: Ensure sandbox is live for backup + log("Checking sandbox liveness: openshell sandbox list"); + const isLive = captureOpenshell(["sandbox", "list"], { ignoreError: true }); + log( + `openshell sandbox list exit=${isLive.status}, output=${(isLive.output || "").substring(0, 200)}`, + ); + const liveNames = parseLiveSandboxNames(isLive.output || ""); + log(`Live sandboxes: ${Array.from(liveNames).join(", ") || "(none)"}`); + if (!liveNames.has(sandboxName)) { + console.error(` Sandbox '${sandboxName}' is not running. Cannot back up state.`); + console.error(` Start it first or recreate with \`${CLI_NAME} onboard --recreate-sandbox\`.`); + bail(`Sandbox '${sandboxName}' is not running.`); + return; + } + + // Step 2: Backup + console.log(" Backing up sandbox state..."); + log(`Agent type: ${sb.agent || "openclaw"}, stateDirs from manifest`); + const backup = sandboxState.backupSandboxState(sandboxName); + log( + `Backup result: success=${backup.success}, backed=${backup.backedUpDirs.join(",")}, failed=${backup.failedDirs.join(",")}`, + ); + if (!backup.success) { + console.error(" Failed to back up sandbox state."); + if (backup.backedUpDirs.length > 0) { + console.error(` Partial backup: ${backup.backedUpDirs.join(", ")}`); + } + if (backup.failedDirs.length > 0) { + console.error(` Failed: ${backup.failedDirs.join(", ")}`); + } + console.error(" Aborting rebuild to prevent data loss."); + bail("Failed to back up sandbox state."); + return; + } + const backupManifest = backup.manifest; + if (!backupManifest) { + console.error(" Failed to record backup metadata."); + console.error(" Aborting rebuild to prevent data loss."); + bail("Failed to record backup metadata."); + return; + } + console.log(` ${G}\u2713${R} State backed up (${backup.backedUpDirs.length} directories)`); + console.log(` Backup: ${backupManifest.backupPath}`); + + // Step 3: Delete sandbox without tearing down gateway or session. + // sandboxDestroy() cleans up the gateway when it's the last sandbox and + // nulls session.sandboxName — both break the immediate onboard --resume. + console.log(" Deleting old sandbox..."); + const sbMeta = registry.getSandbox(sandboxName); + log( + `Registry entry: agent=${sbMeta?.agent}, agentVersion=${sbMeta?.agentVersion}, nimContainer=${sbMeta?.nimContainer}`, + ); + if (sbMeta && sbMeta.nimContainer) { + log(`Stopping NIM container: ${sbMeta.nimContainer}`); + nim.stopNimContainerByName(sbMeta.nimContainer); + } else { + // Best-effort cleanup — see comment in sandboxDestroy. + nim.stopNimContainer(sandboxName, { silent: true }); + } + + log(`Running: openshell sandbox delete ${sandboxName}`); + const deleteResult = runOpenshell(["sandbox", "delete", sandboxName], { + ignoreError: true, + stdio: ["ignore", "pipe", "pipe"], + }); + const { alreadyGone } = getSandboxDeleteOutcome(deleteResult); + log(`Delete result: exit=${deleteResult.status}, alreadyGone=${alreadyGone}`); + if (deleteResult.status !== 0 && !alreadyGone) { + console.error(" Failed to delete sandbox. Aborting rebuild."); + console.error(" State backup is preserved at: " + backupManifest.backupPath); + bail("Failed to delete sandbox.", deleteResult.status || 1); + return; + } + removeSandboxRegistryEntry(sandboxName); + log( + `Registry after remove: ${JSON.stringify(registry.listSandboxes().sandboxes.map((s: { name: string }) => s.name))}`, + ); + console.log(` ${G}\u2713${R} Old sandbox deleted`); + + // Step 4: Recreate via onboard --resume + console.log(""); + console.log(" Creating new sandbox with current image..."); + + // Force the sandbox name so onboard recreates with the same name. + // Mark session resumable and point at this sandbox; set env var as fallback. + const sessionBefore = onboardSession.loadSession(); + const sessionMatchesSandbox = sessionBefore?.sandboxName === sandboxName; + log( + `Session before update: sandboxName=${sessionBefore?.sandboxName}, status=${sessionBefore?.status}, resumable=${sessionBefore?.resumable}, provider=${sessionBefore?.provider}, model=${sessionBefore?.model}, sessionMatch=${sessionMatchesSandbox}`, + ); + + // Sync the session's agent field with the registry so onboard --resume + // rebuilds the correct sandbox type. Without this, a stale session.agent + // from a previous onboard of a *different* agent type would be picked up + // by resolveAgentName() and the wrong Dockerfile would be used. (#2201) + const rebuildAgent = sb.agent || null; + onboardSession.updateSession((s: Session) => { + s.sandboxName = sandboxName; + s.resumable = true; + s.status = "in_progress"; + s.agent = rebuildAgent; + // Persist inference selection from the about-to-be-removed registry entry + // so onboard --resume can recreate with the same provider/model in + // non-interactive mode. Without this the registry is gone by the time + // setupNim runs, leaving no recovery source. Assign explicitly (with a + // null fallback) so a missing registry value doesn't silently leave a + // stale session entry from an earlier sandbox in place. + s.provider = sb.provider ?? null; + s.model = sb.model ?? null; + s.nimContainer = sb.nimContainer ?? null; + return s; + }); + process.env.NEMOCLAW_SANDBOX_NAME = sandboxName; + + const sessionAfter = onboardSession.loadSession(); + log( + `Session after update: sandboxName=${sessionAfter?.sandboxName}, status=${sessionAfter?.status}, resumable=${sessionAfter?.resumable}, provider=${sessionAfter?.provider}, model=${sessionAfter?.model}`, + ); + log( + `Env: NEMOCLAW_SANDBOX_NAME=${process.env.NEMOCLAW_SANDBOX_NAME}, NEMOCLAW_RECREATE_SANDBOX=${process.env.NEMOCLAW_RECREATE_SANDBOX}`, + ); + + // Forward the stored --from Dockerfile path so onboard --resume uses the + // same custom image. Without this, the conflict check rejects the resume + // because requestedFrom (null) !== recordedFrom (the stored path). (#2301) + // Only read from the session when it belongs to this sandbox to avoid + // using config from a different sandbox's onboard run. + const storedFromDockerfile = sessionMatchesSandbox + ? sessionAfter?.metadata?.fromDockerfile || null + : null; + log( + `Calling onboard({ resume: true, nonInteractive: true, recreateSandbox: true, fromDockerfile: ${storedFromDockerfile} })`, + ); + + // Intercept process.exit during onboard so we can attempt rollback + // instead of dying with the sandbox destroyed. onboard() has ~87 + // process.exit() calls that would otherwise kill the process with no + // chance to recover. See #2273. + // + // NOTE: Throwing from the overridden process.exit unwinds onboard's + // call stack, which skips process.once("exit") listeners (lock + // release, build context cleanup, session failure marking). We + // manually release the lock and mark the session failed in the + // onboardFailed block below. + const { onboard } = require("./onboard"); + let onboardFailed = false; + let onboardExitCode = 1; + const _savedExit = process.exit; + process.exit = ((code) => { + onboardFailed = true; + onboardExitCode = typeof code === "number" ? code : 1; + // Throw a sentinel to unwind the onboard call stack. + // The catch block below handles it. + const err = new Error(`onboard exited with code ${onboardExitCode}`); + err.name = "RebuildOnboardExit"; + throw err; + }) as typeof process.exit; + + try { + await onboard({ + resume: true, + nonInteractive: true, + recreateSandbox: true, + agent: rebuildAgent, + fromDockerfile: storedFromDockerfile, + }); + log("onboard() returned successfully"); + } catch (err) { + onboardFailed = true; + const message = err instanceof Error ? err.message : String(err); + const name = err instanceof Error ? err.name : ""; + if (name !== "RebuildOnboardExit") { + log(`onboard() threw: ${message}`); + } + } finally { + process.exit = _savedExit; + } + + if (onboardFailed) { + // Clean up onboard's internal state that normally runs in + // process.once("exit") listeners — those never fire because we + // threw from the overridden process.exit instead of actually + // exiting. Without this the onboard lock file stays on disk and + // blocks the next onboard/rebuild invocation. + try { + onboardSession.releaseOnboardLock(); + } catch { + /* best effort */ + } + try { + const failedStep = onboardSession.loadSession()?.lastStepStarted; + if (failedStep) { + onboardSession.markStepFailed(failedStep, "Rebuild recreate failed"); + } + } catch { + /* best effort */ + } + + console.error(""); + console.error(` ${_RD}Recreate failed after sandbox was destroyed.${R}`); + console.error(` Backup is preserved at: ${backupManifest.backupPath}`); + console.error(""); + console.error(" To recover manually:"); + console.error(` 1. Fix the issue above (missing credential, Docker problem, etc.)`); + console.error(` 2. Run: ${CLI_NAME} onboard --resume`); + console.error(` This will recreate sandbox '${sandboxName}'.`); + console.error(` 3. Then restore your workspace state:`); + console.error( + ` ${CLI_NAME} ${sandboxName} snapshot restore "${backupManifest.timestamp}"`, + ); + console.error(""); + bail( + `Recreate failed (sandbox destroyed). Backup: ${backupManifest.backupPath}`, + onboardExitCode, + ); + return; + } + + // Step 5: Restore + console.log(""); + console.log(" Restoring workspace state..."); + log(`Restoring from: ${backupManifest.backupPath} into sandbox: ${sandboxName}`); + const restore = sandboxState.restoreSandboxState(sandboxName, backupManifest.backupPath); + log( + `Restore result: success=${restore.success}, restored=${restore.restoredDirs.join(",")}, failed=${restore.failedDirs.join(",")}`, + ); + if (!restore.success) { + console.error(` Partial restore: ${restore.restoredDirs.join(", ") || "none"}`); + console.error(` Failed: ${restore.failedDirs.join(", ")}`); + console.error(` Manual restore available from: ${backupManifest.backupPath}`); + } else { + console.log(` ${G}\u2713${R} State restored (${restore.restoredDirs.length} directories)`); + } + + // Step 5.5: Restore policy presets (#1952) + // Policy presets live in the gateway policy engine, not the sandbox filesystem. + // They are lost when the sandbox is destroyed and recreated. Re-apply any + // presets that were captured in the backup manifest. + const savedPresets = backupManifest.policyPresets || []; + if (savedPresets.length > 0) { + console.log(""); + console.log(" Restoring policy presets..."); + log(`Policy presets to restore: [${savedPresets.join(",")}]`); + const restoredPresets: string[] = []; + const failedPresets: string[] = []; + for (const presetName of savedPresets) { + try { + log(`Applying preset: ${presetName}`); + const applied = policies.applyPreset(sandboxName, presetName); + if (applied) { + restoredPresets.push(presetName); + } else { + failedPresets.push(presetName); + } + } catch (err) { + const errorMessage = err instanceof Error ? err.message : String(err); + log(`Failed to apply preset '${presetName}': ${errorMessage}`); + failedPresets.push(presetName); + } + } + if (restoredPresets.length > 0) { + console.log(` ${G}\u2713${R} Policy presets restored: ${restoredPresets.join(", ")}`); + } + if (failedPresets.length > 0) { + console.error(` ${YW}\u26a0${R} Failed to restore presets: ${failedPresets.join(", ")}`); + console.error(` Re-apply manually with: ${CLI_NAME} ${sandboxName} policy-add`); + } + } + + // Step 6: Post-restore agent-specific migration + const agentDef = agent + ? require("./agent-defs").loadAgent(agent.name) + : require("./agent-defs").loadAgent("openclaw"); + if (agentDef.name === "openclaw") { + // openclaw doctor --fix validates and repairs directory structure. + // Idempotent and safe — catches structural changes between OpenClaw versions + // (new symlinks, new data dirs, etc.) that the restored state may be missing. + log("Running openclaw doctor --fix inside sandbox for post-upgrade structure repair"); + const doctorResult = executeSandboxCommand(sandboxName, "openclaw doctor --fix"); + log( + `doctor --fix: exit=${doctorResult?.status}, stdout=${(doctorResult?.stdout || "").substring(0, 200)}`, + ); + if (doctorResult && doctorResult.status === 0) { + console.log(` ${G}\u2713${R} Post-upgrade structure check passed`); + } else { + console.log( + ` ${D}Post-upgrade structure check skipped (doctor returned ${doctorResult?.status ?? "null"})${R}`, + ); + } + } + // Hermes: no explicit post-restore step needed. Hermes's SessionDB._init_schema() + // auto-migrates state.db (SQLite) on first connection via sequential ALTER TABLE + // migrations (idempotent, schema_version tracked). ensure_hermes_home() repairs + // missing directories implicitly. The NemoClaw plugin's skill cache refreshes on + // on_session_start. Gateway startup is non-fatal if state.db migration fails. + + // Step 7: Update registry with new version + registry.updateSandbox(sandboxName, { + agentVersion: agentDef.expectedVersion || null, + }); + log(`Registry updated: agentVersion=${agentDef.expectedVersion}`); + + console.log(""); + if (restore.success) { + console.log(` ${G}\u2713${R} Sandbox '${sandboxName}' rebuilt successfully`); + if (versionCheck.expectedVersion) { + console.log(` Now running: ${agentName} v${versionCheck.expectedVersion}`); + } + } else { + console.log( + ` ${YW}\u26a0${R} Sandbox '${sandboxName}' rebuilt but state restore was incomplete`, + ); + console.log(` Backup available at: ${backupManifest.backupPath}`); + } +} diff --git a/src/lib/sandbox-runtime-actions.ts b/src/lib/sandbox-runtime-actions.ts index 2e659c6c6e..e8eae3fa13 100644 --- a/src/lib/sandbox-runtime-actions.ts +++ b/src/lib/sandbox-runtime-actions.ts @@ -38,7 +38,10 @@ export async function destroySandbox(sandboxName: string, args: string[] = []): } export async function rebuildSandbox(sandboxName: string, args: string[] = []): Promise { - await getNemoClawRuntimeBridge().sandboxRebuild(sandboxName, args); + const { rebuildSandbox: rebuildExtractedSandbox } = require("./sandbox-rebuild-action") as { + rebuildSandbox: (sandboxName: string, args?: string[]) => Promise; + }; + await rebuildExtractedSandbox(sandboxName, args); } export async function installSandboxSkill( diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index 30ec557a11..f9c9cce0e1 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -1,10 +1,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -const { spawn, spawnSync } = require("child_process"); -const path = require("path"); - - // --------------------------------------------------------------------------- // Color / style — respects NO_COLOR and non-TTY environments. // Uses exact NVIDIA green #76B900 on truecolor terminals; 256-color otherwise. @@ -19,54 +15,29 @@ const R = _useColor ? "\x1b[0m" : ""; const _RD = _useColor ? "\x1b[1;31m" : ""; const YW = _useColor ? "\x1b[1;33m" : ""; -const { ROOT, run, runInteractive, validateName } = require("./lib/runner"); +const { ROOT, validateName } = require("./lib/runner"); // --------------------------------------------------------------------------- // Agent branding — derived from NEMOCLAW_AGENT when an alias launcher sets it; // otherwise the branding module falls back to the OpenClaw defaults. // --------------------------------------------------------------------------- -const { CLI_NAME, CLI_DISPLAY_NAME } = require("./lib/branding"); - -const { dockerCapture, dockerInspect } = require("./lib/docker"); -const { resolveOpenshell } = require("./lib/resolve-openshell"); -const { hydrateCredentialEnv, isNonInteractive } = require("./lib/onboard"); +const { CLI_NAME } = require("./lib/branding"); const { prompt: askPrompt } = require("./lib/credentials"); const registry = require("./lib/registry"); -import type { SandboxEntry } from "./lib/registry"; const nim = require("./lib/nim"); const shields = require("./lib/shields"); -const policies = require("./lib/policies"); const { help, version } = require("./lib/root-help-action"); -const onboardSession = require("./lib/onboard-session"); -import type { Session } from "./lib/onboard-session"; const { parseLiveSandboxNames } = require("./lib/runtime-recovery"); -const { - captureOpenshell, - getInstalledOpenshellVersionOrNull, - runOpenshell, -} = require("./lib/openshell-runtime"); +const { captureOpenshell } = require("./lib/openshell-runtime"); const { recoverRegistryEntries } = require("./lib/registry-recovery-action"); -const { ensureLiveSandboxOrExit } = require("./lib/sandbox-gateway-state-action"); +const { rebuildSandbox } = require("./lib/sandbox-rebuild-action"); const { isSandboxConnectFlag, parseSandboxConnectArgs, printSandboxConnectHelp, } = require("./lib/sandbox-connect-action"); -const { executeSandboxCommand } = require("./lib/sandbox-process-recovery-action"); -const { - getSandboxDeleteOutcome, - removeSandboxRegistryEntry, -} = require("./lib/sandbox-destroy-action"); const { runRegisteredOclifCommand } = require("./lib/oclif-runner"); -const agentRuntime = require("../bin/lib/agent-runtime"); const sandboxVersion = require("./lib/sandbox-version"); -const sandboxState = require("./lib/sandbox-state"); -const { parseRestoreArgs } = sandboxState; -const { - getActiveSandboxSessions, - createSystemDeps: createSessionDeps, -} = require("./lib/sandbox-session-state"); - const { canonicalUsageList, globalCommandTokens, @@ -83,39 +54,9 @@ import { const GLOBAL_COMMANDS = globalCommandTokens(); -type SpawnLikeResult = { - status: number | null; - stdout?: string; - stderr?: string; - output?: string; - error?: Error; - signal?: NodeJS.Signals | null; -}; - -type RecoveredSandboxMetadata = Partial< - Pick -> & { - policyPresets?: string[] | null; -}; - exports.runtimeBridge = { - sandboxRebuild, upgradeSandboxes, }; -/** Print user-facing guidance when OpenShell is too old to support `openshell logs`. */ -function printOldLogsCompatibilityGuidance(installedVersion = null) { - const versionText = installedVersion ? ` (${installedVersion})` : ""; - console.error( - ` Installed OpenShell${versionText} is too old or incompatible with \`${CLI_NAME} logs\`.`, - ); - console.error( - ` ${CLI_DISPLAY_NAME} expects \`openshell logs \` and live streaming via \`--tail\`.`, - ); - console.error( - ` Upgrade OpenShell by rerunning \`${CLI_NAME} onboard\`, or reinstall the OpenShell CLI and try again.`, - ); -} - // ── Commands ───────────────────────────────────────────────────── async function runOclif(commandId: string, args: string[] = []): Promise { @@ -130,478 +71,6 @@ function printSandboxActionUsage(action: string): void { console.log(` Usage: ${CLI_NAME} ${action}`); } -// ── Rebuild ────────────────────────────────────────────────────── - -function _rebuildLog(msg: string) { - console.error(` ${D}[rebuild ${new Date().toISOString()}] ${msg}${R}`); -} - -async function sandboxRebuild( - sandboxName: string, - args: string[] = [], - opts: { throwOnError?: boolean } = {}, -): Promise { - const verbose = - args.includes("--verbose") || - args.includes("-v") || - process.env.NEMOCLAW_REBUILD_VERBOSE === "1"; - const log: (msg: string) => void = verbose ? _rebuildLog : () => {}; - const skipConfirm = args.includes("--yes") || args.includes("--force"); - // When called from upgradeSandboxes in a loop, throwOnError prevents - // process.exit from aborting the entire batch on the first failure. - const bail = opts.throwOnError - ? (msg: string, code = 1) => { - throw new Error(msg); - } - : (_msg: string, code = 1) => process.exit(code); - - // Active session detection — enrich the confirmation prompt if sessions are active - let rebuildActiveSessionCount = 0; - const opsBinRebuild = resolveOpenshell(); - if (opsBinRebuild) { - try { - const sessionResult = getActiveSandboxSessions(sandboxName, createSessionDeps(opsBinRebuild)); - if (sessionResult.detected) { - rebuildActiveSessionCount = sessionResult.sessions.length; - } - } catch { - /* non-fatal */ - } - } - - const sb = registry.getSandbox(sandboxName); - if (!sb) { - console.error(` Sandbox '${sandboxName}' not found in registry.`); - bail(`Sandbox '${sandboxName}' not found in registry.`); - return; - } - - // Multi-agent guard (temporary — until swarm lands) - if (sb.agents && sb.agents.length > 1) { - console.error(" Multi-agent sandbox rebuild is not yet supported."); - console.error(` Back up state manually and recreate with \`${CLI_NAME} onboard\`.`); - bail("Multi-agent sandbox rebuild is not yet supported."); - return; - } - - const agent = agentRuntime.getSessionAgent(sandboxName); - const agentName = agentRuntime.getAgentDisplayName(agent); - - // Version check — show what's changing - const versionCheck = sandboxVersion.checkAgentVersion(sandboxName); - console.log(""); - console.log(` ${B}Rebuild sandbox '${sandboxName}'${R}`); - if (versionCheck.sandboxVersion) { - console.log(` Current: ${agentName} v${versionCheck.sandboxVersion}`); - } - if (versionCheck.expectedVersion) { - console.log(` Target: ${agentName} v${versionCheck.expectedVersion}`); - } - console.log(""); - - if (!skipConfirm) { - if (rebuildActiveSessionCount > 0) { - const plural = rebuildActiveSessionCount > 1 ? "sessions" : "session"; - console.log( - ` ${YW}⚠ Active SSH ${plural} detected (${rebuildActiveSessionCount} connection${rebuildActiveSessionCount > 1 ? "s" : ""})${R}`, - ); - console.log( - ` Rebuilding will terminate ${rebuildActiveSessionCount === 1 ? "the" : "all"} active ${plural} with a Broken pipe error.`, - ); - console.log(""); - } - console.log(" This will:"); - console.log(" 1. Back up workspace state"); - console.log(" 2. Destroy and recreate the sandbox with the current image"); - console.log(" 3. Restore workspace state into the new sandbox"); - console.log(""); - const answer = await askPrompt(" Proceed? [y/N]: "); - if (answer.trim().toLowerCase() !== "y" && answer.trim().toLowerCase() !== "yes") { - console.log(" Cancelled."); - return; - } - } - - // Step 0: Preflight — verify recreate preconditions BEFORE destroying - // anything. The most common rebuild failure is a missing provider - // credential when onboard runs in non-interactive mode. Checking now - // lets us abort with the sandbox still intact. See #2273. - const session = onboardSession.loadSession(); - let rebuildCredentialEnv: string | null = null; - if (session && session.sandboxName && session.sandboxName !== sandboxName) { - // Session belongs to a different sandbox — its credentialEnv may be - // wrong (e.g. hermes session while rebuilding openclaw). Skip the - // credential preflight; the agent sync from the registry (#2201) - // and onboard itself will handle provider selection. - log( - `Preflight warning: session belongs to '${session.sandboxName}', not '${sandboxName}' — skipping credential preflight`, - ); - console.log( - ` ${D}Note: onboard session belongs to '${session.sandboxName}', not '${sandboxName}'. ` + - `Skipping credential preflight.${R}`, - ); - } else { - rebuildCredentialEnv = session?.credentialEnv || null; - } - // Legacy migration: pre-fix local-inference sandboxes (GH #2519) recorded - // credentialEnv="OPENAI_API_KEY" in onboard-session.json even though the - // sandbox does not actually need a host OpenAI key (ollama-local uses an - // auth proxy with an internal token; vllm-local accepts a static dummy - // bearer). Treat the legacy value as null so rebuild does not demand a - // credential that was never actually used. - if ( - (session?.provider === "ollama-local" || session?.provider === "vllm-local") && - rebuildCredentialEnv === "OPENAI_API_KEY" - ) { - console.log( - ` ${D}Note: migrating ${session.provider} sandbox off OPENAI_API_KEY (GH #2519). ` + - `Local inference does not require a host API key.${R}`, - ); - log( - `Preflight: legacy ${session.provider} sandbox detected (credentialEnv=OPENAI_API_KEY) — clearing for rebuild`, - ); - rebuildCredentialEnv = null; - } - if (rebuildCredentialEnv) { - // hydrateCredentialEnv migrates any pre-fix legacy credentials.json - // into process.env once, so users upgrading from a release that wrote - // the plaintext file can still rebuild without re-entering keys. - const credentialValue = hydrateCredentialEnv(rebuildCredentialEnv); - log( - `Preflight credential check: ${rebuildCredentialEnv} → ${credentialValue ? "present" : "MISSING"}`, - ); - if (!credentialValue) { - console.error(""); - console.error(` ${_RD}Rebuild preflight failed:${R} provider credential not found.`); - console.error(` The non-interactive recreate step requires ${rebuildCredentialEnv},`); - console.error(" but it is not set in the environment."); - console.error(""); - console.error(" To fix, do one of:"); - console.error(` export ${rebuildCredentialEnv}=`); - console.error(` ${CLI_NAME} onboard # re-enter the key interactively`); - console.error(""); - console.error(" Sandbox is untouched — no data was lost."); - bail(`Missing credential: ${rebuildCredentialEnv}`); - return; - } - } else { - // No credentialEnv in session — local inference (Ollama/vLLM) or - // session was lost. Either way, skip the credential preflight; - // onboard will handle it. - log( - "Preflight credential check: no credentialEnv in session (local inference or missing session)", - ); - } - - // Step 1: Ensure sandbox is live for backup - log("Checking sandbox liveness: openshell sandbox list"); - const isLive = captureOpenshell(["sandbox", "list"], { ignoreError: true }); - log( - `openshell sandbox list exit=${isLive.status}, output=${(isLive.output || "").substring(0, 200)}`, - ); - const liveNames = parseLiveSandboxNames(isLive.output || ""); - log(`Live sandboxes: ${Array.from(liveNames).join(", ") || "(none)"}`); - if (!liveNames.has(sandboxName)) { - console.error(` Sandbox '${sandboxName}' is not running. Cannot back up state.`); - console.error(` Start it first or recreate with \`${CLI_NAME} onboard --recreate-sandbox\`.`); - bail(`Sandbox '${sandboxName}' is not running.`); - return; - } - - // Step 2: Backup - console.log(" Backing up sandbox state..."); - log(`Agent type: ${sb.agent || "openclaw"}, stateDirs from manifest`); - const backup = sandboxState.backupSandboxState(sandboxName); - log( - `Backup result: success=${backup.success}, backed=${backup.backedUpDirs.join(",")}, failed=${backup.failedDirs.join(",")}`, - ); - if (!backup.success) { - console.error(" Failed to back up sandbox state."); - if (backup.backedUpDirs.length > 0) { - console.error(` Partial backup: ${backup.backedUpDirs.join(", ")}`); - } - if (backup.failedDirs.length > 0) { - console.error(` Failed: ${backup.failedDirs.join(", ")}`); - } - console.error(" Aborting rebuild to prevent data loss."); - bail("Failed to back up sandbox state."); - return; - } - console.log(` ${G}\u2713${R} State backed up (${backup.backedUpDirs.length} directories)`); - console.log(` Backup: ${backup.manifest.backupPath}`); - - // Step 3: Delete sandbox without tearing down gateway or session. - // sandboxDestroy() cleans up the gateway when it's the last sandbox and - // nulls session.sandboxName — both break the immediate onboard --resume. - console.log(" Deleting old sandbox..."); - const sbMeta = registry.getSandbox(sandboxName); - log( - `Registry entry: agent=${sbMeta?.agent}, agentVersion=${sbMeta?.agentVersion}, nimContainer=${sbMeta?.nimContainer}`, - ); - if (sbMeta && sbMeta.nimContainer) { - log(`Stopping NIM container: ${sbMeta.nimContainer}`); - nim.stopNimContainerByName(sbMeta.nimContainer); - } else { - // Best-effort cleanup — see comment in sandboxDestroy. - nim.stopNimContainer(sandboxName, { silent: true }); - } - - log(`Running: openshell sandbox delete ${sandboxName}`); - const deleteResult = runOpenshell(["sandbox", "delete", sandboxName], { - ignoreError: true, - stdio: ["ignore", "pipe", "pipe"], - }); - const { alreadyGone } = getSandboxDeleteOutcome(deleteResult); - log(`Delete result: exit=${deleteResult.status}, alreadyGone=${alreadyGone}`); - if (deleteResult.status !== 0 && !alreadyGone) { - console.error(" Failed to delete sandbox. Aborting rebuild."); - console.error(" State backup is preserved at: " + backup.manifest.backupPath); - bail("Failed to delete sandbox.", deleteResult.status || 1); - return; - } - removeSandboxRegistryEntry(sandboxName); - log( - `Registry after remove: ${JSON.stringify(registry.listSandboxes().sandboxes.map((s: { name: string }) => s.name))}`, - ); - console.log(` ${G}\u2713${R} Old sandbox deleted`); - - // Step 4: Recreate via onboard --resume - console.log(""); - console.log(" Creating new sandbox with current image..."); - - // Force the sandbox name so onboard recreates with the same name. - // Mark session resumable and point at this sandbox; set env var as fallback. - const sessionBefore = onboardSession.loadSession(); - const sessionMatchesSandbox = sessionBefore?.sandboxName === sandboxName; - log( - `Session before update: sandboxName=${sessionBefore?.sandboxName}, status=${sessionBefore?.status}, resumable=${sessionBefore?.resumable}, provider=${sessionBefore?.provider}, model=${sessionBefore?.model}, sessionMatch=${sessionMatchesSandbox}`, - ); - - // Sync the session's agent field with the registry so onboard --resume - // rebuilds the correct sandbox type. Without this, a stale session.agent - // from a previous onboard of a *different* agent type would be picked up - // by resolveAgentName() and the wrong Dockerfile would be used. (#2201) - const rebuildAgent = sb.agent || null; - onboardSession.updateSession((s: Session) => { - s.sandboxName = sandboxName; - s.resumable = true; - s.status = "in_progress"; - s.agent = rebuildAgent; - // Persist inference selection from the about-to-be-removed registry entry - // so onboard --resume can recreate with the same provider/model in - // non-interactive mode. Without this the registry is gone by the time - // setupNim runs, leaving no recovery source. Assign explicitly (with a - // null fallback) so a missing registry value doesn't silently leave a - // stale session entry from an earlier sandbox in place. - s.provider = sb.provider ?? null; - s.model = sb.model ?? null; - s.nimContainer = sb.nimContainer ?? null; - return s; - }); - process.env.NEMOCLAW_SANDBOX_NAME = sandboxName; - - const sessionAfter = onboardSession.loadSession(); - log( - `Session after update: sandboxName=${sessionAfter?.sandboxName}, status=${sessionAfter?.status}, resumable=${sessionAfter?.resumable}, provider=${sessionAfter?.provider}, model=${sessionAfter?.model}`, - ); - log( - `Env: NEMOCLAW_SANDBOX_NAME=${process.env.NEMOCLAW_SANDBOX_NAME}, NEMOCLAW_RECREATE_SANDBOX=${process.env.NEMOCLAW_RECREATE_SANDBOX}`, - ); - - // Forward the stored --from Dockerfile path so onboard --resume uses the - // same custom image. Without this, the conflict check rejects the resume - // because requestedFrom (null) !== recordedFrom (the stored path). (#2301) - // Only read from the session when it belongs to this sandbox to avoid - // using config from a different sandbox's onboard run. - const storedFromDockerfile = sessionMatchesSandbox - ? sessionAfter?.metadata?.fromDockerfile || null - : null; - log( - `Calling onboard({ resume: true, nonInteractive: true, recreateSandbox: true, fromDockerfile: ${storedFromDockerfile} })`, - ); - - // Intercept process.exit during onboard so we can attempt rollback - // instead of dying with the sandbox destroyed. onboard() has ~87 - // process.exit() calls that would otherwise kill the process with no - // chance to recover. See #2273. - // - // NOTE: Throwing from the overridden process.exit unwinds onboard's - // call stack, which skips process.once("exit") listeners (lock - // release, build context cleanup, session failure marking). We - // manually release the lock and mark the session failed in the - // onboardFailed block below. - const { onboard } = require("./lib/onboard"); - let onboardFailed = false; - let onboardExitCode = 1; - const _savedExit = process.exit; - process.exit = ((code) => { - onboardFailed = true; - onboardExitCode = typeof code === "number" ? code : 1; - // Throw a sentinel to unwind the onboard call stack. - // The catch block below handles it. - const err = new Error(`onboard exited with code ${onboardExitCode}`); - err.name = "RebuildOnboardExit"; - throw err; - }) as typeof process.exit; - - try { - await onboard({ - resume: true, - nonInteractive: true, - recreateSandbox: true, - agent: rebuildAgent, - fromDockerfile: storedFromDockerfile, - }); - log("onboard() returned successfully"); - } catch (err) { - onboardFailed = true; - const message = err instanceof Error ? err.message : String(err); - const name = err instanceof Error ? err.name : ""; - if (name !== "RebuildOnboardExit") { - log(`onboard() threw: ${message}`); - } - } finally { - process.exit = _savedExit; - } - - if (onboardFailed) { - // Clean up onboard's internal state that normally runs in - // process.once("exit") listeners — those never fire because we - // threw from the overridden process.exit instead of actually - // exiting. Without this the onboard lock file stays on disk and - // blocks the next onboard/rebuild invocation. - try { - onboardSession.releaseOnboardLock(); - } catch { - /* best effort */ - } - try { - const failedStep = onboardSession.loadSession()?.lastStepStarted; - if (failedStep) { - onboardSession.markStepFailed(failedStep, "Rebuild recreate failed"); - } - } catch { - /* best effort */ - } - - console.error(""); - console.error(` ${_RD}Recreate failed after sandbox was destroyed.${R}`); - console.error(` Backup is preserved at: ${backup.manifest.backupPath}`); - console.error(""); - console.error(" To recover manually:"); - console.error(` 1. Fix the issue above (missing credential, Docker problem, etc.)`); - console.error(` 2. Run: ${CLI_NAME} onboard --resume`); - console.error(` This will recreate sandbox '${sandboxName}'.`); - console.error(` 3. Then restore your workspace state:`); - console.error( - ` ${CLI_NAME} ${sandboxName} snapshot restore "${backup.manifest.timestamp}"`, - ); - console.error(""); - bail( - `Recreate failed (sandbox destroyed). Backup: ${backup.manifest.backupPath}`, - onboardExitCode, - ); - return; - } - - // Step 5: Restore - console.log(""); - console.log(" Restoring workspace state..."); - log(`Restoring from: ${backup.manifest.backupPath} into sandbox: ${sandboxName}`); - const restore = sandboxState.restoreSandboxState(sandboxName, backup.manifest.backupPath); - log( - `Restore result: success=${restore.success}, restored=${restore.restoredDirs.join(",")}, failed=${restore.failedDirs.join(",")}`, - ); - if (!restore.success) { - console.error(` Partial restore: ${restore.restoredDirs.join(", ") || "none"}`); - console.error(` Failed: ${restore.failedDirs.join(", ")}`); - console.error(` Manual restore available from: ${backup.manifest.backupPath}`); - } else { - console.log(` ${G}\u2713${R} State restored (${restore.restoredDirs.length} directories)`); - } - - // Step 5.5: Restore policy presets (#1952) - // Policy presets live in the gateway policy engine, not the sandbox filesystem. - // They are lost when the sandbox is destroyed and recreated. Re-apply any - // presets that were captured in the backup manifest. - const savedPresets = backup.manifest.policyPresets || []; - if (savedPresets.length > 0) { - console.log(""); - console.log(" Restoring policy presets..."); - log(`Policy presets to restore: [${savedPresets.join(",")}]`); - const restoredPresets: string[] = []; - const failedPresets: string[] = []; - for (const presetName of savedPresets) { - try { - log(`Applying preset: ${presetName}`); - const applied = policies.applyPreset(sandboxName, presetName); - if (applied) { - restoredPresets.push(presetName); - } else { - failedPresets.push(presetName); - } - } catch (err) { - const errorMessage = err instanceof Error ? err.message : String(err); - log(`Failed to apply preset '${presetName}': ${errorMessage}`); - failedPresets.push(presetName); - } - } - if (restoredPresets.length > 0) { - console.log(` ${G}\u2713${R} Policy presets restored: ${restoredPresets.join(", ")}`); - } - if (failedPresets.length > 0) { - console.error(` ${YW}\u26a0${R} Failed to restore presets: ${failedPresets.join(", ")}`); - console.error(` Re-apply manually with: ${CLI_NAME} ${sandboxName} policy-add`); - } - } - - // Step 6: Post-restore agent-specific migration - const agentDef = agent - ? require("./lib/agent-defs").loadAgent(agent.name) - : require("./lib/agent-defs").loadAgent("openclaw"); - if (agentDef.name === "openclaw") { - // openclaw doctor --fix validates and repairs directory structure. - // Idempotent and safe — catches structural changes between OpenClaw versions - // (new symlinks, new data dirs, etc.) that the restored state may be missing. - log("Running openclaw doctor --fix inside sandbox for post-upgrade structure repair"); - const doctorResult = executeSandboxCommand(sandboxName, "openclaw doctor --fix"); - log( - `doctor --fix: exit=${doctorResult?.status}, stdout=${(doctorResult?.stdout || "").substring(0, 200)}`, - ); - if (doctorResult && doctorResult.status === 0) { - console.log(` ${G}\u2713${R} Post-upgrade structure check passed`); - } else { - console.log( - ` ${D}Post-upgrade structure check skipped (doctor returned ${doctorResult?.status ?? "null"})${R}`, - ); - } - } - // Hermes: no explicit post-restore step needed. Hermes's SessionDB._init_schema() - // auto-migrates state.db (SQLite) on first connection via sequential ALTER TABLE - // migrations (idempotent, schema_version tracked). ensure_hermes_home() repairs - // missing directories implicitly. The NemoClaw plugin's skill cache refreshes on - // on_session_start. Gateway startup is non-fatal if state.db migration fails. - - // Step 7: Update registry with new version - registry.updateSandbox(sandboxName, { - agentVersion: agentDef.expectedVersion || null, - }); - log(`Registry updated: agentVersion=${agentDef.expectedVersion}`); - - console.log(""); - if (restore.success) { - console.log(` ${G}\u2713${R} Sandbox '${sandboxName}' rebuilt successfully`); - if (versionCheck.expectedVersion) { - console.log(` Now running: ${agentName} v${versionCheck.expectedVersion}`); - } - } else { - console.log( - ` ${YW}\u26a0${R} Sandbox '${sandboxName}' rebuilt but state restore was incomplete`, - ); - console.log(` Backup available at: ${backup.manifest.backupPath}`); - } -} - // ── Upgrade sandboxes (#1904) ──────────────────────────────────── // Detect sandboxes running stale agent versions and offer to rebuild them. @@ -699,7 +168,7 @@ async function upgradeSandboxes(args: string[] = []): Promise { } } try { - await sandboxRebuild(s.name, ["--yes"], { throwOnError: true }); + await rebuildSandbox(s.name, ["--yes"], { throwOnError: true }); rebuilt++; } catch (err) { const errorMessage = err instanceof Error ? err.message : String(err); From 8bf19581adc520485bca570cc8147b84e4a28bc4 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 20:28:47 -0700 Subject: [PATCH 08/65] refactor(cli): extract upgrade sandboxes action --- src/lib/global-cli-actions.ts | 5 +- src/lib/nemoclaw-runtime-bridge.ts | 4 +- src/lib/sandbox-runtime-actions.ts | 1 - src/lib/upgrade-sandboxes-action.ts | 125 ++++++++++++++++++++++++++++ src/nemoclaw.ts | 120 -------------------------- 5 files changed, 130 insertions(+), 125 deletions(-) create mode 100644 src/lib/upgrade-sandboxes-action.ts diff --git a/src/lib/global-cli-actions.ts b/src/lib/global-cli-actions.ts index 49334c3adb..8b09589634 100644 --- a/src/lib/global-cli-actions.ts +++ b/src/lib/global-cli-actions.ts @@ -39,7 +39,10 @@ export function runBackupAllAction(): void { } export async function runUpgradeSandboxesAction(args: string[] = []): Promise { - await getNemoClawRuntimeBridge().upgradeSandboxes(args); + const { upgradeSandboxes } = require("./upgrade-sandboxes-action") as { + upgradeSandboxes: (args?: string[]) => Promise; + }; + await upgradeSandboxes(args); } export async function runGarbageCollectImagesAction(args: string[] = []): Promise { diff --git a/src/lib/nemoclaw-runtime-bridge.ts b/src/lib/nemoclaw-runtime-bridge.ts index 5811dfee5a..a762ea197c 100644 --- a/src/lib/nemoclaw-runtime-bridge.ts +++ b/src/lib/nemoclaw-runtime-bridge.ts @@ -3,9 +3,7 @@ /* v8 ignore start -- transitional bridge until command actions are extracted from src/nemoclaw.ts. */ -export interface NemoClawRuntimeBridge { - upgradeSandboxes: (args?: string[]) => Promise; -} +export interface NemoClawRuntimeBridge {} let runtimeFactory = (): NemoClawRuntimeBridge => { const runtimeModule = require("../nemoclaw") as { diff --git a/src/lib/sandbox-runtime-actions.ts b/src/lib/sandbox-runtime-actions.ts index e8eae3fa13..7097f33ac4 100644 --- a/src/lib/sandbox-runtime-actions.ts +++ b/src/lib/sandbox-runtime-actions.ts @@ -4,7 +4,6 @@ /* v8 ignore start -- transitional action facade until implementations leave src/nemoclaw.ts. */ import type { SandboxConnectOptions } from "./sandbox-connect-action"; -import { getNemoClawRuntimeBridge } from "./nemoclaw-runtime-bridge"; export async function connectSandbox( sandboxName: string, diff --git a/src/lib/upgrade-sandboxes-action.ts b/src/lib/upgrade-sandboxes-action.ts new file mode 100644 index 0000000000..281487bd94 --- /dev/null +++ b/src/lib/upgrade-sandboxes-action.ts @@ -0,0 +1,125 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/* v8 ignore start -- exercised through CLI subprocess upgrade tests. */ + +import { CLI_NAME } from "./branding"; +import { prompt as askPrompt } from "./credentials"; +import { captureOpenshell } from "./openshell-runtime"; +import * as registry from "./registry"; +import { parseLiveSandboxNames } from "./runtime-recovery"; +import { rebuildSandbox } from "./sandbox-rebuild-action"; +import * as sandboxVersion from "./sandbox-version"; +import { B, D, G, R, YW } from "./terminal-style"; + +// ── Upgrade sandboxes (#1904) ──────────────────────────────────── +// Detect sandboxes running stale agent versions and offer to rebuild them. + +export async function upgradeSandboxes(args: string[] = []): Promise { + const checkOnly = args.includes("--check"); + const auto = args.includes("--auto"); + const skipConfirm = auto || args.includes("--yes"); + + const sandboxes = registry.listSandboxes().sandboxes; + if (sandboxes.length === 0) { + console.log(" No sandboxes found in the registry."); + return; + } + + // Query live sandboxes so we can tell the user which are running + const liveResult = captureOpenshell(["sandbox", "list"], { ignoreError: true }); + if (liveResult.status !== 0) { + console.error(" Failed to query running sandboxes from OpenShell."); + console.error(" Ensure OpenShell is running: openshell status"); + process.exit(liveResult.status || 1); + } + const liveNames = parseLiveSandboxNames(liveResult.output || ""); + + // Classify sandboxes as stale, unknown, or current + const stale = []; + const unknown = []; + for (const sb of sandboxes) { + const versionCheck = sandboxVersion.checkAgentVersion(sb.name); + if (versionCheck.isStale) { + stale.push({ + name: sb.name, + current: versionCheck.sandboxVersion, + expected: versionCheck.expectedVersion, + running: liveNames.has(sb.name), + }); + } else if (versionCheck.detectionMethod === "unavailable") { + unknown.push({ + name: sb.name, + expected: versionCheck.expectedVersion, + running: liveNames.has(sb.name), + }); + } + } + + if (stale.length === 0 && unknown.length === 0) { + console.log(" All sandboxes are up to date."); + return; + } + + if (stale.length > 0) { + console.log(`\n ${B}Stale sandboxes:${R}`); + for (const s of stale) { + const status = s.running ? `${G}running${R}` : `${D}stopped${R}`; + console.log(` ${s.name} v${s.current || "?"} → v${s.expected} (${status})`); + } + } + if (unknown.length > 0) { + console.log(`\n ${YW}Unknown version:${R}`); + for (const s of unknown) { + const status = s.running ? `${G}running${R}` : `${D}stopped${R}`; + console.log(` ${s.name} v? → v${s.expected} (${status})`); + } + } + console.log(""); + + if (checkOnly) { + if (stale.length > 0) console.log(` ${stale.length} sandbox(es) need upgrading.`); + if (unknown.length > 0) { + console.log( + ` ${unknown.length} sandbox(es) could not be version-checked; start them and rerun, or rebuild manually.`, + ); + } + console.log(` Run \`${CLI_NAME} upgrade-sandboxes\` to rebuild them.`); + return; + } + + const rebuildable = stale.filter((s: { running: boolean }) => s.running); + const stopped = stale.filter((s: { running: boolean }) => !s.running); + if (stopped.length > 0) { + console.log(` ${D}Skipping ${stopped.length} stopped sandbox(es) — start them first.${R}`); + } + if (rebuildable.length === 0) { + console.log(" No running stale sandboxes to rebuild."); + return; + } + + let rebuilt = 0; + let failed = 0; + for (const s of rebuildable) { + if (!skipConfirm) { + const answer = await askPrompt(` Rebuild '${s.name}'? [y/N]: `); + if (answer.trim().toLowerCase() !== "y" && answer.trim().toLowerCase() !== "yes") { + console.log(` Skipped '${s.name}'.`); + continue; + } + } + try { + await rebuildSandbox(s.name, ["--yes"], { throwOnError: true }); + rebuilt++; + } catch (err) { + const errorMessage = err instanceof Error ? err.message : String(err); + console.error(` ${YW}⚠${R} Failed to rebuild '${s.name}': ${errorMessage}`); + failed++; + } + } + + console.log(""); + if (rebuilt > 0) console.log(` ${G}✓${R} ${rebuilt} sandbox(es) rebuilt.`); + if (failed > 0) console.log(` ${YW}⚠${R} ${failed} sandbox(es) failed — see errors above.`); + if (failed > 0) process.exit(1); +} diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index f9c9cce0e1..9e0e252764 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -22,22 +22,17 @@ const { ROOT, validateName } = require("./lib/runner"); // otherwise the branding module falls back to the OpenClaw defaults. // --------------------------------------------------------------------------- const { CLI_NAME } = require("./lib/branding"); -const { prompt: askPrompt } = require("./lib/credentials"); const registry = require("./lib/registry"); const nim = require("./lib/nim"); const shields = require("./lib/shields"); const { help, version } = require("./lib/root-help-action"); -const { parseLiveSandboxNames } = require("./lib/runtime-recovery"); -const { captureOpenshell } = require("./lib/openshell-runtime"); const { recoverRegistryEntries } = require("./lib/registry-recovery-action"); -const { rebuildSandbox } = require("./lib/sandbox-rebuild-action"); const { isSandboxConnectFlag, parseSandboxConnectArgs, printSandboxConnectHelp, } = require("./lib/sandbox-connect-action"); const { runRegisteredOclifCommand } = require("./lib/oclif-runner"); -const sandboxVersion = require("./lib/sandbox-version"); const { canonicalUsageList, globalCommandTokens, @@ -54,9 +49,6 @@ import { const GLOBAL_COMMANDS = globalCommandTokens(); -exports.runtimeBridge = { - upgradeSandboxes, -}; // ── Commands ───────────────────────────────────────────────────── async function runOclif(commandId: string, args: string[] = []): Promise { @@ -71,118 +63,6 @@ function printSandboxActionUsage(action: string): void { console.log(` Usage: ${CLI_NAME} ${action}`); } -// ── Upgrade sandboxes (#1904) ──────────────────────────────────── -// Detect sandboxes running stale agent versions and offer to rebuild them. - -async function upgradeSandboxes(args: string[] = []): Promise { - const checkOnly = args.includes("--check"); - const auto = args.includes("--auto"); - const skipConfirm = auto || args.includes("--yes"); - - const sandboxes = registry.listSandboxes().sandboxes; - if (sandboxes.length === 0) { - console.log(" No sandboxes found in the registry."); - return; - } - - // Query live sandboxes so we can tell the user which are running - const liveResult = captureOpenshell(["sandbox", "list"], { ignoreError: true }); - if (liveResult.status !== 0) { - console.error(" Failed to query running sandboxes from OpenShell."); - console.error(" Ensure OpenShell is running: openshell status"); - process.exit(liveResult.status || 1); - } - const liveNames = parseLiveSandboxNames(liveResult.output || ""); - - // Classify sandboxes as stale, unknown, or current - const stale = []; - const unknown = []; - for (const sb of sandboxes) { - const versionCheck = sandboxVersion.checkAgentVersion(sb.name); - if (versionCheck.isStale) { - stale.push({ - name: sb.name, - current: versionCheck.sandboxVersion, - expected: versionCheck.expectedVersion, - running: liveNames.has(sb.name), - }); - } else if (versionCheck.detectionMethod === "unavailable") { - unknown.push({ - name: sb.name, - expected: versionCheck.expectedVersion, - running: liveNames.has(sb.name), - }); - } - } - - if (stale.length === 0 && unknown.length === 0) { - console.log(" All sandboxes are up to date."); - return; - } - - if (stale.length > 0) { - console.log(`\n ${B}Stale sandboxes:${R}`); - for (const s of stale) { - const status = s.running ? `${G}running${R}` : `${D}stopped${R}`; - console.log(` ${s.name} v${s.current || "?"} → v${s.expected} (${status})`); - } - } - if (unknown.length > 0) { - console.log(`\n ${YW}Unknown version:${R}`); - for (const s of unknown) { - const status = s.running ? `${G}running${R}` : `${D}stopped${R}`; - console.log(` ${s.name} v? → v${s.expected} (${status})`); - } - } - console.log(""); - - if (checkOnly) { - if (stale.length > 0) console.log(` ${stale.length} sandbox(es) need upgrading.`); - if (unknown.length > 0) { - console.log( - ` ${unknown.length} sandbox(es) could not be version-checked; start them and rerun, or rebuild manually.`, - ); - } - console.log(` Run \`${CLI_NAME} upgrade-sandboxes\` to rebuild them.`); - return; - } - - const rebuildable = stale.filter((s: { running: boolean }) => s.running); - const stopped = stale.filter((s: { running: boolean }) => !s.running); - if (stopped.length > 0) { - console.log(` ${D}Skipping ${stopped.length} stopped sandbox(es) — start them first.${R}`); - } - if (rebuildable.length === 0) { - console.log(" No running stale sandboxes to rebuild."); - return; - } - - let rebuilt = 0; - let failed = 0; - for (const s of rebuildable) { - if (!skipConfirm) { - const answer = await askPrompt(` Rebuild '${s.name}'? [y/N]: `); - if (answer.trim().toLowerCase() !== "y" && answer.trim().toLowerCase() !== "yes") { - console.log(` Skipped '${s.name}'.`); - continue; - } - } - try { - await rebuildSandbox(s.name, ["--yes"], { throwOnError: true }); - rebuilt++; - } catch (err) { - const errorMessage = err instanceof Error ? err.message : String(err); - console.error(` ${YW}\u26a0${R} Failed to rebuild '${s.name}': ${errorMessage}`); - failed++; - } - } - - console.log(""); - if (rebuilt > 0) console.log(` ${G}\u2713${R} ${rebuilt} sandbox(es) rebuilt.`); - if (failed > 0) console.log(` ${YW}\u26a0${R} ${failed} sandbox(es) failed — see errors above.`); - if (failed > 0) process.exit(1); -} - // ── Pre-upgrade backup ─────────────────────────────────────────── // ── Snapshot ───────────────────────────────────────────────────── From 38eb84d7389ef86f492acda1cbd2c024a8966ca6 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 20:36:36 -0700 Subject: [PATCH 09/65] refactor(cli): remove runtime bridge --- src/lib/global-cli-actions.ts | 32 +++++++++++++++++----------- src/lib/nemoclaw-runtime-bridge.ts | 23 -------------------- test/credentials-cli-command.test.ts | 17 ++++++--------- 3 files changed, 26 insertions(+), 46 deletions(-) delete mode 100644 src/lib/nemoclaw-runtime-bridge.ts diff --git a/src/lib/global-cli-actions.ts b/src/lib/global-cli-actions.ts index 8b09589634..e7f0c9e949 100644 --- a/src/lib/global-cli-actions.ts +++ b/src/lib/global-cli-actions.ts @@ -14,10 +14,24 @@ import { runSetupSparkAction as executeSetupSparkAction, } from "./onboard-action"; import { recoverNamedGatewayRuntime as recoverNamedGatewayRuntimeAction } from "./gateway-runtime-action"; -import { getNemoClawRuntimeBridge } from "./nemoclaw-runtime-bridge"; import { runOpenshell } from "./openshell-runtime"; import { help, version } from "./root-help-action"; +type GatewayRecovery = { recovered: boolean }; + +type GlobalCliActionRuntimeHooks = { + recoverNamedGatewayRuntime?: () => Promise; + runOpenshell?: typeof runOpenshell; +}; + +let runtimeHooks: GlobalCliActionRuntimeHooks = {}; + +export function setGlobalCliActionRuntimeHooksForTest( + hooks: GlobalCliActionRuntimeHooks, +): void { + runtimeHooks = hooks; +} + export async function runOnboardAction(args: string[] = []): Promise { await executeOnboardAction(args); } @@ -57,12 +71,9 @@ export function showVersion(): void { version(); } -export async function recoverNamedGatewayRuntime(): Promise<{ recovered: boolean }> { - const runtime = getNemoClawRuntimeBridge() as { - recoverNamedGatewayRuntime?: () => Promise<{ recovered: boolean }>; - }; - if (typeof runtime.recoverNamedGatewayRuntime === "function") { - return runtime.recoverNamedGatewayRuntime(); +export async function recoverNamedGatewayRuntime(): Promise { + if (typeof runtimeHooks.recoverNamedGatewayRuntime === "function") { + return runtimeHooks.recoverNamedGatewayRuntime(); } return recoverNamedGatewayRuntimeAction(); } @@ -76,11 +87,8 @@ export function runOpenshellProviderCommand( timeout?: number; }, ) { - const runtime = getNemoClawRuntimeBridge() as { - runOpenshell?: typeof runOpenshell; - }; - if (typeof runtime.runOpenshell === "function") { - return runtime.runOpenshell(args, opts); + if (typeof runtimeHooks.runOpenshell === "function") { + return runtimeHooks.runOpenshell(args, opts); } return runOpenshell(args, opts); } diff --git a/src/lib/nemoclaw-runtime-bridge.ts b/src/lib/nemoclaw-runtime-bridge.ts deleted file mode 100644 index a762ea197c..0000000000 --- a/src/lib/nemoclaw-runtime-bridge.ts +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/* v8 ignore start -- transitional bridge until command actions are extracted from src/nemoclaw.ts. */ - -export interface NemoClawRuntimeBridge {} - -let runtimeFactory = (): NemoClawRuntimeBridge => { - const runtimeModule = require("../nemoclaw") as { - runtimeBridge?: NemoClawRuntimeBridge; - } & NemoClawRuntimeBridge; - return runtimeModule.runtimeBridge ?? runtimeModule; -}; - -export function setNemoClawRuntimeBridgeFactoryForTest( - factory: () => NemoClawRuntimeBridge, -): void { - runtimeFactory = factory; -} - -export function getNemoClawRuntimeBridge(): NemoClawRuntimeBridge { - return runtimeFactory(); -} diff --git a/test/credentials-cli-command.test.ts b/test/credentials-cli-command.test.ts index eea96ad73b..05d7069b1e 100644 --- a/test/credentials-cli-command.test.ts +++ b/test/credentials-cli-command.test.ts @@ -7,10 +7,8 @@ import { afterEach, describe, expect, it } from "vitest"; const require = createRequire(import.meta.url); const REPO_ROOT = path.join(import.meta.dirname, ".."); -const RUNTIME_PATH = require.resolve(path.join(REPO_ROOT, "dist", "nemoclaw.js")); const COMMANDS_PATH = path.join(REPO_ROOT, "dist", "lib", "credentials-cli-command.js"); - -type RequireCacheEntry = NonNullable<(typeof require.cache)[string]>; +const GLOBAL_ACTIONS_PATH = path.join(REPO_ROOT, "dist", "lib", "global-cli-actions.js"); type CredentialsCommandModule = typeof import("../dist/lib/credentials-cli-command.js"); type SpawnLikeResult = { status: number | null; stdout?: string; stderr?: string }; type RuntimeRecovery = { @@ -53,13 +51,10 @@ function installRuntimeBridge(bridge: Partial = {}): OpenshellCal }, ...bridge, }; - const cacheEntry = { - id: RUNTIME_PATH, - filename: RUNTIME_PATH, - loaded: true, - exports: runtime, - } as RequireCacheEntry; - require.cache[RUNTIME_PATH] = cacheEntry; + const globalActions = require(GLOBAL_ACTIONS_PATH) as { + setGlobalCliActionRuntimeHooksForTest: (hooks: RuntimeBridge) => void; + }; + globalActions.setGlobalCliActionRuntimeHooksForTest(runtime); return calls; } @@ -129,7 +124,7 @@ async function expectProcessExit( afterEach(() => { delete require.cache[COMMANDS_PATH]; - delete require.cache[RUNTIME_PATH]; + delete require.cache[GLOBAL_ACTIONS_PATH]; }); describe("credentials oclif commands", () => { From b2ad5dad14fc505a96978f6b37faed1ea7516db2 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 20:47:40 -0700 Subject: [PATCH 10/65] refactor(cli): remove legacy dispatch fallbacks --- src/lib/legacy-oclif-dispatch.test.ts | 29 ++++++++++++++++++++++++ src/lib/legacy-oclif-dispatch.ts | 16 +++++--------- src/lib/oclif-commands.ts | 12 ++++++++-- src/lib/policy-mutate-cli-commands.ts | 11 +++++++++ src/lib/skill-install-cli-command.ts | 13 +++++++++++ src/lib/snapshot-cli-commands.ts | 13 +++++++++++ src/nemoclaw.ts | 32 --------------------------- 7 files changed, 81 insertions(+), 45 deletions(-) diff --git a/src/lib/legacy-oclif-dispatch.test.ts b/src/lib/legacy-oclif-dispatch.test.ts index 49321743ff..84a9040657 100644 --- a/src/lib/legacy-oclif-dispatch.test.ts +++ b/src/lib/legacy-oclif-dispatch.test.ts @@ -28,4 +28,33 @@ describe("resolveSandboxOclifDispatch", () => { args: ["alpha", "--json"], }); }); + + it("routes policy-add missing-value errors through a raw oclif adapter", () => { + expect(resolveSandboxOclifDispatch("alpha", "policy-add", ["--from-file"])).toEqual({ + kind: "oclif", + commandId: "sandbox:policy-add:raw", + args: ["alpha", "--from-file"], + }); + }); + + it("routes skill help and unknown subcommands through oclif", () => { + expect(resolveSandboxOclifDispatch("alpha", "skill", ["--help"])).toEqual({ + kind: "oclif", + commandId: "sandbox:skill", + args: ["alpha", "--help"], + }); + expect(resolveSandboxOclifDispatch("alpha", "skill", ["bogus"])).toEqual({ + kind: "oclif", + commandId: "sandbox:skill", + args: ["alpha", "bogus"], + }); + }); + + it("routes snapshot unknown subcommands through oclif", () => { + expect(resolveSandboxOclifDispatch("alpha", "snapshot", ["bogus"])).toEqual({ + kind: "oclif", + commandId: "sandbox:snapshot", + args: ["alpha", "bogus"], + }); + }); }); diff --git a/src/lib/legacy-oclif-dispatch.ts b/src/lib/legacy-oclif-dispatch.ts index bfd3c60d77..d25d5decc2 100644 --- a/src/lib/legacy-oclif-dispatch.ts +++ b/src/lib/legacy-oclif-dispatch.ts @@ -19,11 +19,6 @@ export type UsageErrorDispatch = { lines: string[]; }; -export type LegacyDispatch = { - kind: "legacy"; - target: "policy-add" | "skill" | "snapshot"; -}; - export type UnknownSubcommandDispatch = { kind: "unknownSubcommand"; command: "credentials" | "channels"; @@ -39,7 +34,6 @@ export type DispatchResult = | OclifDispatch | HelpDispatch | UsageErrorDispatch - | LegacyDispatch | UnknownSubcommandDispatch | UnknownActionDispatch; @@ -121,7 +115,7 @@ export function resolveSandboxOclifDispatch( }; } if (hasMissingFlagValue(actionArgs, "--from-file") || hasMissingFlagValue(actionArgs, "--from-dir")) { - return { kind: "legacy", target: "policy-add" }; + return { kind: "oclif", commandId: "sandbox:policy-add:raw", args: [sandboxName, ...actionArgs] }; } return { kind: "oclif", commandId: "sandbox:policy-add", args: [sandboxName, ...actionArgs] }; case "policy-remove": @@ -140,13 +134,13 @@ export function resolveSandboxOclifDispatch( const skillSub = actionArgs[0]; const skillArgs = actionArgs.slice(1); if (!skillSub || skillSub === "help" || skillSub === "--help" || skillSub === "-h") { - return { kind: "legacy", target: "skill" }; + return { kind: "oclif", commandId: "sandbox:skill", args: [sandboxName, ...actionArgs] }; } if (skillSub === "install") { - if (hasHelpFlag(skillArgs)) return { kind: "legacy", target: "skill" }; + if (hasHelpFlag(skillArgs)) return { kind: "oclif", commandId: "sandbox:skill", args: [sandboxName, ...actionArgs] }; return { kind: "oclif", commandId: "sandbox:skill:install", args: [sandboxName, ...skillArgs] }; } - return { kind: "legacy", target: "skill" }; + return { kind: "oclif", commandId: "sandbox:skill", args: [sandboxName, ...actionArgs] }; } case "rebuild": if (hasHelpFlag(actionArgs)) return { kind: "help", usage: "rebuild [--yes|--force] [--verbose|-v]" }; @@ -168,7 +162,7 @@ export function resolveSandboxOclifDispatch( if (hasHelpFlag(snapshotArgs)) return { kind: "help", usage: "snapshot restore [selector] [--to ]" }; return { kind: "oclif", commandId: "sandbox:snapshot:restore", args: [sandboxName, ...snapshotArgs] }; } - return { kind: "legacy", target: "snapshot" }; + return { kind: "oclif", commandId: "sandbox:snapshot", args: [sandboxName, ...actionArgs] }; } case "shields": { const shieldsSub = actionArgs[0]; diff --git a/src/lib/oclif-commands.ts b/src/lib/oclif-commands.ts index 37c60ab4a7..b56342b5bc 100644 --- a/src/lib/oclif-commands.ts +++ b/src/lib/oclif-commands.ts @@ -29,7 +29,11 @@ import { GarbageCollectImagesCommand, UpgradeSandboxesCommand, } from "./maintenance-cli-commands"; -import { PolicyAddCommand, PolicyRemoveCommand } from "./policy-mutate-cli-commands"; +import { + PolicyAddCommand, + PolicyAddRawCommand, + PolicyRemoveCommand, +} from "./policy-mutate-cli-commands"; import RebuildCliCommand from "./rebuild-cli-command"; import SandboxDoctorCliCommand from "./sandbox-doctor-cli-command"; import { @@ -45,8 +49,9 @@ import { ShieldsUpCommand, } from "./shields-cli-commands"; import ShareCommand from "./share-command"; -import SkillInstallCliCommand from "./skill-install-cli-command"; +import SkillInstallCliCommand, { SkillCliCommand } from "./skill-install-cli-command"; import { + SnapshotCommand, SnapshotCreateCommand, SnapshotListCommand, SnapshotRestoreCommand, @@ -82,13 +87,16 @@ export default { "sandbox:doctor": SandboxDoctorCliCommand, "sandbox:logs": SandboxLogsCommand, "sandbox:policy-add": PolicyAddCommand, + "sandbox:policy-add:raw": PolicyAddRawCommand, "sandbox:policy-list": SandboxPolicyListCommand, "sandbox:policy-remove": PolicyRemoveCommand, "sandbox:rebuild": RebuildCliCommand, "sandbox:shields:down": ShieldsDownCommand, "sandbox:shields:status": ShieldsStatusCommand, "sandbox:shields:up": ShieldsUpCommand, + "sandbox:skill": SkillCliCommand, "sandbox:skill:install": SkillInstallCliCommand, + "sandbox:snapshot": SnapshotCommand, "sandbox:snapshot:create": SnapshotCreateCommand, "sandbox:snapshot:list": SnapshotListCommand, "sandbox:snapshot:restore": SnapshotRestoreCommand, diff --git a/src/lib/policy-mutate-cli-commands.ts b/src/lib/policy-mutate-cli-commands.ts index 838987ec00..155e831e89 100644 --- a/src/lib/policy-mutate-cli-commands.ts +++ b/src/lib/policy-mutate-cli-commands.ts @@ -76,6 +76,17 @@ export class PolicyAddCommand extends Command { } } +export class PolicyAddRawCommand extends Command { + static id = "sandbox:policy-add:raw"; + static strict = false; + static hidden = true; + + public async run(): Promise { + const [sandboxName, ...actionArgs] = this.argv; + await getRuntimeBridge().sandboxPolicyAdd(sandboxName, actionArgs); + } +} + export class PolicyRemoveCommand extends Command { static id = "sandbox:policy-remove"; static strict = true; diff --git a/src/lib/skill-install-cli-command.ts b/src/lib/skill-install-cli-command.ts index 9b9f813d9f..5debfdf254 100644 --- a/src/lib/skill-install-cli-command.ts +++ b/src/lib/skill-install-cli-command.ts @@ -19,6 +19,19 @@ function getRuntimeBridge() { return runtimeBridgeFactory(); } +export class SkillCliCommand extends Command { + static id = "sandbox:skill"; + static strict = false; + static summary = "Show skill command usage"; + static description = "Show skill install usage or report unknown skill subcommands."; + static usage = [" skill install "]; + + public async run(): Promise { + const [sandboxName, ...actionArgs] = this.argv; + await getRuntimeBridge().sandboxSkillInstall(sandboxName, actionArgs); + } +} + export default class SkillInstallCliCommand extends Command { static id = "sandbox:skill:install"; static strict = true; diff --git a/src/lib/snapshot-cli-commands.ts b/src/lib/snapshot-cli-commands.ts index ea0d32f25c..635439fd1b 100644 --- a/src/lib/snapshot-cli-commands.ts +++ b/src/lib/snapshot-cli-commands.ts @@ -25,6 +25,19 @@ const sandboxNameArg = Args.string({ required: true, }); +export class SnapshotCommand extends Command { + static id = "sandbox:snapshot"; + static strict = false; + static summary = "Show snapshot usage"; + static description = "Show snapshot usage or report unknown snapshot subcommands."; + static usage = [" snapshot "]; + + public async run(): Promise { + const [sandboxName, ...actionArgs] = this.argv; + await getRuntimeBridge().sandboxSnapshot(sandboxName, actionArgs); + } +} + export class SnapshotListCommand extends Command { static id = "sandbox:snapshot:list"; static strict = true; diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index 9e0e252764..5b7b811d9b 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -170,38 +170,6 @@ async function runDispatchResult( console.error(` Unknown action: ${result.action}`); console.error(` Valid actions: ${VALID_SANDBOX_ACTIONS}`); process.exit(1); - case "legacy": { - const sandboxName = opts.sandboxName; - const actionArgs = opts.actionArgs ?? []; - if (!sandboxName) { - throw new Error(`Missing sandbox name for legacy dispatch target ${result.target}`); - } - switch (result.target) { - case "policy-add": { - const { addSandboxPolicy } = require("./lib/policy-channel-actions") as { - addSandboxPolicy: (sandboxName: string, args?: string[]) => Promise; - }; - await addSandboxPolicy(sandboxName, actionArgs); - return; - } - case "skill": { - const { installSandboxSkill } = require("./lib/sandbox-skill-install-action") as { - installSandboxSkill: (sandboxName: string, args?: string[]) => Promise; - }; - await installSandboxSkill(sandboxName, actionArgs); - return; - } - case "snapshot": { - const { runSandboxSnapshot } = require("./lib/snapshot-action") as { - runSandboxSnapshot: (sandboxName: string, args: string[]) => Promise; - }; - await runSandboxSnapshot(sandboxName, actionArgs); - return; - } - default: - throw new Error(`Unhandled legacy dispatch target ${result.target}`); - } - } } } From edd2650e57fccd07099de4bb3b9f3a5703848998 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 21:02:02 -0700 Subject: [PATCH 11/65] refactor(cli): expose explicit main entrypoint --- src/nemoclaw.ts | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index 5b7b811d9b..60353ba49e 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -175,10 +175,10 @@ async function runDispatchResult( // ── Dispatch ───────────────────────────────────────────────────── -const [cmd, ...args] = process.argv.slice(2); - // eslint-disable-next-line complexity -const mainPromise = (async () => { +async function main(argv: string[] = process.argv.slice(2)): Promise { + const [cmd, ...args] = argv; + // No command → help if (!cmd || cmd === "help" || cmd === "--help" || cmd === "-h") { await runOclif("root:help", []); @@ -285,6 +285,9 @@ const mainPromise = (async () => { console.error(` Run '${CLI_NAME} help' for usage.`); process.exit(1); -})(); +} -exports.mainPromise = mainPromise; +exports.main = main; +// Compatibility for tests that require the CLI module and await completion. +// Prefer calling main(argv) directly in new in-process harnesses. +exports.mainPromise = main(); From a15da95d53d56331213d77c0c5de42998e69f0bb Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 21:30:13 -0700 Subject: [PATCH 12/65] refactor(cli): add oclif examples for utility commands --- src/lib/credentials-cli-command.ts | 9 +++++++++ src/lib/gateway-token-cli-command.ts | 4 ++++ src/lib/list-command.ts | 1 + src/lib/maintenance-cli-commands.ts | 6 ++++++ src/lib/status-command.ts | 1 + src/lib/tunnel-commands.ts | 4 ++++ src/lib/uninstall-cli-command.ts | 6 +++++- 7 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/lib/credentials-cli-command.ts b/src/lib/credentials-cli-command.ts index 887ec5f5de..a9349d463f 100644 --- a/src/lib/credentials-cli-command.ts +++ b/src/lib/credentials-cli-command.ts @@ -56,6 +56,10 @@ export class CredentialsCommand extends Command { static description = "List or reset provider credentials registered with the OpenShell gateway."; static usage = ["credentials "]; + static examples = [ + "<%= config.bin %> credentials list", + "<%= config.bin %> credentials reset nvidia-prod --yes", + ]; static flags = { help: Flags.help({ char: "h" }), }; @@ -72,6 +76,7 @@ export class CredentialsListCommand extends Command { static summary = "List stored credential providers"; static description = "List provider credentials registered with the OpenShell gateway."; static usage = ["credentials list"]; + static examples = ["<%= config.bin %> credentials list"]; static flags = { help: Flags.help({ char: "h" }), }; @@ -120,6 +125,10 @@ export class CredentialsResetCommand extends Command { static summary = "Remove a provider credential"; static description = "Remove a provider credential so onboard re-prompts for it."; static usage = ["credentials reset [--yes]"]; + static examples = [ + "<%= config.bin %> credentials reset nvidia-prod", + "<%= config.bin %> credentials reset nvidia-prod --yes", + ]; static args = { provider: Args.string({ name: "PROVIDER", diff --git a/src/lib/gateway-token-cli-command.ts b/src/lib/gateway-token-cli-command.ts index e20115edeb..36d3ae960c 100644 --- a/src/lib/gateway-token-cli-command.ts +++ b/src/lib/gateway-token-cli-command.ts @@ -17,6 +17,10 @@ export default class GatewayTokenCliCommand extends Command { static summary = "Print the OpenClaw gateway auth token to stdout"; static description = "Print the OpenClaw gateway auth token for a running sandbox to stdout."; static usage = [" gateway-token [--quiet|-q]"]; + static examples = [ + "<%= config.bin %> alpha gateway-token", + "<%= config.bin %> alpha gateway-token --quiet", + ]; static args = { sandboxName: Args.string({ name: "sandbox", diff --git a/src/lib/list-command.ts b/src/lib/list-command.ts index 0e0bc85669..c7be5f7fbe 100644 --- a/src/lib/list-command.ts +++ b/src/lib/list-command.ts @@ -16,6 +16,7 @@ export default class ListCommand extends Command { static description = "List all registered sandboxes with their model, provider, and policy presets."; static usage = ["list [--json]"]; + static examples = ["<%= config.bin %> list", "<%= config.bin %> list --json"]; static flags = { help: Flags.help({ char: "h" }), }; diff --git a/src/lib/maintenance-cli-commands.ts b/src/lib/maintenance-cli-commands.ts index eb03289e09..c40e95e4ee 100644 --- a/src/lib/maintenance-cli-commands.ts +++ b/src/lib/maintenance-cli-commands.ts @@ -17,6 +17,7 @@ export class BackupAllCommand extends Command { static summary = "Back up all sandbox state before upgrade"; static description = "Back up registered, running sandbox state before upgrading."; static usage = ["backup-all"]; + static examples = ["<%= config.bin %> backup-all"]; static flags = { help: Flags.help({ char: "h" }), }; @@ -33,6 +34,10 @@ export class UpgradeSandboxesCommand extends Command { static summary = "Detect and rebuild stale sandboxes"; static description = "Detect stale sandboxes and optionally rebuild them."; static usage = ["upgrade-sandboxes [--check] [--auto] [--yes]"]; + static examples = [ + "<%= config.bin %> upgrade-sandboxes --check", + "<%= config.bin %> upgrade-sandboxes --auto --yes", + ]; static flags = { help: Flags.help({ char: "h" }), check: Flags.boolean({ description: "Only check whether sandboxes need upgrading" }), @@ -56,6 +61,7 @@ export class GarbageCollectImagesCommand extends Command { static summary = "Remove orphaned sandbox Docker images"; static description = "Remove sandbox Docker images that are not referenced by registered sandboxes."; static usage = ["gc [--dry-run] [--yes|--force]"]; + static examples = ["<%= config.bin %> gc --dry-run", "<%= config.bin %> gc --yes"]; static flags = { help: Flags.help({ char: "h" }), "dry-run": Flags.boolean({ description: "Show images that would be removed without deleting" }), diff --git a/src/lib/status-command.ts b/src/lib/status-command.ts index 49a612e693..956c4647a8 100644 --- a/src/lib/status-command.ts +++ b/src/lib/status-command.ts @@ -15,6 +15,7 @@ export default class StatusCommand extends Command { static summary = "Show sandbox list and service status"; static description = "Show registered sandboxes, live inference, services, and messaging health."; static usage = ["status [--json]"]; + static examples = ["<%= config.bin %> status", "<%= config.bin %> status --json"]; static flags = { help: Flags.help({ char: "h" }), }; diff --git a/src/lib/tunnel-commands.ts b/src/lib/tunnel-commands.ts index 84062ef266..69aca9bff9 100644 --- a/src/lib/tunnel-commands.ts +++ b/src/lib/tunnel-commands.ts @@ -22,6 +22,7 @@ export class TunnelStartCommand extends Command { static summary = "Start the cloudflared public-URL tunnel"; static description = "Start the cloudflared public-URL tunnel for the default sandbox dashboard."; static usage = ["tunnel start"]; + static examples = ["<%= config.bin %> tunnel start"]; static flags = { help: Flags.help({ char: "h" }), }; @@ -38,6 +39,7 @@ export class TunnelStopCommand extends Command { static summary = "Stop the cloudflared public-URL tunnel"; static description = "Stop the cloudflared public-URL tunnel for the default sandbox dashboard."; static usage = ["tunnel stop"]; + static examples = ["<%= config.bin %> tunnel stop"]; static flags = { help: Flags.help({ char: "h" }), }; @@ -54,6 +56,7 @@ export class DeprecatedStartCommand extends Command { static summary = "Deprecated alias for 'tunnel start'"; static description = "Deprecated alias for tunnel start."; static usage = ["start"]; + static examples = ["<%= config.bin %> start"]; static flags = { help: Flags.help({ char: "h" }), }; @@ -73,6 +76,7 @@ export class DeprecatedStopCommand extends Command { static summary = "Deprecated alias for 'tunnel stop'"; static description = "Deprecated alias for tunnel stop."; static usage = ["stop"]; + static examples = ["<%= config.bin %> stop"]; static flags = { help: Flags.help({ char: "h" }), }; diff --git a/src/lib/uninstall-cli-command.ts b/src/lib/uninstall-cli-command.ts index 1283a7b52c..a52abc37da 100644 --- a/src/lib/uninstall-cli-command.ts +++ b/src/lib/uninstall-cli-command.ts @@ -5,7 +5,7 @@ import { spawnSync } from "node:child_process"; -import { Command } from "@oclif/core"; +import { Command, Flags } from "@oclif/core"; import { getVersion } from "./version"; import { buildVersionedUninstallUrl, runUninstallCommand } from "./uninstall-command"; @@ -16,6 +16,10 @@ export default class UninstallCliCommand extends Command { static summary = "Run uninstall.sh"; static description = "Run the local uninstall.sh script; remote fallback is disabled."; static usage = ["uninstall [flags]"]; + static examples = ["<%= config.bin %> uninstall --yes"]; + static flags = { + help: Flags.help({ char: "h" }), + }; public async run(): Promise { this.parsed = true; From 4f57ebb05a1b0e9554887ef19911651a2062bc16 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 21:57:59 -0700 Subject: [PATCH 13/65] refactor(cli): validate logs flags with oclif --- src/lib/command-registry.ts | 2 +- src/lib/legacy-oclif-dispatch.test.ts | 7 ++ src/lib/legacy-oclif-dispatch.ts | 7 +- src/lib/sandbox-logs-action.ts | 63 +++++++++++----- src/lib/sandbox-logs-cli-command.test.ts | 45 ++++++++++- src/lib/sandbox-logs-cli-command.ts | 47 +++++++++++- src/lib/sandbox-logs-options.ts | 10 +++ src/lib/sandbox-runtime-actions.ts | 7 +- test/cli.test.ts | 95 +++++++++++++++++++++++- 9 files changed, 255 insertions(+), 28 deletions(-) create mode 100644 src/lib/sandbox-logs-options.ts diff --git a/src/lib/command-registry.ts b/src/lib/command-registry.ts index d60e27b843..166674c0b7 100644 --- a/src/lib/command-registry.ts +++ b/src/lib/command-registry.ts @@ -120,7 +120,7 @@ export const COMMANDS: readonly CommandDef[] = [ { usage: "nemoclaw logs", description: "Stream sandbox logs", - flags: "[--follow]", + flags: "[--follow] [--tail |-n ] [--since ]", group: "Sandbox Management", scope: "sandbox", }, diff --git a/src/lib/legacy-oclif-dispatch.test.ts b/src/lib/legacy-oclif-dispatch.test.ts index 84a9040657..d97705331a 100644 --- a/src/lib/legacy-oclif-dispatch.test.ts +++ b/src/lib/legacy-oclif-dispatch.test.ts @@ -29,6 +29,13 @@ describe("resolveSandboxOclifDispatch", () => { }); }); + it("keeps logs help public with filter flags", () => { + expect(resolveSandboxOclifDispatch("alpha", "logs", ["--help"])).toEqual({ + kind: "help", + usage: "logs [--follow] [--tail |-n ] [--since ]", + }); + }); + it("routes policy-add missing-value errors through a raw oclif adapter", () => { expect(resolveSandboxOclifDispatch("alpha", "policy-add", ["--from-file"])).toEqual({ kind: "oclif", diff --git a/src/lib/legacy-oclif-dispatch.ts b/src/lib/legacy-oclif-dispatch.ts index d25d5decc2..2235cd4c1e 100644 --- a/src/lib/legacy-oclif-dispatch.ts +++ b/src/lib/legacy-oclif-dispatch.ts @@ -103,7 +103,12 @@ export function resolveSandboxOclifDispatch( if (hasHelpFlag(actionArgs)) return { kind: "help", usage: "status" }; return { kind: "oclif", commandId: "sandbox:status", args: [sandboxName, ...actionArgs] }; case "logs": - if (hasHelpFlag(actionArgs)) return { kind: "help", usage: "logs [--follow]" }; + if (hasHelpFlag(actionArgs)) { + return { + kind: "help", + usage: "logs [--follow] [--tail |-n ] [--since ]", + }; + } return { kind: "oclif", commandId: "sandbox:logs", args: [sandboxName, ...actionArgs] }; case "doctor": return { kind: "oclif", commandId: "sandbox:doctor", args: [sandboxName, ...actionArgs] }; diff --git a/src/lib/sandbox-logs-action.ts b/src/lib/sandbox-logs-action.ts index f5cf328c7c..08f6094bc7 100644 --- a/src/lib/sandbox-logs-action.ts +++ b/src/lib/sandbox-logs-action.ts @@ -8,6 +8,8 @@ import os from "node:os"; import { ROOT } from "./runner"; import { getOpenshellBinary, runOpenshell } from "./openshell-runtime"; +import type { SandboxLogsOptions } from "./sandbox-logs-options"; +import { DEFAULT_SANDBOX_LOG_LINES } from "./sandbox-logs-options"; const DEFAULT_LOGS_PROBE_TIMEOUT_MS = 5000; const LOGS_PROBE_TIMEOUT_ENV = "NEMOCLAW_LOGS_PROBE_TIMEOUT_MS"; @@ -53,8 +55,22 @@ function describeLogProbeResult(result: SpawnLikeResult): string { return `exit ${result.status ?? "unknown"}`; } -function runOpenclawGatewayLogs(sandboxName: string, follow: boolean): SpawnLikeResult { - const args = buildSandboxOpenclawGatewayLogsArgs(sandboxName, follow); +function normalizeSandboxLogsOptions(options: SandboxLogsOptions | boolean): SandboxLogsOptions { + if (typeof options === "boolean") { + return { follow: options, lines: DEFAULT_SANDBOX_LOG_LINES, since: null }; + } + return { + follow: options.follow, + lines: options.lines || DEFAULT_SANDBOX_LOG_LINES, + since: options.since || null, + }; +} + +function runOpenclawGatewayLogs( + sandboxName: string, + options: SandboxLogsOptions, +): SpawnLikeResult { + const args = buildSandboxOpenclawGatewayLogsArgs(sandboxName, options); const result = runOpenshell(args, { stdio: "inherit", ignoreError: true, @@ -69,9 +85,11 @@ function runOpenclawGatewayLogs(sandboxName: string, follow: boolean): SpawnLike return result; } -function streamSandboxFollowLogs(sandboxName: string): void { - const openclawArgs = buildSandboxOpenclawGatewayLogsArgs(sandboxName, true); - const openshellArgs = buildSandboxLogsArgs(sandboxName, true); +function streamSandboxFollowLogs(sandboxName: string, options: SandboxLogsOptions): void { + const openclawArgs = options.since + ? null + : buildSandboxOpenclawGatewayLogsArgs(sandboxName, options); + const openshellArgs = buildSandboxLogsArgs(sandboxName, options); const spawnOptions = { cwd: ROOT, env: process.env, @@ -162,7 +180,9 @@ function streamSandboxFollowLogs(sandboxName: string): void { }); }; - addSource("OpenClaw log source", openclawArgs); + if (openclawArgs) { + addSource("OpenClaw log source", openclawArgs); + } enableSandboxAuditLogs(sandboxName); addSource("OpenShell log source", openshellArgs); setupComplete = true; @@ -201,32 +221,41 @@ function buildEnableSandboxAuditLogsArgs(sandboxName: string): string[] { return ["settings", "set", sandboxName, "--key", "ocsf_json_enabled", "--value", "true"]; } -function buildSandboxOpenclawGatewayLogsArgs(sandboxName: string, follow: boolean): string[] { - const args = ["sandbox", "exec", "-n", sandboxName, "--", "tail", "-n", "200"]; - if (follow) { +function buildSandboxOpenclawGatewayLogsArgs( + sandboxName: string, + options: SandboxLogsOptions, +): string[] { + const args = ["sandbox", "exec", "-n", sandboxName, "--", "tail", "-n", options.lines]; + if (options.follow) { args.push("-f"); } args.push("/tmp/gateway.log"); return args; } -function buildSandboxLogsArgs(sandboxName: string, follow: boolean): string[] { - const args = ["logs", sandboxName, "-n", "200", "--source", "all"]; - if (follow) { +function buildSandboxLogsArgs(sandboxName: string, options: SandboxLogsOptions): string[] { + const args = ["logs", sandboxName, "-n", options.lines, "--source", "all"]; + if (options.since) { + args.push("--since", options.since); + } + if (options.follow) { args.push("--tail"); } return args; } -export function showSandboxLogs(sandboxName: string, follow: boolean) { - if (follow) { - streamSandboxFollowLogs(sandboxName); +export function showSandboxLogs(sandboxName: string, options: SandboxLogsOptions | boolean) { + const logsOptions = normalizeSandboxLogsOptions(options); + if (logsOptions.follow) { + streamSandboxFollowLogs(sandboxName, logsOptions); return; } enableSandboxAuditLogs(sandboxName); - runOpenclawGatewayLogs(sandboxName, false); - const args = buildSandboxLogsArgs(sandboxName, false); + if (!logsOptions.since) { + runOpenclawGatewayLogs(sandboxName, logsOptions); + } + const args = buildSandboxLogsArgs(sandboxName, logsOptions); const result = runOpenshell(args, { stdio: "inherit", ignoreError: true, diff --git a/src/lib/sandbox-logs-cli-command.test.ts b/src/lib/sandbox-logs-cli-command.test.ts index d67a9e8df4..a87afcbea3 100644 --- a/src/lib/sandbox-logs-cli-command.test.ts +++ b/src/lib/sandbox-logs-cli-command.test.ts @@ -10,12 +10,55 @@ import SandboxLogsCommand, { const rootDir = process.cwd(); describe("SandboxLogsCommand", () => { + it("runs sandbox logs with default options", async () => { + const sandboxLogs = vi.fn(); + setSandboxLogsRuntimeBridgeFactoryForTest(() => ({ sandboxLogs })); + + await SandboxLogsCommand.run(["alpha"], rootDir); + + expect(sandboxLogs).toHaveBeenCalledWith("alpha", { + follow: false, + lines: "200", + since: null, + }); + }); + it("runs sandbox logs with the follow flag", async () => { const sandboxLogs = vi.fn(); setSandboxLogsRuntimeBridgeFactoryForTest(() => ({ sandboxLogs })); await SandboxLogsCommand.run(["alpha", "--follow"], rootDir); - expect(sandboxLogs).toHaveBeenCalledWith("alpha", true); + expect(sandboxLogs).toHaveBeenCalledWith("alpha", { + follow: true, + lines: "200", + since: null, + }); + }); + + it("runs sandbox logs with tail and since filters", async () => { + const sandboxLogs = vi.fn(); + setSandboxLogsRuntimeBridgeFactoryForTest(() => ({ sandboxLogs })); + + await SandboxLogsCommand.run(["alpha", "--tail", "50", "--since", "5m"], rootDir); + + expect(sandboxLogs).toHaveBeenCalledWith("alpha", { + follow: false, + lines: "50", + since: "5m", + }); + }); + + it("maps -n to the tail line count", async () => { + const sandboxLogs = vi.fn(); + setSandboxLogsRuntimeBridgeFactoryForTest(() => ({ sandboxLogs })); + + await SandboxLogsCommand.run(["alpha", "-n", "25"], rootDir); + + expect(sandboxLogs).toHaveBeenCalledWith("alpha", { + follow: false, + lines: "25", + since: null, + }); }); }); diff --git a/src/lib/sandbox-logs-cli-command.ts b/src/lib/sandbox-logs-cli-command.ts index 7a77b48f69..47e399d3b9 100644 --- a/src/lib/sandbox-logs-cli-command.ts +++ b/src/lib/sandbox-logs-cli-command.ts @@ -5,12 +5,21 @@ import { Args, Command, Flags } from "@oclif/core"; +import type { SandboxLogsOptions } from "./sandbox-logs-options"; +import { DEFAULT_SANDBOX_LOG_LINES } from "./sandbox-logs-options"; import { showSandboxLogs } from "./sandbox-runtime-actions"; -let runtimeBridgeFactory = () => ({ sandboxLogs: showSandboxLogs }); +type SandboxLogsRuntimeBridge = { + sandboxLogs: (sandboxName: string, options: SandboxLogsOptions) => void; +}; + +const LOGS_SINCE_DURATION_RE = /^[1-9]\d*(?:ms|s|m|h|d)$/i; +const DEFAULT_SANDBOX_LOG_LINE_COUNT = Number(DEFAULT_SANDBOX_LOG_LINES); + +let runtimeBridgeFactory = (): SandboxLogsRuntimeBridge => ({ sandboxLogs: showSandboxLogs }); export function setSandboxLogsRuntimeBridgeFactoryForTest( - factory: () => { sandboxLogs: (sandboxName: string, follow: boolean) => void }, + factory: () => SandboxLogsRuntimeBridge, ): void { runtimeBridgeFactory = factory; } @@ -24,7 +33,13 @@ export default class SandboxLogsCommand extends Command { static strict = true; static summary = "Stream sandbox logs"; static description = "Show OpenClaw gateway logs and OpenShell audit logs for a sandbox."; - static usage = [" logs [--follow]"]; + static usage = [" logs [--follow] [--tail |-n ] [--since ]"]; + static examples = [ + "<%= config.bin %> alpha logs", + "<%= config.bin %> alpha logs --tail 100", + "<%= config.bin %> alpha logs --since 5m", + "<%= config.bin %> alpha logs --follow", + ]; static args = { sandboxName: Args.string({ name: "sandbox", @@ -35,10 +50,34 @@ export default class SandboxLogsCommand extends Command { static flags = { help: Flags.help({ char: "h" }), follow: Flags.boolean({ description: "Follow logs until interrupted" }), + tail: Flags.integer({ + char: "n", + default: DEFAULT_SANDBOX_LOG_LINE_COUNT, + description: "Number of log lines to return", + min: 1, + }), + since: Flags.string({ + description: "Only show logs from this duration ago, such as 5m, 1h, or 30s", + }), }; + private normalizeSinceDuration(since: string | undefined): string | null { + if (since === undefined) { + return null; + } + const trimmed = since.trim(); + if (!LOGS_SINCE_DURATION_RE.test(trimmed)) { + this.error("--since requires a positive duration like 5m, 1h, or 30s", { exit: 2 }); + } + return trimmed; + } + public async run(): Promise { const { args, flags } = await this.parse(SandboxLogsCommand); - getRuntimeBridge().sandboxLogs(args.sandboxName, flags.follow === true); + getRuntimeBridge().sandboxLogs(args.sandboxName, { + follow: flags.follow === true, + lines: String(flags.tail), + since: this.normalizeSinceDuration(flags.since), + }); } } diff --git a/src/lib/sandbox-logs-options.ts b/src/lib/sandbox-logs-options.ts new file mode 100644 index 0000000000..e99369d671 --- /dev/null +++ b/src/lib/sandbox-logs-options.ts @@ -0,0 +1,10 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +export const DEFAULT_SANDBOX_LOG_LINES = "200"; + +export type SandboxLogsOptions = { + follow: boolean; + lines: string; + since: string | null; +}; diff --git a/src/lib/sandbox-runtime-actions.ts b/src/lib/sandbox-runtime-actions.ts index 7097f33ac4..81171c171e 100644 --- a/src/lib/sandbox-runtime-actions.ts +++ b/src/lib/sandbox-runtime-actions.ts @@ -4,6 +4,7 @@ /* v8 ignore start -- transitional action facade until implementations leave src/nemoclaw.ts. */ import type { SandboxConnectOptions } from "./sandbox-connect-action"; +import type { SandboxLogsOptions } from "./sandbox-logs-options"; export async function connectSandbox( sandboxName: string, @@ -22,11 +23,11 @@ export async function showSandboxStatus(sandboxName: string): Promise { await showExtractedSandboxStatus(sandboxName); } -export function showSandboxLogs(sandboxName: string, follow: boolean): void { +export function showSandboxLogs(sandboxName: string, options: SandboxLogsOptions): void { const { showSandboxLogs: showSandboxLogsAction } = require("./sandbox-logs-action") as { - showSandboxLogs: (sandboxName: string, follow: boolean) => void; + showSandboxLogs: (sandboxName: string, options: SandboxLogsOptions) => void; }; - showSandboxLogsAction(sandboxName, follow); + showSandboxLogsAction(sandboxName, options); } export async function destroySandbox(sandboxName: string, args: string[] = []): Promise { diff --git a/test/cli.test.ts b/test/cli.test.ts index 2ce8ef0bf4..e50bea9300 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -1204,7 +1204,10 @@ describe("CLI dispatch", () => { const logs = runWithEnv("alpha logs --help", { HOME: home }); expect(logs.code).toBe(0); - expect(logs.out).toContain(" logs [--follow]"); + expect(logs.out).toContain(" logs"); + expect(logs.out).toContain("--follow"); + expect(logs.out).toContain("--tail"); + expect(logs.out).toContain("--since"); expect(logs.out).not.toContain("sandbox:logs"); const destroy = runWithEnv("alpha destroy --help", { HOME: home }); @@ -1345,6 +1348,96 @@ describe("CLI dispatch", () => { expect(r.out).toContain(FAKE_OPENSHELL_LOG_LINE); }); + it("shows logs help without calling OpenShell", () => { + const setup = createLogsTestSetup("nemoclaw-cli-logs-help-"); + const r = setup.runLogs("alpha logs --help"); + + expect(r.code).toBe(0); + expect(r.out).toContain(" logs"); + expect(r.out).toContain("--follow"); + expect(r.out).toContain("--tail"); + expect(r.out).toContain("--since"); + expect(setup.readCalls()).toEqual([]); + }); + + it("passes --tail line count to both log sources", () => { + const setup = createLogsTestSetup("nemoclaw-cli-logs-tail-"); + const r = setup.runLogs("alpha logs --tail 50"); + + const calls = setup.readCalls(); + expect(r.code).toBe(0); + expect(calls).toEqual([ + "settings set alpha --key ocsf_json_enabled --value true", + "sandbox exec -n alpha -- tail -n 50 /tmp/gateway.log", + "logs alpha -n 50 --source all", + ]); + }); + + it("passes -n line count to both log sources", () => { + const setup = createLogsTestSetup("nemoclaw-cli-logs-n-"); + const r = setup.runLogs("alpha logs -n 25"); + + const calls = setup.readCalls(); + expect(r.code).toBe(0); + expect(calls).toEqual([ + "settings set alpha --key ocsf_json_enabled --value true", + "sandbox exec -n alpha -- tail -n 25 /tmp/gateway.log", + "logs alpha -n 25 --source all", + ]); + }); + + it("passes --since to OpenShell logs without an unfiltered gateway tail", () => { + const setup = createLogsTestSetup("nemoclaw-cli-logs-since-"); + const r = setup.runLogs("alpha logs --since 5m"); + + const calls = setup.readCalls(); + expect(r.code).toBe(0); + expect(calls).toEqual([ + "settings set alpha --key ocsf_json_enabled --value true", + "logs alpha -n 200 --source all --since 5m", + ]); + expect(calls.some((call) => call.startsWith("sandbox exec -n alpha"))).toBe(false); + }); + + it("passes --follow --since to OpenShell logs without an unfiltered gateway tail", () => { + const setup = createLogsTestSetup("nemoclaw-cli-logs-since-follow-"); + const r = setup.runLogs("alpha logs --follow --since 5m"); + + const calls = setup.readCalls(); + expect(r.code).toBe(0); + expect(calls).toContain("settings set alpha --key ocsf_json_enabled --value true"); + expect(calls).toContain("logs alpha -n 200 --source all --since 5m --tail"); + expect(calls.some((call) => call.startsWith("sandbox exec -n alpha"))).toBe(false); + }); + + it("rejects malformed logs flags before calling OpenShell", () => { + const setup = createLogsTestSetup("nemoclaw-cli-logs-malformed-"); + const missingTail = setup.runLogs("alpha logs --tail 2>&1"); + const zeroTail = setup.runLogs("alpha logs --tail 0 2>&1"); + const nonNumericTail = setup.runLogs("alpha logs -n foo 2>&1"); + const missingSince = setup.runLogs("alpha logs --since 2>&1"); + const malformedSince = setup.runLogs("alpha logs --since someday 2>&1"); + + for (const result of [missingTail, zeroTail, nonNumericTail, missingSince, malformedSince]) { + expect(result.code).not.toBe(0); + } + expect(missingTail.out).toContain("--tail"); + expect(zeroTail.out).toContain("--tail"); + expect(nonNumericTail.out).toContain("Expected an integer"); + expect(missingSince.out).toContain("--since"); + expect(malformedSince.out).toContain("--since requires a positive duration"); + expect(setup.readCalls()).toEqual([]); + }); + + it("rejects unknown logs flags before calling OpenShell", () => { + const setup = createLogsTestSetup("nemoclaw-cli-logs-unknown-"); + const r = setup.runLogs("alpha logs --bogus 2>&1"); + + expect(r.code).not.toBe(0); + expect(r.out).toContain("Nonexistent flag: --bogus"); + expect(setup.readCalls()).toEqual([]); + }); + it("enables OpenShell audit events before reading logs", () => { const setup = createLogsTestSetup("nemoclaw-cli-logs-audit-"); const r = setup.runLogs(); From 8b2d0775c13e91863861e4eb20189c1d9047bced Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 22:28:38 -0700 Subject: [PATCH 14/65] refactor(cli): improve sandbox diagnostic command metadata --- src/lib/command-registry.ts | 1 + src/lib/connect-cli-command.ts | 4 ++++ src/lib/legacy-oclif-dispatch.test.ts | 7 +++++++ src/lib/legacy-oclif-dispatch.ts | 1 + src/lib/sandbox-doctor-cli-command.ts | 20 ++++++++++++++++---- src/lib/sandbox-inspection-cli-command.ts | 16 +++++++++++----- test/cli.test.ts | 21 +++++++++++++++++++++ 7 files changed, 61 insertions(+), 9 deletions(-) diff --git a/src/lib/command-registry.ts b/src/lib/command-registry.ts index 166674c0b7..d8fbfafc55 100644 --- a/src/lib/command-registry.ts +++ b/src/lib/command-registry.ts @@ -277,6 +277,7 @@ export const COMMANDS: readonly CommandDef[] = [ { usage: "nemoclaw config get", description: "Get sandbox configuration", + flags: "[--key ] [--format json|yaml]", group: "Sandbox Management", scope: "sandbox", hidden: true, diff --git a/src/lib/connect-cli-command.ts b/src/lib/connect-cli-command.ts index cd490ceed1..a932374a75 100644 --- a/src/lib/connect-cli-command.ts +++ b/src/lib/connect-cli-command.ts @@ -14,6 +14,10 @@ export default class ConnectCliCommand extends Command { static summary = "Shell into a running sandbox"; static description = "Connect to a running sandbox."; static usage = [" connect [--probe-only]"]; + static examples = [ + "<%= config.bin %> alpha connect", + "<%= config.bin %> alpha connect --probe-only", + ]; static args = { sandboxName: Args.string({ name: "sandbox", description: "Sandbox name", required: true }), }; diff --git a/src/lib/legacy-oclif-dispatch.test.ts b/src/lib/legacy-oclif-dispatch.test.ts index d97705331a..26a5030de4 100644 --- a/src/lib/legacy-oclif-dispatch.test.ts +++ b/src/lib/legacy-oclif-dispatch.test.ts @@ -29,6 +29,13 @@ describe("resolveSandboxOclifDispatch", () => { }); }); + it("keeps sandbox doctor help public", () => { + expect(resolveSandboxOclifDispatch("alpha", "doctor", ["--help"])).toEqual({ + kind: "help", + usage: "doctor [--json]", + }); + }); + it("keeps logs help public with filter flags", () => { expect(resolveSandboxOclifDispatch("alpha", "logs", ["--help"])).toEqual({ kind: "help", diff --git a/src/lib/legacy-oclif-dispatch.ts b/src/lib/legacy-oclif-dispatch.ts index 2235cd4c1e..80e1971e5f 100644 --- a/src/lib/legacy-oclif-dispatch.ts +++ b/src/lib/legacy-oclif-dispatch.ts @@ -111,6 +111,7 @@ export function resolveSandboxOclifDispatch( } return { kind: "oclif", commandId: "sandbox:logs", args: [sandboxName, ...actionArgs] }; case "doctor": + if (hasHelpFlag(actionArgs)) return { kind: "help", usage: "doctor [--json]" }; return { kind: "oclif", commandId: "sandbox:doctor", args: [sandboxName, ...actionArgs] }; case "policy-add": if (hasHelpFlag(actionArgs)) { diff --git a/src/lib/sandbox-doctor-cli-command.ts b/src/lib/sandbox-doctor-cli-command.ts index 9b03c3defb..b837cee6df 100644 --- a/src/lib/sandbox-doctor-cli-command.ts +++ b/src/lib/sandbox-doctor-cli-command.ts @@ -3,19 +3,31 @@ /* v8 ignore start -- thin oclif adapter covered through CLI integration tests. */ -import { Command } from "@oclif/core"; +import { Args, Command, Flags } from "@oclif/core"; import { runSandboxDoctor } from "./sandbox-doctor-action"; export default class SandboxDoctorCliCommand extends Command { static id = "sandbox:doctor"; - static strict = false; + static strict = true; static summary = "Diagnose sandbox and gateway health"; static description = "Run host, gateway, sandbox, inference, messaging, and local service diagnostics."; static usage = [" doctor [--json]"]; + static examples = ["<%= config.bin %> alpha doctor", "<%= config.bin %> alpha doctor --json"]; + static args = { + sandboxName: Args.string({ + name: "sandbox", + description: "Sandbox name", + required: true, + }), + }; + static flags = { + help: Flags.help({ char: "h" }), + json: Flags.boolean({ description: "Emit machine-readable JSON diagnostics" }), + }; public async run(): Promise { - const [sandboxName, ...actionArgs] = this.argv; - await runSandboxDoctor(sandboxName, actionArgs); + const { args, flags } = await this.parse(SandboxDoctorCliCommand); + await runSandboxDoctor(args.sandboxName, flags.json ? ["--json"] : []); } } diff --git a/src/lib/sandbox-inspection-cli-command.ts b/src/lib/sandbox-inspection-cli-command.ts index db4df2675a..a342894580 100644 --- a/src/lib/sandbox-inspection-cli-command.ts +++ b/src/lib/sandbox-inspection-cli-command.ts @@ -22,6 +22,7 @@ export class SandboxStatusCommand extends Command { static summary = "Sandbox health and NIM status"; static description = "Show sandbox health, OpenShell gateway state, and local NIM status."; static usage = [" status"]; + static examples = ["<%= config.bin %> alpha status"]; static args = { sandboxName: sandboxNameArg, }; @@ -41,6 +42,7 @@ export class SandboxPolicyListCommand extends Command { static summary = "List policy presets"; static description = "List built-in and custom policy presets and show which are applied."; static usage = [" policy-list"]; + static examples = ["<%= config.bin %> alpha policy-list"]; static args = { sandboxName: sandboxNameArg, }; @@ -60,6 +62,7 @@ export class SandboxChannelsListCommand extends Command { static summary = "List supported messaging channels"; static description = "List supported messaging channels for a sandbox."; static usage = [" channels list"]; + static examples = ["<%= config.bin %> alpha channels list"]; static args = { sandboxName: sandboxNameArg, }; @@ -79,21 +82,24 @@ export class SandboxConfigGetCommand extends Command { static summary = "Get sandbox configuration"; static description = "Read sanitized sandbox agent configuration."; static usage = [" config get [--key dotpath] [--format json|yaml]"]; + static examples = [ + "<%= config.bin %> alpha config get", + "<%= config.bin %> alpha config get --key model --format yaml", + ]; static args = { sandboxName: sandboxNameArg, }; static flags = { help: Flags.help({ char: "h" }), key: Flags.string({ description: "Dotpath to read from the sanitized config" }), - format: Flags.string({ description: "Output format (json or yaml)" }), + format: Flags.string({ + description: "Output format", + options: ["json", "yaml"], + }), }; public async run(): Promise { const { args, flags } = await this.parse(SandboxConfigGetCommand); - if (flags.format && flags.format !== "json" && flags.format !== "yaml") { - console.error(` Unknown format: ${flags.format}. Use json or yaml.`); - process.exit(1); - } sandboxConfig.configGet(args.sandboxName, { key: flags.key ?? null, format: flags.format ?? "json", diff --git a/test/cli.test.ts b/test/cli.test.ts index e50bea9300..83c525f26a 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -1202,6 +1202,11 @@ describe("CLI dispatch", () => { expect(status.out).toContain(" status"); expect(status.out).not.toContain("sandbox:status"); + const doctor = runWithEnv("alpha doctor --help", { HOME: home }); + expect(doctor.code).toBe(0); + expect(doctor.out).toContain(" doctor [--json]"); + expect(doctor.out).not.toContain("sandbox:doctor"); + const logs = runWithEnv("alpha logs --help", { HOME: home }); expect(logs.code).toBe(0); expect(logs.out).toContain(" logs"); @@ -1242,6 +1247,7 @@ describe("CLI dispatch", () => { const config = runWithEnv("alpha config get --help", { HOME: home }); expect(config.code).toBe(0); expect(config.out).toContain(" config get"); + expect(config.out).toContain("--format json|yaml"); expect(config.out).not.toContain("sandbox:config:get"); }); @@ -1284,6 +1290,21 @@ describe("CLI dispatch", () => { expect(start.out).toContain("Channel 'telegram' is already enabled for 'alpha'. Nothing to do."); }); + it("diagnostic commands reject invalid parser-owned flags before dispatch", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-diagnostics-invalid-flags-")); + writeSandboxRegistry(home); + + const badConfigFormat = runWithEnv("alpha config get --format xml 2>&1", { HOME: home }); + expect(badConfigFormat.code).not.toBe(0); + expect(badConfigFormat.out).toContain("--format"); + expect(badConfigFormat.out).toContain("json"); + expect(badConfigFormat.out).toContain("yaml"); + + const badDoctorFlag = runWithEnv("alpha doctor --bogus 2>&1", { HOME: home }); + expect(badDoctorFlag.code).not.toBe(0); + expect(badDoctorFlag.out).toContain("Nonexistent flag: --bogus"); + }); + it("shields help keeps public sandbox-scoped usage", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-shields-help-")); writeSandboxRegistry(home); From a09cc51366e16060cc83a4a274ded30ab00098c9 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 22:43:03 -0700 Subject: [PATCH 15/65] refactor(cli): tighten policy and channel parser validation --- src/lib/channels-mutate-cli-commands.test.ts | 14 ++++++++++++++ src/lib/channels-mutate-cli-commands.ts | 6 +++++- src/lib/command-registry.ts | 4 ++++ src/lib/legacy-oclif-dispatch.test.ts | 4 ++-- src/lib/legacy-oclif-dispatch.ts | 8 -------- src/lib/oclif-commands.ts | 7 +------ src/lib/policy-mutate-cli-commands.test.ts | 14 ++++++++++++++ src/lib/policy-mutate-cli-commands.ts | 20 +++++++++----------- test/cli.test.ts | 13 +++++++++++++ test/policies.test.ts | 3 ++- 10 files changed, 64 insertions(+), 29 deletions(-) diff --git a/src/lib/channels-mutate-cli-commands.test.ts b/src/lib/channels-mutate-cli-commands.test.ts index 8edcc3d7b1..bb7ec65a7f 100644 --- a/src/lib/channels-mutate-cli-commands.test.ts +++ b/src/lib/channels-mutate-cli-commands.test.ts @@ -48,4 +48,18 @@ describe("channels mutation oclif commands", () => { ]); expect(runtime.sandboxChannelsStop).toHaveBeenCalledWith("alpha", ["slack"]); }); + + it("requires a channel before dispatch", async () => { + const runtime = { + sandboxChannelsAdd: vi.fn().mockResolvedValue(undefined), + sandboxChannelsRemove: vi.fn().mockResolvedValue(undefined), + sandboxChannelsStart: vi.fn().mockResolvedValue(undefined), + sandboxChannelsStop: vi.fn().mockResolvedValue(undefined), + }; + setChannelsRuntimeBridgeFactoryForTest(() => runtime); + + await expect(ChannelsAddCommand.run(["alpha"], rootDir)).rejects.toThrow(/channel/i); + + expect(runtime.sandboxChannelsAdd).not.toHaveBeenCalled(); + }); }); diff --git a/src/lib/channels-mutate-cli-commands.ts b/src/lib/channels-mutate-cli-commands.ts index cbada30c2b..bf2cbf9751 100644 --- a/src/lib/channels-mutate-cli-commands.ts +++ b/src/lib/channels-mutate-cli-commands.ts @@ -38,7 +38,7 @@ function getRuntimeBridge(): ChannelsRuntimeBridge { } const sandboxNameArg = Args.string({ name: "sandbox", description: "Sandbox name", required: true }); -const channelArg = Args.string({ name: "channel", description: "Messaging channel", required: false }); +const channelArg = Args.string({ name: "channel", description: "Messaging channel", required: true }); function buildArgs(channel: string | undefined, flags: { "dry-run"?: boolean }): string[] { const args: string[] = []; @@ -62,6 +62,7 @@ export class ChannelsAddCommand extends Command { static summary = "Save messaging channel credentials and rebuild"; static description = "Store credentials for a messaging channel and queue a sandbox rebuild."; static usage = [" channels add [--dry-run]"]; + static examples = ["<%= config.bin %> alpha channels add telegram"]; static args = channelMutationArgs; static flags = channelMutationFlags; @@ -77,6 +78,7 @@ export class ChannelsRemoveCommand extends Command { static summary = "Clear messaging channel credentials and rebuild"; static description = "Remove credentials for a messaging channel and queue a sandbox rebuild."; static usage = [" channels remove [--dry-run]"]; + static examples = ["<%= config.bin %> alpha channels remove slack --dry-run"]; static args = channelMutationArgs; static flags = channelMutationFlags; @@ -92,6 +94,7 @@ export class ChannelsStopCommand extends Command { static summary = "Disable channel without wiping credentials"; static description = "Disable a messaging channel while keeping credentials in the gateway."; static usage = [" channels stop [--dry-run]"]; + static examples = ["<%= config.bin %> alpha channels stop discord"]; static args = channelMutationArgs; static flags = channelMutationFlags; @@ -107,6 +110,7 @@ export class ChannelsStartCommand extends Command { static summary = "Re-enable a stopped messaging channel"; static description = "Re-enable a previously stopped messaging channel."; static usage = [" channels start [--dry-run]"]; + static examples = ["<%= config.bin %> alpha channels start discord"]; static args = channelMutationArgs; static flags = channelMutationFlags; diff --git a/src/lib/command-registry.ts b/src/lib/command-registry.ts index d8fbfafc55..704e233fcf 100644 --- a/src/lib/command-registry.ts +++ b/src/lib/command-registry.ts @@ -228,24 +228,28 @@ export const COMMANDS: readonly CommandDef[] = [ { usage: "nemoclaw channels add", description: "Save credentials and rebuild", + flags: " [--dry-run]", group: "Messaging Channels", scope: "sandbox", }, { usage: "nemoclaw channels remove", description: "Clear credentials and rebuild", + flags: " [--dry-run]", group: "Messaging Channels", scope: "sandbox", }, { usage: "nemoclaw channels stop", description: "Disable channel (keeps credentials)", + flags: " [--dry-run]", group: "Messaging Channels", scope: "sandbox", }, { usage: "nemoclaw channels start", description: "Re-enable a previously stopped channel", + flags: " [--dry-run]", group: "Messaging Channels", scope: "sandbox", }, diff --git a/src/lib/legacy-oclif-dispatch.test.ts b/src/lib/legacy-oclif-dispatch.test.ts index 26a5030de4..d41f5754c2 100644 --- a/src/lib/legacy-oclif-dispatch.test.ts +++ b/src/lib/legacy-oclif-dispatch.test.ts @@ -43,10 +43,10 @@ describe("resolveSandboxOclifDispatch", () => { }); }); - it("routes policy-add missing-value errors through a raw oclif adapter", () => { + it("routes policy-add missing-value errors through the strict oclif adapter", () => { expect(resolveSandboxOclifDispatch("alpha", "policy-add", ["--from-file"])).toEqual({ kind: "oclif", - commandId: "sandbox:policy-add:raw", + commandId: "sandbox:policy-add", args: ["alpha", "--from-file"], }); }); diff --git a/src/lib/legacy-oclif-dispatch.ts b/src/lib/legacy-oclif-dispatch.ts index 80e1971e5f..735b81e4cb 100644 --- a/src/lib/legacy-oclif-dispatch.ts +++ b/src/lib/legacy-oclif-dispatch.ts @@ -41,11 +41,6 @@ function hasHelpFlag(args: readonly string[]): boolean { return args.includes("--help") || args.includes("-h"); } -function hasMissingFlagValue(args: readonly string[], flagName: string): boolean { - const index = args.indexOf(flagName); - return index !== -1 && (!args[index + 1] || args[index + 1].startsWith("--")); -} - export function resolveGlobalOclifDispatch(cmd: string, args: string[]): DispatchResult { switch (cmd) { case "onboard": @@ -120,9 +115,6 @@ export function resolveSandboxOclifDispatch( usage: "policy-add [preset] [--yes|-y] [--dry-run] [--from-file ] [--from-dir ]", }; } - if (hasMissingFlagValue(actionArgs, "--from-file") || hasMissingFlagValue(actionArgs, "--from-dir")) { - return { kind: "oclif", commandId: "sandbox:policy-add:raw", args: [sandboxName, ...actionArgs] }; - } return { kind: "oclif", commandId: "sandbox:policy-add", args: [sandboxName, ...actionArgs] }; case "policy-remove": if (hasHelpFlag(actionArgs)) return { kind: "help", usage: "policy-remove [preset] [--yes|-y] [--dry-run]" }; diff --git a/src/lib/oclif-commands.ts b/src/lib/oclif-commands.ts index b56342b5bc..fcc27bfe04 100644 --- a/src/lib/oclif-commands.ts +++ b/src/lib/oclif-commands.ts @@ -29,11 +29,7 @@ import { GarbageCollectImagesCommand, UpgradeSandboxesCommand, } from "./maintenance-cli-commands"; -import { - PolicyAddCommand, - PolicyAddRawCommand, - PolicyRemoveCommand, -} from "./policy-mutate-cli-commands"; +import { PolicyAddCommand, PolicyRemoveCommand } from "./policy-mutate-cli-commands"; import RebuildCliCommand from "./rebuild-cli-command"; import SandboxDoctorCliCommand from "./sandbox-doctor-cli-command"; import { @@ -87,7 +83,6 @@ export default { "sandbox:doctor": SandboxDoctorCliCommand, "sandbox:logs": SandboxLogsCommand, "sandbox:policy-add": PolicyAddCommand, - "sandbox:policy-add:raw": PolicyAddRawCommand, "sandbox:policy-list": SandboxPolicyListCommand, "sandbox:policy-remove": PolicyRemoveCommand, "sandbox:rebuild": RebuildCliCommand, diff --git a/src/lib/policy-mutate-cli-commands.test.ts b/src/lib/policy-mutate-cli-commands.test.ts index 90c2b67a69..6ac03345b9 100644 --- a/src/lib/policy-mutate-cli-commands.test.ts +++ b/src/lib/policy-mutate-cli-commands.test.ts @@ -48,4 +48,18 @@ describe("policy mutation oclif commands", () => { "--dry-run", ]); }); + + it("rejects missing custom policy paths before dispatch", async () => { + const runtime = { + sandboxPolicyAdd: vi.fn().mockResolvedValue(undefined), + sandboxPolicyRemove: vi.fn().mockResolvedValue(undefined), + }; + setPolicyRuntimeBridgeFactoryForTest(() => runtime); + + await expect(PolicyAddCommand.run(["alpha", "--from-file"], rootDir)).rejects.toThrow( + /from-file/, + ); + + expect(runtime.sandboxPolicyAdd).not.toHaveBeenCalled(); + }); }); diff --git a/src/lib/policy-mutate-cli-commands.ts b/src/lib/policy-mutate-cli-commands.ts index 155e831e89..db28cc3a6b 100644 --- a/src/lib/policy-mutate-cli-commands.ts +++ b/src/lib/policy-mutate-cli-commands.ts @@ -55,6 +55,11 @@ export class PolicyAddCommand extends Command { static summary = "Add a network or filesystem policy preset"; static description = "Add a built-in or custom policy preset to a sandbox."; static usage = [" policy-add [preset] [--yes|-y] [--dry-run] [--from-file ] [--from-dir ]"]; + static examples = [ + "<%= config.bin %> alpha policy-add slack --yes", + "<%= config.bin %> alpha policy-add --from-file ./policy.yaml --dry-run", + "<%= config.bin %> alpha policy-add --from-dir ./policies --yes", + ]; static args = { sandboxName: sandboxNameArg, preset: presetArg }; static flags = { help: Flags.help({ char: "h" }), @@ -76,23 +81,16 @@ export class PolicyAddCommand extends Command { } } -export class PolicyAddRawCommand extends Command { - static id = "sandbox:policy-add:raw"; - static strict = false; - static hidden = true; - - public async run(): Promise { - const [sandboxName, ...actionArgs] = this.argv; - await getRuntimeBridge().sandboxPolicyAdd(sandboxName, actionArgs); - } -} - export class PolicyRemoveCommand extends Command { static id = "sandbox:policy-remove"; static strict = true; static summary = "Remove an applied policy preset"; static description = "Remove a built-in or custom policy preset from a sandbox."; static usage = [" policy-remove [preset] [--yes|-y] [--dry-run]"]; + static examples = [ + "<%= config.bin %> alpha policy-remove slack --yes", + "<%= config.bin %> alpha policy-remove slack --dry-run", + ]; static args = { sandboxName: sandboxNameArg, preset: presetArg }; static flags = { help: Flags.help({ char: "h" }), diff --git a/test/cli.test.ts b/test/cli.test.ts index 83c525f26a..edf5380793 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -1290,6 +1290,19 @@ describe("CLI dispatch", () => { expect(start.out).toContain("Channel 'telegram' is already enabled for 'alpha'. Nothing to do."); }); + it("policy and channel mutations reject missing parser-owned values before dispatch", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-mutation-missing-values-")); + writeSandboxRegistry(home); + + const missingPolicyFile = runWithEnv("alpha policy-add --from-file 2>&1", { HOME: home }); + expect(missingPolicyFile.code).not.toBe(0); + expect(missingPolicyFile.out).toContain("--from-file"); + + const missingChannel = runWithEnv("alpha channels add 2>&1", { HOME: home }); + expect(missingChannel.code).not.toBe(0); + expect(missingChannel.out).toContain("channel"); + }); + it("diagnostic commands reject invalid parser-owned flags before dispatch", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-diagnostics-invalid-flags-")); writeSandboxRegistry(home); diff --git a/test/policies.test.ts b/test/policies.test.ts index b6af1a9084..b92765bd2e 100644 --- a/test/policies.test.ts +++ b/test/policies.test.ts @@ -1450,7 +1450,8 @@ Promise.resolve(require(${CLI_PATH}).mainPromise).finally(() => { it("errors when --from-file is missing its path argument", () => { const result = runPolicyAddExternal(["--from-file"]); expect(result.status).not.toBe(0); - expect(result.stderr).toMatch(/--from-file requires a path argument/); + expect(result.stderr).toMatch(/--from-file/); + expect(result.stderr).toMatch(/value|argument|path/); }); it("applies every preset in --from-dir in sorted order and aborts on the first failure", () => { From 75857dcaff8a27ee230712d8c1dc87611dbf9a53 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Sat, 2 May 2026 23:02:33 -0700 Subject: [PATCH 16/65] refactor(cli): improve snapshot command metadata --- src/lib/legacy-oclif-dispatch.ts | 3 +++ src/lib/snapshot-cli-commands.test.ts | 19 +++++++++++++++++++ src/lib/snapshot-cli-commands.ts | 26 ++++++++++++++++++++++---- test/cli.test.ts | 15 +++++++++++++++ 4 files changed, 59 insertions(+), 4 deletions(-) diff --git a/src/lib/legacy-oclif-dispatch.ts b/src/lib/legacy-oclif-dispatch.ts index 735b81e4cb..70b26ac0e8 100644 --- a/src/lib/legacy-oclif-dispatch.ts +++ b/src/lib/legacy-oclif-dispatch.ts @@ -148,6 +148,9 @@ export function resolveSandboxOclifDispatch( case "snapshot": { const snapshotSub = actionArgs[0]; const snapshotArgs = actionArgs.slice(1); + if (!snapshotSub || snapshotSub === "--help" || snapshotSub === "-h") { + return { kind: "oclif", commandId: "sandbox:snapshot", args: [sandboxName] }; + } if (snapshotSub === "list") { if (hasHelpFlag(snapshotArgs)) return { kind: "help", usage: "snapshot list" }; return { kind: "oclif", commandId: "sandbox:snapshot:list", args: [sandboxName, ...snapshotArgs] }; diff --git a/src/lib/snapshot-cli-commands.test.ts b/src/lib/snapshot-cli-commands.test.ts index 7ff51a5ab0..ceb1485fe4 100644 --- a/src/lib/snapshot-cli-commands.test.ts +++ b/src/lib/snapshot-cli-commands.test.ts @@ -5,6 +5,7 @@ import { describe, expect, it, vi } from "vitest"; import { setSnapshotRuntimeBridgeFactoryForTest, + SnapshotCommand, SnapshotCreateCommand, SnapshotListCommand, SnapshotRestoreCommand, @@ -13,6 +14,24 @@ import { const rootDir = process.cwd(); describe("snapshot oclif commands", () => { + it("shows parent snapshot usage through the action", async () => { + const sandboxSnapshot = vi.fn().mockResolvedValue(undefined); + setSnapshotRuntimeBridgeFactoryForTest(() => ({ sandboxSnapshot })); + + await SnapshotCommand.run(["alpha"], rootDir); + + expect(sandboxSnapshot).toHaveBeenCalledWith("alpha", []); + }); + + it("rejects unknown parent snapshot args before dispatch", async () => { + const sandboxSnapshot = vi.fn().mockResolvedValue(undefined); + setSnapshotRuntimeBridgeFactoryForTest(() => ({ sandboxSnapshot })); + + await expect(SnapshotCommand.run(["alpha", "bogus"], rootDir)).rejects.toThrow(/bogus/); + + expect(sandboxSnapshot).not.toHaveBeenCalled(); + }); + it("runs snapshot list through the legacy snapshot implementation", async () => { const sandboxSnapshot = vi.fn().mockResolvedValue(undefined); setSnapshotRuntimeBridgeFactoryForTest(() => ({ sandboxSnapshot })); diff --git a/src/lib/snapshot-cli-commands.ts b/src/lib/snapshot-cli-commands.ts index 635439fd1b..a2251b2f3c 100644 --- a/src/lib/snapshot-cli-commands.ts +++ b/src/lib/snapshot-cli-commands.ts @@ -27,14 +27,22 @@ const sandboxNameArg = Args.string({ export class SnapshotCommand extends Command { static id = "sandbox:snapshot"; - static strict = false; + static strict = true; static summary = "Show snapshot usage"; - static description = "Show snapshot usage or report unknown snapshot subcommands."; + static description = "Show snapshot usage for create, list, and restore subcommands."; static usage = [" snapshot "]; + static examples = [ + "<%= config.bin %> alpha snapshot create", + "<%= config.bin %> alpha snapshot list", + "<%= config.bin %> alpha snapshot restore", + ]; + static args = { + sandboxName: sandboxNameArg, + }; public async run(): Promise { - const [sandboxName, ...actionArgs] = this.argv; - await getRuntimeBridge().sandboxSnapshot(sandboxName, actionArgs); + const { args } = await this.parse(SnapshotCommand); + await getRuntimeBridge().sandboxSnapshot(args.sandboxName, []); } } @@ -44,6 +52,7 @@ export class SnapshotListCommand extends Command { static summary = "List available snapshots"; static description = "List available snapshots for a sandbox."; static usage = [" snapshot list"]; + static examples = ["<%= config.bin %> alpha snapshot list"]; static args = { sandboxName: sandboxNameArg, }; @@ -63,6 +72,11 @@ export class SnapshotRestoreCommand extends Command { static summary = "Restore state from a snapshot"; static description = "Restore sandbox workspace state from a snapshot."; static usage = [" snapshot restore [selector] [--to ]"]; + static examples = [ + "<%= config.bin %> alpha snapshot restore", + "<%= config.bin %> alpha snapshot restore v2", + "<%= config.bin %> alpha snapshot restore before-upgrade --to beta", + ]; static args = { sandboxName: sandboxNameArg, selector: Args.string({ @@ -91,6 +105,10 @@ export class SnapshotCreateCommand extends Command { static summary = "Create a snapshot of sandbox state"; static description = "Create an auto-versioned snapshot of sandbox workspace state."; static usage = [" snapshot create [--name