Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 13 additions & 23 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -118,19 +118,23 @@ jobs:
resource_class: coasys/marvin
steps:
- setup_integration_test_environment
- run:
name: Kill any orphaned executors from previous runs
command: |
# Self-hosted runners reuse workdirs; previous job may have left
# an executor alive (exec() shell-wrap means kill() only kills
# the shell, not the executor grandchild). Clear ports before test.
for port in 15700 15701 15702; do
lsof -ti:$port | xargs -r kill -9 2>/dev/null || true
done
- run:
name: Run integration tests
command: cd ./tests/js && pnpm run test-main
no_output_timeout: 30m
- run:
name: Collect logs on failure
when: on_fail
command: |
mkdir -p /tmp/test-artifacts
cp -r tests/js/tst-tmp/agents/*/ad4m/logs /tmp/test-artifacts/ 2>/dev/null || true
cp -r tests/js/tst-tmp/agents/*/ad4m/holochain /tmp/test-artifacts/ 2>/dev/null || true
# Capture port state for debugging
lsof -i -P -n 2>/dev/null | grep LISTEN > /tmp/test-artifacts/listening-ports.txt || true
- store_artifacts:
when: on_fail
path: /tmp/test-artifacts
destination: test-logs

integration-tests-multi-user-simple:
machine: true
Expand All @@ -157,14 +161,6 @@ jobs:
resource_class: coasys/marvin
steps:
- setup_integration_test_environment
- run:
name: Kill any orphaned executors from previous runs
command: |
# MCP tests use ports 16000-16002 (mcp-http) and 16010-16012 (mcp-auth)
# plus port 3001 (MCP HTTP server). Clear all before starting.
for port in 16000 16001 16002 16010 16011 16012 3001; do
lsof -ti:$port | xargs -r kill -9 2>/dev/null || true
done
- run:
name: Run MCP integration tests
command: cd ./tests/js && pnpm run test-mcp
Expand All @@ -175,12 +171,6 @@ jobs:
resource_class: coasys/marvin
steps:
- setup_integration_test_environment
- run:
name: Kill any orphaned executors from previous runs
command: |
for port in 15700 15701 15702; do
lsof -ti:$port | xargs -r kill -9 2>/dev/null || true
done
- run:
name: Run Ad4mModel unit + integration tests
command: cd ./tests/js && pnpm run test-model
Expand Down
13 changes: 13 additions & 0 deletions cli/src/ad4m_executor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ enum Domain {
enable_mcp: Option<bool>,
#[arg(long, action)]
mcp_port: Option<u16>,
/// Write the executor PID to this file on startup (removed on clean shutdown).
/// Useful for test harnesses that need targeted process cleanup.
#[arg(long)]
pid_file: Option<String>,
},
RunLocalHcServices {},
}
Expand Down Expand Up @@ -189,8 +193,17 @@ async fn main() -> Result<()> {
enable_multi_user,
enable_mcp,
mcp_port,
pid_file,
} = args.domain
{
// Set PID file path as env var so the executor can write/clean it up
if let Some(ref pf) = pid_file {
// SAFETY: set_var is safe here because we're in single-threaded init before spawning.
#[allow(deprecated)]
unsafe {
std::env::set_var("AD4M_PID_FILE", pf);
}
Comment on lines +199 to +205
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🌐 Web query:

In current Rust, why is std::env::set_var unsafe on Unix, and is it sound to call it from inside a #[tokio::main(flavor = "multi_thread")] function after the runtime has started?

💡 Result:

On Unix-like platforms (everything except Windows), std::env::set_var is unsafe because the process environment is a shared global that the OS/C library does not provide a reliably thread-safe way to access: if any other thread (Rust or native/FFI) reads the environment via getenv, environ, or any other libc/internal routine while you mutate it, you can trigger a data race / use-after-free in the C environment machinery (i.e., UB from Rust’s perspective). The standard library docs therefore say that in multi-threaded programs on non-Windows, “the only safe option is to not use set_var or remove_var at all”, because you generally can’t guarantee no other thread is reading the environment (even Rust std may do so, e.g. during DNS lookup). [1] This is also why Rust 2024 marked set_var/remove_var as unsafe. [2] (Background on why POSIX/C makes this hard: [3])

For #[tokio::main(flavor = "multi_thread")]: No, it is not sound (on Unix) to call unsafe { std::env::set_var(...) } from inside that function once the runtime has started. Tokio’s multi-thread runtime creates a pool of worker threads on startup, and after that point you are in a multi-threaded process. [4] That violates set_var’s safety precondition unless you can somehow ensure that no other thread will read or write the environment except through std::env—which the docs explicitly say is not practically guaranteeable. [1]

If you must set an env var for your own process, do it before starting the Tokio multi-thread runtime (e.g., in a synchronous main that sets vars, then builds the runtime and block_ons). Otherwise, prefer passing configuration explicitly, or use Command::env when the goal is only to affect a child process. [1]

Sources
[1] Rust std docs: std::env::set_var safety contract (Rust 1.94.0) — https://doc.rust-lang.org/std/env/fn.set_var.html
[2] Rust 2024 Edition Guide: newly-unsafe functions (set_var, remove_var) — https://doc.rust-lang.org/beta/edition-guide/rust-2024/newly-unsafe-functions.html
[3] Evan Jones: “Setenv is not Thread Safe and C Doesn't Want to Fix It” — https://www.evanjones.ca/setenv-is-not-thread-safe.html
[4] Tokio runtime docs: multi-thread scheduler starts worker threads on startup — https://docs.rs/tokio/latest/tokio/runtime/


Don't call std::env::set_var() after the Tokio runtime has started.

The unsafe block is marked as "single-threaded init", but main() is already running under #[tokio::main(flavor = "multi_thread")]. On Unix, process-wide environment mutation from std::env::set_var is unsafe in multi-threaded contexts—other threads (or Rust std itself during DNS lookups) may read the environment concurrently, causing data races. Tokio's multi-thread runtime spawns worker threads on startup, so this code runs in a multi-threaded process where set_var violates its safety preconditions.

Thread the PID-file path through Ad4mConfig or another explicit startup parameter instead.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cli/src/ad4m_executor.rs` around lines 199 - 205, The code sets AD4M_PID_FILE
via std::env::set_var in ad4m_executor.rs while running under
#[tokio::main(flavor = "multi_thread")], which is unsafe; remove the unsafe
std::env::set_var call and instead thread the PID file path through your startup
config and APIs (e.g., add an Option<PathBuf> pid_file field to Ad4mConfig or an
explicit pid_file parameter on the executor initialization function used in
ad4m_executor.rs), update the call sites that construct/start the executor to
pass the PID path, and have the executor write/clean the PID file from that
explicit value rather than via process-wide env mutation.

}
let tls = if tls_cert_file.is_some() && tls_key_file.is_some() {
Some(TlsConfig {
cert_file_path: tls_cert_file.unwrap(),
Expand Down
11 changes: 11 additions & 0 deletions cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ enum Domain {
enable_mcp: Option<bool>,
#[arg(long, action)]
mcp_port: Option<u16>,
/// Write the executor PID to this file on startup (removed on clean shutdown).
#[arg(long)]
pid_file: Option<String>,
Comment on lines +166 to +168
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail
echo "Run variant definition:"
sed -n '129,169p' cli/src/main.rs
echo
echo "Trailing match arm:"
sed -n '335,352p' cli/src/main.rs

Repository: coasys/ad4m

Length of output: 2064


Update the trailing Domain::Run pattern or this enum change won't compile.

After adding pid_file (along with enable_mcp and mcp_port), the match args.domain arm at the bottom still destructures Domain::Run without these three fields. Use Domain::Run { .. } there, or list the missing fields explicitly.

Minimal fix
-        Domain::Run {
-            app_data_path: _,
-            network_bootstrap_seed: _,
-            language_language_only: _,
-            run_dapp_server: _,
-            gql_port: _,
-            hc_admin_port: _,
-            hc_app_port: _,
-            hc_use_bootstrap: _,
-            hc_use_local_proxy: _,
-            hc_use_mdns: _,
-            hc_use_proxy: _,
-            hc_proxy_url: _,
-            hc_bootstrap_url: _,
-            connect_holochain: _,
-            admin_credential: _,
-            enable_multi_user: _
-        } => unreachable!(),
+        Domain::Run { .. } => unreachable!(),
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cli/src/main.rs` around lines 166 - 168, The Domain::Run enum variant and the
args.domain match need updating because you added pid_file, enable_mcp, and
mcp_port to the Run variant; update the match arm that matches args.domain (the
Domain::Run arm) to avoid destructuring missing fields by either changing it to
Domain::Run { .. } or listing the new fields explicitly (pid_file, enable_mcp,
mcp_port) so the match compiles and continues to access other fields as before.

},
RunLocalHcServices {},
Eve {
Expand Down Expand Up @@ -243,8 +246,14 @@ async fn main() -> Result<()> {
enable_multi_user,
enable_mcp,
mcp_port,
pid_file,
} = args.domain
{
// Set PID file path as env var so the executor can write/clean it up
if let Some(ref pf) = pid_file {
#[allow(deprecated)]
unsafe { std::env::set_var("AD4M_PID_FILE", pf); }
Comment on lines +252 to +255
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🌐 Web query:

In current Rust, why is std::env::set_var unsafe on Unix, and is it sound to call it from inside a #[tokio::main(flavor = "multi_thread")] function after the runtime has started?

💡 Result:

Why std::env::set_var is unsafe on Unix (in current Rust)

On many Unix platforms (notably Linux with glibc), the process environment is a piece of global mutable state managed by C/POSIX APIs (setenv/unsetenv/getenv) that are not reliably thread-safe. If one thread mutates the environment while another thread (or a library call) reads it, this can cause crashes and other undefined behavior at the process level. Rust cannot enforce (or even reliably detect) that no other thread (including threads created by libraries) might touch the environment, and even Rust/stdlib code may read the environment “behind your back” (e.g. during networking/DNS operations). Therefore, in Rust 2024 edition, std::env::set_var / remove_var are marked unsafe so the caller must uphold the required global precondition. [1], [2]

Is it sound to call it inside #[tokio::main(flavor = "multi_thread")] after the runtime has started?

Generally, no (on typical Unix targets like Linux). A Tokio multi-thread runtime creates worker threads on startup; your async fn main() runs after the executor is running and threads exist. [3], [4] At that point you cannot uphold set_var’s safety requirement (“no other thread might be running and reading the environment”, including via libc / other libraries / std internals), so calling unsafe { std::env::set_var(...) } is not sound in general. [1]

If you need per-task/per-component configuration, prefer explicit configuration objects, globals you control (mutex/once), or pass env vars only to child processes via Command::env(...) instead of mutating the process environment.

Sources: [1] std::env::set_var docs, safety section (doc.rust-lang.org); [2] Rust 2024 edition guide (“Newly unsafe functions”) (doc.rust-lang.org); [3] Tokio runtime docs (multi-thread runtime starts worker threads on startup) (docs.rs); [4] Tokio #[tokio::main] runs async fn main after runtime started (stackoverflow.com).

Citations:


🏁 Script executed:

# Check the context around lines 252-255 in cli/src/main.rs
cat -n cli/src/main.rs | sed -n '240,270p'

Repository: coasys/ad4m

Length of output: 1224


🏁 Script executed:

# Find the main function and check for #[tokio::main] attribute
head -n 300 cli/src/main.rs | grep -A 10 -B 5 "^#\[tokio::main"

Repository: coasys/ad4m

Length of output: 409


Remove the unsafe set_var() call and pass the PID-file path through config instead.

The unsafe { std::env::set_var(...) } call at lines 254–255 is unsound. std::env::set_var is unsafe on Unix because the process environment is global mutable state that is not reliably thread-safe. Since this code runs inside #[tokio::main(flavor = "multi_thread")] after the runtime (and its worker threads) have started, other threads may be reading the environment via libc calls, DNS operations, or stdlib internals at any moment. Mutating it here violates the safety precondition and can cause data races and undefined behavior.

Pass the PID-file path to the executor via the config object (Ad4mConfig) instead of mutating process-wide environment state.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cli/src/main.rs` around lines 252 - 255, Remove the unsafe environment
mutation: stop calling unsafe std::env::set_var("AD4M_PID_FILE", ...) in the
pid_file handling block; instead add a PID path field to the Ad4mConfig (or
existing config struct) and populate it from pid_file, then ensure the executor
code that previously read AD4M_PID_FILE now reads the PID path from Ad4mConfig
(update the executor initialization to accept the config field). Update
references to AD4M_PID_FILE to use the new config property and remove the unsafe
block entirely.

}
let _ = tokio::spawn(async move {
rust_executor::run(Ad4mConfig {
app_data_path,
Expand All @@ -269,6 +278,8 @@ async fn main() -> Result<()> {
auto_permit_cap_requests: None,
tls: None,
log_holochain_metrics: None,
hc_relay_url: None,
smtp_config: None,
}).await
}).await;

Expand Down
7 changes: 7 additions & 0 deletions rust-executor/src/globals.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
use lazy_static::lazy_static;
use std::sync::Mutex;
use tokio::sync::oneshot;

lazy_static! {
/// The current version of AD4M
pub static ref AD4M_VERSION: String = String::from("0.12.0-rc1-dev.2");
}

/// Global shutdown signal sender. Used by `runtime_quit` GQL mutation and signal handlers
/// to trigger a graceful shutdown of the executor.
/// Wrapped in Mutex<Option<...>> so we can take() the sender from a shared static reference.
pub static SHUTDOWN_TX: Mutex<Option<oneshot::Sender<()>>> = Mutex::new(None);

/// Struct representing oldest supported version and indicator if state should be cleared if update is required
pub struct OldestVersion {
pub version: String,
Expand Down
16 changes: 16 additions & 0 deletions rust-executor/src/graphql/graphql_types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,22 @@ pub struct RuntimeInfo {
pub is_unlocked: bool,
}

/// Readiness status returned by the `runtimeReadiness` query.
/// Each field indicates whether a subsystem has completed initialization.
/// Test harnesses should poll this instead of using `sleep()`.
#[derive(GraphQLObject, Default, Debug, Deserialize, Serialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct ReadinessStatus {
/// GraphQL server is accepting requests (always true if you can call this query)
pub gql_ready: bool,
/// Holochain conductor is running and connected
pub holochain_ready: bool,
/// Agent has been generated/unlocked
pub agent_initialized: bool,
/// Languages have been loaded into the language controller
pub languages_loaded: bool,
}

#[derive(GraphQLObject, Default, Debug, Deserialize, Serialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct SentMessage {
Expand Down
12 changes: 11 additions & 1 deletion rust-executor/src/graphql/mutation_resolvers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2413,7 +2413,17 @@ impl Mutation {

async fn runtime_quit(&self, context: &RequestContext) -> FieldResult<bool> {
check_capability(&context.capabilities, &RUNTIME_QUIT_CAPABILITY)?;
std::process::exit(0);
// Trigger graceful shutdown via the global shutdown channel.
// The main loop will shut down Holochain conductor, flush state, and exit cleanly.
// Falls back to process::exit(0) if the channel was already consumed or not set.
if let Some(tx) = crate::globals::SHUTDOWN_TX.lock().unwrap().take() {
log::info!("runtime_quit: sending graceful shutdown signal");
let _ = tx.send(());
Ok(true)
} else {
log::warn!("runtime_quit: shutdown channel unavailable, falling back to process::exit");
std::process::exit(0);
}
}

async fn runtime_remove_friends(
Expand Down
21 changes: 21 additions & 0 deletions rust-executor/src/graphql/query_resolvers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,27 @@ impl Query {
})
}

/// Returns the readiness status of executor subsystems.
/// Test harnesses should poll this query instead of using `sleep()`.
/// No capability check — readiness is safe to expose publicly.
async fn runtime_readiness(&self, _context: &RequestContext) -> FieldResult<ReadinessStatus> {
let holochain_ready = crate::holochain_service::maybe_get_holochain_service()
.await
.is_some();
Comment on lines +813 to +815
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

holochain_ready currently signals presence, not actual readiness.

Line 813 only checks whether a service handle exists. That can become true before the conductor is fully operational, so pollers may proceed prematurely.

Proposed fix
-        let holochain_ready = crate::holochain_service::maybe_get_holochain_service()
-            .await
-            .is_some();
+        let holochain_ready = if let Some(interface) =
+            crate::holochain_service::maybe_get_holochain_service().await
+        {
+            // Use a lightweight RPC as readiness probe, not just handle existence.
+            interface.get_network_metrics().await.is_ok()
+        } else {
+            false
+        };
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@rust-executor/src/graphql/query_resolvers.rs` around lines 813 - 815, The
current assignment to holochain_ready only checks handle presence via
maybe_get_holochain_service().await.is_some(), which can be true before the
conductor is operational; change this to obtain the service handle (call
maybe_get_holochain_service().await) and then perform an explicit readiness
check on the returned service (e.g. await a provided readiness/status method
like is_ready().await or status().await and compare to Ready), optionally with a
short poll/timeout loop if the API is asynchronous, and set holochain_ready
based on that readiness result rather than mere presence.


let (agent_initialized, languages_loaded) =
AgentService::with_global_instance(|agent_service| {
(agent_service.is_initialized(), agent_service.is_unlocked())
});
Comment on lines +817 to +820
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

languages_loaded is mapped to wallet unlock state, not language-controller state.

Line 819 assigns languages_loaded from agent_service.is_unlocked(), which does not verify whether languages are loaded. This can return incorrect readiness.

Proposed fix
-        let (agent_initialized, languages_loaded) =
-            AgentService::with_global_instance(|agent_service| {
-                (agent_service.is_initialized(), agent_service.is_unlocked())
-            });
+        let agent_initialized =
+            AgentService::with_global_instance(|agent_service| agent_service.is_initialized());
+
+        let languages_loaded = {
+            let controller = LanguageController::global_instance();
+            // Replace with a stricter internal readiness signal if available.
+            !controller.get_installed_languages(None).await.is_empty()
+        };

Also applies to: 826-826

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@rust-executor/src/graphql/query_resolvers.rs` around lines 817 - 820, The
variable languages_loaded is incorrectly derived from
AgentService::with_global_instance using agent_service.is_unlocked(); replace
that check with the language-controller's loaded state: inside
AgentService::with_global_instance, call the appropriate language controller
accessor (e.g., agent_service.language_controller() or similar) and use its
is_loaded()/are_languages_loaded() method to set languages_loaded instead of
is_unlocked(); update both occurrences (the one assigning languages_loaded at
the shown block and the similar occurrence around line 826) so languages_loaded
reflects the language-controller state rather than wallet/unlock state.


Ok(ReadinessStatus {
gql_ready: true, // If this query returns, GQL is ready
holochain_ready,
agent_initialized,
languages_loaded, // Currently maps to agent unlocked (languages load during unlock)
})
}

async fn runtime_known_link_language_templates(
&self,
context: &RequestContext,
Expand Down
69 changes: 69 additions & 0 deletions rust-executor/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ pub mod types;
use std::thread::JoinHandle;

use log::{error, info, warn};
use tokio::sync::oneshot;

use crate::{
agent::AgentService, ai_service::AIService, dapp_server::serve_dapp, db::Ad4mDb,
Expand Down Expand Up @@ -190,11 +191,79 @@ pub async fn run(mut config: Ad4mConfig) -> JoinHandle<()> {
}
}

// Set up graceful shutdown channel.
// The sender is stored globally so runtime_quit and signal handlers can trigger shutdown.
let (shutdown_tx, shutdown_rx) = oneshot::channel::<()>();
{
let mut guard = crate::globals::SHUTDOWN_TX.lock().unwrap();
*guard = Some(shutdown_tx);
}

// Spawn a task that listens for OS signals (SIGTERM/SIGINT) and triggers shutdown.
// This replaces the old ctrlc handler in the CLI binaries with an in-executor handler
// that allows graceful cleanup of Holochain conductor and databases.
#[cfg(unix)]
{
tokio::spawn(async {
use tokio::signal;
let ctrl_c = signal::ctrl_c();
let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate())
.expect("failed to install SIGTERM handler");

tokio::select! {
_ = ctrl_c => info!("Received SIGINT, initiating graceful shutdown..."),
_ = sigterm.recv() => info!("Received SIGTERM, initiating graceful shutdown..."),
}

// Trigger shutdown via the global channel
if let Some(tx) = crate::globals::SHUTDOWN_TX.lock().unwrap().take() {
let _ = tx.send(());
}
});
}

// Spawn the shutdown handler that waits for the signal and cleans up
tokio::spawn(async move {
if shutdown_rx.await.is_ok() {
info!("Shutdown signal received, cleaning up...");

// 1. Shut down Holochain conductor gracefully
if let Some(holochain_service) = holochain_service::maybe_get_holochain_service().await
{
info!("Shutting down Holochain conductor...");
match holochain_service.shutdown().await {
Ok(()) => info!("Holochain conductor shut down cleanly"),
Err(e) => warn!("Error shutting down Holochain conductor: {}", e),
}
}

// 2. Write PID file removal if it exists
if let Ok(pid_file) = std::env::var("AD4M_PID_FILE") {
let _ = std::fs::remove_file(&pid_file);
info!("Removed PID file: {}", pid_file);
}

info!("Graceful shutdown complete, exiting.");
std::process::exit(0);
}
});

// Initialize logging for CLI (stdout)
// Respects RUST_LOG environment variable if set
crate::logging::init_cli_logging(None);
config.prepare();

// Write PID file if requested via environment variable or config.
// Test harnesses can set AD4M_PID_FILE to get a reliable PID for targeted cleanup.
if let Ok(pid_file) = std::env::var("AD4M_PID_FILE") {
let pid = std::process::id();
if let Err(e) = std::fs::write(&pid_file, pid.to_string()) {
warn!("Failed to write PID file {}: {}", pid_file, e);
} else {
info!("Wrote PID {} to {}", pid, pid_file);
}
}

// Store config globally so services (e.g. agent mutation resolvers) can access it
crate::config::set_global_config(config.clone());

Expand Down
16 changes: 15 additions & 1 deletion tests/integration.bats
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ setup_file() {
echo "done." >&3
echo "Starting agent 1..." >&3
./target/release/ad4m run --app-data-path ${current_dir}/tests/ad4m1 --gql-port 4000 &
AD4M_PID=$!
export AD4M_PID
sleep 5
echo "done." >&3

Expand All @@ -31,7 +33,19 @@ setup_file() {
}

teardown_file() {
killall ad4m
# Graceful shutdown: SIGTERM first, then escalate to SIGKILL if needed.
# Never use `killall ad4m` — it kills ALL ad4m processes on the machine,
# including other CI jobs and dev instances.
if [ -n "$AD4M_PID" ]; then
kill -TERM "$AD4M_PID" 2>/dev/null || true
for i in $(seq 1 10); do
kill -0 "$AD4M_PID" 2>/dev/null || break
sleep 1
done
kill -9 "$AD4M_PID" 2>/dev/null || true
fi
# Port-based fallback in case PID tracking missed something
lsof -ti:4000 | xargs -r kill -9 2>/dev/null || true
Comment on lines +47 to +48
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

The port fallback can kill an unrelated process.

If PID tracking misses the executor, lsof -ti:4000 | kill -9 will SIGKILL whatever owns port 4000 at teardown, including another local service or CI job. Keep the fallback scoped to the recorded executor PID / pid-file, or verify the command name before killing.

Based on learnings "Kill any lingering ad4m-executor processes before running integration tests to avoid port conflicts".

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/integration.bats` around lines 47 - 48, The current port fallback
unconditionally kills whatever owns port 4000; change it to target only the
recorded executor PID (from the pid-file used by the test harness) and verify
the process is the executor before killing: read the pid from the existing
pid-file (or the variable that stores the executor PID), check that
/proc/<pid>/cmdline (or ps -p <pid> -o comm=) contains "ad4m-executor" and that
the pid actually listens on :4000, then kill that PID; if no pid-file exists
fall back to using lsof but filter results to PIDs whose command matches
"ad4m-executor" before invoking kill. Ensure you update the teardown code that
currently runs `lsof -ti:4000 | xargs -r kill -9` to this safer,
PID-and-command-verified sequence.

}

setup() {
Expand Down
34 changes: 5 additions & 29 deletions tests/js/tests/multi-user-simple.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import fs from "fs-extra";
import { fileURLToPath } from 'url';
import * as chai from "chai";
import chaiAsPromised from "chai-as-promised";
import { apolloClient, sleep, startExecutor, runHcLocalServices } from "../utils/utils";
import { apolloClient, sleep, startExecutor, runHcLocalServices, gracefulShutdown } from "../utils/utils";
import { ChildProcess } from 'node:child_process';
import fetch from 'node-fetch'
import { LinkQuery } from "@coasys/ad4m";
Expand Down Expand Up @@ -60,20 +60,8 @@ describe("Multi-User Simple integration tests", () => {
})

after(async () => {
if (executorProcess) {
while (!executorProcess?.killed) {
let status = executorProcess?.kill();
console.log("killed executor with", status);
await sleep(500);
}
}
if (localServicesProcess) {
while (!localServicesProcess?.killed) {
let status = localServicesProcess?.kill();
console.log("killed local services with", status);
await sleep(500);
}
}
await gracefulShutdown(executorProcess, "executor");
await gracefulShutdown(localServicesProcess, "local services");
})

describe("Multi-User Configuration", () => {
Expand Down Expand Up @@ -1778,13 +1766,7 @@ describe("Multi-User Simple integration tests", () => {

after(async function() {
this.timeout(20000);
if (node2ExecutorProcess) {
while (!node2ExecutorProcess?.killed) {
let status = node2ExecutorProcess?.kill();
console.log("killed node 2 executor with", status);
await sleep(500);
}
}
await gracefulShutdown(node2ExecutorProcess, "node 2 executor");
});

it("should return all DIDs in 'others()' for each user", async function() {
Expand Down Expand Up @@ -2861,13 +2843,7 @@ describe("Multi-User Simple integration tests", () => {

after(async function() {
this.timeout(20000);
if (node3ExecutorProcess) {
while (!node3ExecutorProcess?.killed) {
let status = node3ExecutorProcess?.kill();
console.log("killed node 3 executor with", status);
await sleep(500);
}
}
await gracefulShutdown(node3ExecutorProcess, "node 3 executor");
});

it("should route signals between remote main agent and local managed user", async function() {
Expand Down
Loading