Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 13 additions & 23 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -118,19 +118,23 @@ jobs:
resource_class: coasys/marvin
steps:
- setup_integration_test_environment
- run:
name: Kill any orphaned executors from previous runs
command: |
# Self-hosted runners reuse workdirs; previous job may have left
# an executor alive (exec() shell-wrap means kill() only kills
# the shell, not the executor grandchild). Clear ports before test.
for port in 15700 15701 15702; do
lsof -ti:$port | xargs -r kill -9 2>/dev/null || true
done
- run:
name: Run integration tests
command: cd ./tests/js && pnpm run test-main
no_output_timeout: 30m
- run:
name: Collect logs on failure
when: on_fail
command: |
mkdir -p /tmp/test-artifacts
cp -r tests/js/tst-tmp/agents/*/ad4m/logs /tmp/test-artifacts/ 2>/dev/null || true
cp -r tests/js/tst-tmp/agents/*/ad4m/holochain /tmp/test-artifacts/ 2>/dev/null || true
# Capture port state for debugging
lsof -i -P -n 2>/dev/null | grep LISTEN > /tmp/test-artifacts/listening-ports.txt || true
- store_artifacts:
when: on_fail
path: /tmp/test-artifacts
destination: test-logs

integration-tests-multi-user-simple:
machine: true
Expand All @@ -157,14 +161,6 @@ jobs:
resource_class: coasys/marvin
steps:
- setup_integration_test_environment
- run:
name: Kill any orphaned executors from previous runs
command: |
# MCP tests use ports 16000-16002 (mcp-http) and 16010-16012 (mcp-auth)
# plus port 3001 (MCP HTTP server). Clear all before starting.
for port in 16000 16001 16002 16010 16011 16012 3001; do
lsof -ti:$port | xargs -r kill -9 2>/dev/null || true
done
- run:
name: Run MCP integration tests
command: cd ./tests/js && pnpm run test-mcp
Expand All @@ -175,12 +171,6 @@ jobs:
resource_class: coasys/marvin
steps:
- setup_integration_test_environment
- run:
name: Kill any orphaned executors from previous runs
command: |
for port in 15700 15701 15702; do
lsof -ti:$port | xargs -r kill -9 2>/dev/null || true
done
- run:
name: Run Ad4mModel unit + integration tests
command: cd ./tests/js && pnpm run test-model
Expand Down
13 changes: 13 additions & 0 deletions cli/src/ad4m_executor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ enum Domain {
enable_mcp: Option<bool>,
#[arg(long, action)]
mcp_port: Option<u16>,
/// Write the executor PID to this file on startup (removed on clean shutdown).
/// Useful for test harnesses that need targeted process cleanup.
#[arg(long)]
pid_file: Option<String>,
},
RunLocalHcServices {},
}
Expand Down Expand Up @@ -189,8 +193,17 @@ async fn main() -> Result<()> {
enable_multi_user,
enable_mcp,
mcp_port,
pid_file,
} = args.domain
{
// Set PID file path as env var so the executor can write/clean it up
if let Some(ref pf) = pid_file {
// SAFETY: set_var is safe here because we're in single-threaded init before spawning.
#[allow(deprecated)]
unsafe {
std::env::set_var("AD4M_PID_FILE", pf);
}
Comment on lines +199 to +205
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🌐 Web query:

In current Rust, why is std::env::set_var unsafe on Unix, and is it sound to call it from inside a #[tokio::main(flavor = "multi_thread")] function after the runtime has started?

💡 Result:

On Unix-like platforms (everything except Windows), std::env::set_var is unsafe because the process environment is a shared global that the OS/C library does not provide a reliably thread-safe way to access: if any other thread (Rust or native/FFI) reads the environment via getenv, environ, or any other libc/internal routine while you mutate it, you can trigger a data race / use-after-free in the C environment machinery (i.e., UB from Rust’s perspective). The standard library docs therefore say that in multi-threaded programs on non-Windows, “the only safe option is to not use set_var or remove_var at all”, because you generally can’t guarantee no other thread is reading the environment (even Rust std may do so, e.g. during DNS lookup). [1] This is also why Rust 2024 marked set_var/remove_var as unsafe. [2] (Background on why POSIX/C makes this hard: [3])

For #[tokio::main(flavor = "multi_thread")]: No, it is not sound (on Unix) to call unsafe { std::env::set_var(...) } from inside that function once the runtime has started. Tokio’s multi-thread runtime creates a pool of worker threads on startup, and after that point you are in a multi-threaded process. [4] That violates set_var’s safety precondition unless you can somehow ensure that no other thread will read or write the environment except through std::env—which the docs explicitly say is not practically guaranteeable. [1]

If you must set an env var for your own process, do it before starting the Tokio multi-thread runtime (e.g., in a synchronous main that sets vars, then builds the runtime and block_ons). Otherwise, prefer passing configuration explicitly, or use Command::env when the goal is only to affect a child process. [1]

Sources
[1] Rust std docs: std::env::set_var safety contract (Rust 1.94.0) — https://doc.rust-lang.org/std/env/fn.set_var.html
[2] Rust 2024 Edition Guide: newly-unsafe functions (set_var, remove_var) — https://doc.rust-lang.org/beta/edition-guide/rust-2024/newly-unsafe-functions.html
[3] Evan Jones: “Setenv is not Thread Safe and C Doesn't Want to Fix It” — https://www.evanjones.ca/setenv-is-not-thread-safe.html
[4] Tokio runtime docs: multi-thread scheduler starts worker threads on startup — https://docs.rs/tokio/latest/tokio/runtime/


Don't call std::env::set_var() after the Tokio runtime has started.

The unsafe block is marked as "single-threaded init", but main() is already running under #[tokio::main(flavor = "multi_thread")]. On Unix, process-wide environment mutation from std::env::set_var is unsafe in multi-threaded contexts—other threads (or Rust std itself during DNS lookups) may read the environment concurrently, causing data races. Tokio's multi-thread runtime spawns worker threads on startup, so this code runs in a multi-threaded process where set_var violates its safety preconditions.

Thread the PID-file path through Ad4mConfig or another explicit startup parameter instead.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cli/src/ad4m_executor.rs` around lines 199 - 205, The code sets AD4M_PID_FILE
via std::env::set_var in ad4m_executor.rs while running under
#[tokio::main(flavor = "multi_thread")], which is unsafe; remove the unsafe
std::env::set_var call and instead thread the PID file path through your startup
config and APIs (e.g., add an Option<PathBuf> pid_file field to Ad4mConfig or an
explicit pid_file parameter on the executor initialization function used in
ad4m_executor.rs), update the call sites that construct/start the executor to
pass the PID path, and have the executor write/clean the PID file from that
explicit value rather than via process-wide env mutation.

}
let tls = if tls_cert_file.is_some() && tls_key_file.is_some() {
Some(TlsConfig {
cert_file_path: tls_cert_file.unwrap(),
Expand Down
11 changes: 11 additions & 0 deletions cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ enum Domain {
enable_mcp: Option<bool>,
#[arg(long, action)]
mcp_port: Option<u16>,
/// Write the executor PID to this file on startup (removed on clean shutdown).
#[arg(long)]
pid_file: Option<String>,
Comment on lines +166 to +168
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail
echo "Run variant definition:"
sed -n '129,169p' cli/src/main.rs
echo
echo "Trailing match arm:"
sed -n '335,352p' cli/src/main.rs

Repository: coasys/ad4m

Length of output: 2064


Update the trailing Domain::Run pattern or this enum change won't compile.

After adding pid_file (along with enable_mcp and mcp_port), the match args.domain arm at the bottom still destructures Domain::Run without these three fields. Use Domain::Run { .. } there, or list the missing fields explicitly.

Minimal fix
-        Domain::Run {
-            app_data_path: _,
-            network_bootstrap_seed: _,
-            language_language_only: _,
-            run_dapp_server: _,
-            gql_port: _,
-            hc_admin_port: _,
-            hc_app_port: _,
-            hc_use_bootstrap: _,
-            hc_use_local_proxy: _,
-            hc_use_mdns: _,
-            hc_use_proxy: _,
-            hc_proxy_url: _,
-            hc_bootstrap_url: _,
-            connect_holochain: _,
-            admin_credential: _,
-            enable_multi_user: _
-        } => unreachable!(),
+        Domain::Run { .. } => unreachable!(),
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cli/src/main.rs` around lines 166 - 168, The Domain::Run enum variant and the
args.domain match need updating because you added pid_file, enable_mcp, and
mcp_port to the Run variant; update the match arm that matches args.domain (the
Domain::Run arm) to avoid destructuring missing fields by either changing it to
Domain::Run { .. } or listing the new fields explicitly (pid_file, enable_mcp,
mcp_port) so the match compiles and continues to access other fields as before.

},
RunLocalHcServices {},
Eve {
Expand Down Expand Up @@ -243,8 +246,14 @@ async fn main() -> Result<()> {
enable_multi_user,
enable_mcp,
mcp_port,
pid_file,
} = args.domain
{
// Set PID file path as env var so the executor can write/clean it up
if let Some(ref pf) = pid_file {
#[allow(deprecated)]
unsafe { std::env::set_var("AD4M_PID_FILE", pf); }
Comment on lines +252 to +255
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🌐 Web query:

In current Rust, why is std::env::set_var unsafe on Unix, and is it sound to call it from inside a #[tokio::main(flavor = "multi_thread")] function after the runtime has started?

💡 Result:

Why std::env::set_var is unsafe on Unix (in current Rust)

On many Unix platforms (notably Linux with glibc), the process environment is a piece of global mutable state managed by C/POSIX APIs (setenv/unsetenv/getenv) that are not reliably thread-safe. If one thread mutates the environment while another thread (or a library call) reads it, this can cause crashes and other undefined behavior at the process level. Rust cannot enforce (or even reliably detect) that no other thread (including threads created by libraries) might touch the environment, and even Rust/stdlib code may read the environment “behind your back” (e.g. during networking/DNS operations). Therefore, in Rust 2024 edition, std::env::set_var / remove_var are marked unsafe so the caller must uphold the required global precondition. [1], [2]

Is it sound to call it inside #[tokio::main(flavor = "multi_thread")] after the runtime has started?

Generally, no (on typical Unix targets like Linux). A Tokio multi-thread runtime creates worker threads on startup; your async fn main() runs after the executor is running and threads exist. [3], [4] At that point you cannot uphold set_var’s safety requirement (“no other thread might be running and reading the environment”, including via libc / other libraries / std internals), so calling unsafe { std::env::set_var(...) } is not sound in general. [1]

If you need per-task/per-component configuration, prefer explicit configuration objects, globals you control (mutex/once), or pass env vars only to child processes via Command::env(...) instead of mutating the process environment.

Sources: [1] std::env::set_var docs, safety section (doc.rust-lang.org); [2] Rust 2024 edition guide (“Newly unsafe functions”) (doc.rust-lang.org); [3] Tokio runtime docs (multi-thread runtime starts worker threads on startup) (docs.rs); [4] Tokio #[tokio::main] runs async fn main after runtime started (stackoverflow.com).

Citations:


🏁 Script executed:

# Check the context around lines 252-255 in cli/src/main.rs
cat -n cli/src/main.rs | sed -n '240,270p'

Repository: coasys/ad4m

Length of output: 1224


🏁 Script executed:

# Find the main function and check for #[tokio::main] attribute
head -n 300 cli/src/main.rs | grep -A 10 -B 5 "^#\[tokio::main"

Repository: coasys/ad4m

Length of output: 409


Remove the unsafe set_var() call and pass the PID-file path through config instead.

The unsafe { std::env::set_var(...) } call at lines 254–255 is unsound. std::env::set_var is unsafe on Unix because the process environment is global mutable state that is not reliably thread-safe. Since this code runs inside #[tokio::main(flavor = "multi_thread")] after the runtime (and its worker threads) have started, other threads may be reading the environment via libc calls, DNS operations, or stdlib internals at any moment. Mutating it here violates the safety precondition and can cause data races and undefined behavior.

Pass the PID-file path to the executor via the config object (Ad4mConfig) instead of mutating process-wide environment state.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cli/src/main.rs` around lines 252 - 255, Remove the unsafe environment
mutation: stop calling unsafe std::env::set_var("AD4M_PID_FILE", ...) in the
pid_file handling block; instead add a PID path field to the Ad4mConfig (or
existing config struct) and populate it from pid_file, then ensure the executor
code that previously read AD4M_PID_FILE now reads the PID path from Ad4mConfig
(update the executor initialization to accept the config field). Update
references to AD4M_PID_FILE to use the new config property and remove the unsafe
block entirely.

}
let _ = tokio::spawn(async move {
rust_executor::run(Ad4mConfig {
app_data_path,
Expand All @@ -269,6 +278,8 @@ async fn main() -> Result<()> {
auto_permit_cap_requests: None,
tls: None,
log_holochain_metrics: None,
hc_relay_url: None,
smtp_config: None,
}).await
}).await;

Expand Down
7 changes: 7 additions & 0 deletions rust-executor/src/globals.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
use lazy_static::lazy_static;
use std::sync::Mutex;
use tokio::sync::oneshot;

lazy_static! {
/// The current version of AD4M
pub static ref AD4M_VERSION: String = String::from("0.12.0-rc1-dev.2");
}

/// Global shutdown signal sender. Used by `runtime_quit` GQL mutation and signal handlers
/// to trigger a graceful shutdown of the executor.
/// Wrapped in Mutex<Option<...>> so we can take() the sender from a shared static reference.
pub static SHUTDOWN_TX: Mutex<Option<oneshot::Sender<()>>> = Mutex::new(None);

/// Struct representing oldest supported version and indicator if state should be cleared if update is required
pub struct OldestVersion {
pub version: String,
Expand Down
16 changes: 16 additions & 0 deletions rust-executor/src/graphql/graphql_types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,22 @@ pub struct RuntimeInfo {
pub is_unlocked: bool,
}

/// Readiness status returned by the `runtimeReadiness` query.
/// Each field indicates whether a subsystem has completed initialization.
/// Test harnesses should poll this instead of using `sleep()`.
#[derive(GraphQLObject, Default, Debug, Deserialize, Serialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct ReadinessStatus {
/// GraphQL server is accepting requests (always true if you can call this query)
pub gql_ready: bool,
/// Holochain conductor is running and connected
pub holochain_ready: bool,
/// Agent has been generated/unlocked
pub agent_initialized: bool,
/// Languages have been loaded into the language controller
pub languages_loaded: bool,
}

#[derive(GraphQLObject, Default, Debug, Deserialize, Serialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct SentMessage {
Expand Down
12 changes: 11 additions & 1 deletion rust-executor/src/graphql/mutation_resolvers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2413,7 +2413,17 @@ impl Mutation {

async fn runtime_quit(&self, context: &RequestContext) -> FieldResult<bool> {
check_capability(&context.capabilities, &RUNTIME_QUIT_CAPABILITY)?;
std::process::exit(0);
// Trigger graceful shutdown via the global shutdown channel.
// The main loop will shut down Holochain conductor, flush state, and exit cleanly.
// Falls back to process::exit(0) if the channel was already consumed or not set.
if let Some(tx) = crate::globals::SHUTDOWN_TX.lock().unwrap().take() {
log::info!("runtime_quit: sending graceful shutdown signal");
let _ = tx.send(());
Ok(true)
} else {
log::warn!("runtime_quit: shutdown channel unavailable, falling back to process::exit");
std::process::exit(0);
}
}

async fn runtime_remove_friends(
Expand Down
21 changes: 21 additions & 0 deletions rust-executor/src/graphql/query_resolvers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,27 @@ impl Query {
})
}

/// Returns the readiness status of executor subsystems.
/// Test harnesses should poll this query instead of using `sleep()`.
/// No capability check — readiness is safe to expose publicly.
async fn runtime_readiness(&self, _context: &RequestContext) -> FieldResult<ReadinessStatus> {
let holochain_ready = crate::holochain_service::maybe_get_holochain_service()
.await
.is_some();
Comment on lines +813 to +815
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

holochain_ready currently signals presence, not actual readiness.

Line 813 only checks whether a service handle exists. That can become true before the conductor is fully operational, so pollers may proceed prematurely.

Proposed fix
-        let holochain_ready = crate::holochain_service::maybe_get_holochain_service()
-            .await
-            .is_some();
+        let holochain_ready = if let Some(interface) =
+            crate::holochain_service::maybe_get_holochain_service().await
+        {
+            // Use a lightweight RPC as readiness probe, not just handle existence.
+            interface.get_network_metrics().await.is_ok()
+        } else {
+            false
+        };
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@rust-executor/src/graphql/query_resolvers.rs` around lines 813 - 815, The
current assignment to holochain_ready only checks handle presence via
maybe_get_holochain_service().await.is_some(), which can be true before the
conductor is operational; change this to obtain the service handle (call
maybe_get_holochain_service().await) and then perform an explicit readiness
check on the returned service (e.g. await a provided readiness/status method
like is_ready().await or status().await and compare to Ready), optionally with a
short poll/timeout loop if the API is asynchronous, and set holochain_ready
based on that readiness result rather than mere presence.


let (agent_initialized, languages_loaded) =
AgentService::with_global_instance(|agent_service| {
(agent_service.is_initialized(), agent_service.is_unlocked())
});
Comment on lines +817 to +820
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

languages_loaded is mapped to wallet unlock state, not language-controller state.

Line 819 assigns languages_loaded from agent_service.is_unlocked(), which does not verify whether languages are loaded. This can return incorrect readiness.

Proposed fix
-        let (agent_initialized, languages_loaded) =
-            AgentService::with_global_instance(|agent_service| {
-                (agent_service.is_initialized(), agent_service.is_unlocked())
-            });
+        let agent_initialized =
+            AgentService::with_global_instance(|agent_service| agent_service.is_initialized());
+
+        let languages_loaded = {
+            let controller = LanguageController::global_instance();
+            // Replace with a stricter internal readiness signal if available.
+            !controller.get_installed_languages(None).await.is_empty()
+        };

Also applies to: 826-826

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@rust-executor/src/graphql/query_resolvers.rs` around lines 817 - 820, The
variable languages_loaded is incorrectly derived from
AgentService::with_global_instance using agent_service.is_unlocked(); replace
that check with the language-controller's loaded state: inside
AgentService::with_global_instance, call the appropriate language controller
accessor (e.g., agent_service.language_controller() or similar) and use its
is_loaded()/are_languages_loaded() method to set languages_loaded instead of
is_unlocked(); update both occurrences (the one assigning languages_loaded at
the shown block and the similar occurrence around line 826) so languages_loaded
reflects the language-controller state rather than wallet/unlock state.


Ok(ReadinessStatus {
gql_ready: true, // If this query returns, GQL is ready
holochain_ready,
agent_initialized,
languages_loaded, // Currently maps to agent unlocked (languages load during unlock)
})
}

async fn runtime_known_link_language_templates(
&self,
context: &RequestContext,
Expand Down
69 changes: 69 additions & 0 deletions rust-executor/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ pub mod types;
use std::thread::JoinHandle;

use log::{error, info, warn};
use tokio::sync::oneshot;

use crate::{
agent::AgentService, ai_service::AIService, dapp_server::serve_dapp, db::Ad4mDb,
Expand Down Expand Up @@ -190,11 +191,79 @@ pub async fn run(mut config: Ad4mConfig) -> JoinHandle<()> {
}
}

// Set up graceful shutdown channel.
// The sender is stored globally so runtime_quit and signal handlers can trigger shutdown.
let (shutdown_tx, shutdown_rx) = oneshot::channel::<()>();
{
let mut guard = crate::globals::SHUTDOWN_TX.lock().unwrap();
*guard = Some(shutdown_tx);
}

// Spawn a task that listens for OS signals (SIGTERM/SIGINT) and triggers shutdown.
// This replaces the old ctrlc handler in the CLI binaries with an in-executor handler
// that allows graceful cleanup of Holochain conductor and databases.
#[cfg(unix)]
{
tokio::spawn(async {
use tokio::signal;
let ctrl_c = signal::ctrl_c();
let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate())
.expect("failed to install SIGTERM handler");

tokio::select! {
_ = ctrl_c => info!("Received SIGINT, initiating graceful shutdown..."),
_ = sigterm.recv() => info!("Received SIGTERM, initiating graceful shutdown..."),
}

// Trigger shutdown via the global channel
if let Some(tx) = crate::globals::SHUTDOWN_TX.lock().unwrap().take() {
let _ = tx.send(());
}
});
}

// Spawn the shutdown handler that waits for the signal and cleans up
tokio::spawn(async move {
if shutdown_rx.await.is_ok() {
info!("Shutdown signal received, cleaning up...");

// 1. Shut down Holochain conductor gracefully
if let Some(holochain_service) = holochain_service::maybe_get_holochain_service().await
{
info!("Shutting down Holochain conductor...");
match holochain_service.shutdown().await {
Ok(()) => info!("Holochain conductor shut down cleanly"),
Err(e) => warn!("Error shutting down Holochain conductor: {}", e),
}
}

// 2. Write PID file removal if it exists
if let Ok(pid_file) = std::env::var("AD4M_PID_FILE") {
let _ = std::fs::remove_file(&pid_file);
info!("Removed PID file: {}", pid_file);
}

info!("Graceful shutdown complete, exiting.");
std::process::exit(0);
}
});

// Initialize logging for CLI (stdout)
// Respects RUST_LOG environment variable if set
crate::logging::init_cli_logging(None);
config.prepare();

// Write PID file if requested via environment variable or config.
// Test harnesses can set AD4M_PID_FILE to get a reliable PID for targeted cleanup.
if let Ok(pid_file) = std::env::var("AD4M_PID_FILE") {
let pid = std::process::id();
if let Err(e) = std::fs::write(&pid_file, pid.to_string()) {
warn!("Failed to write PID file {}: {}", pid_file, e);
} else {
info!("Wrote PID {} to {}", pid, pid_file);
}
}

// Store config globally so services (e.g. agent mutation resolvers) can access it
crate::config::set_global_config(config.clone());

Expand Down
16 changes: 15 additions & 1 deletion tests/integration.bats
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ setup_file() {
echo "done." >&3
echo "Starting agent 1..." >&3
./target/release/ad4m run --app-data-path ${current_dir}/tests/ad4m1 --gql-port 4000 &
AD4M_PID=$!
export AD4M_PID
sleep 5
echo "done." >&3

Expand All @@ -31,7 +33,19 @@ setup_file() {
}

teardown_file() {
killall ad4m
# Graceful shutdown: SIGTERM first, then escalate to SIGKILL if needed.
# Never use `killall ad4m` — it kills ALL ad4m processes on the machine,
# including other CI jobs and dev instances.
if [ -n "$AD4M_PID" ]; then
kill -TERM "$AD4M_PID" 2>/dev/null || true
for i in $(seq 1 10); do
kill -0 "$AD4M_PID" 2>/dev/null || break
sleep 1
done
kill -9 "$AD4M_PID" 2>/dev/null || true
fi
# Port-based fallback in case PID tracking missed something
lsof -ti:4000 | xargs -r kill -9 2>/dev/null || true
Comment on lines +47 to +48
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

The port fallback can kill an unrelated process.

If PID tracking misses the executor, lsof -ti:4000 | kill -9 will SIGKILL whatever owns port 4000 at teardown, including another local service or CI job. Keep the fallback scoped to the recorded executor PID / pid-file, or verify the command name before killing.

Based on learnings "Kill any lingering ad4m-executor processes before running integration tests to avoid port conflicts".

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/integration.bats` around lines 47 - 48, The current port fallback
unconditionally kills whatever owns port 4000; change it to target only the
recorded executor PID (from the pid-file used by the test harness) and verify
the process is the executor before killing: read the pid from the existing
pid-file (or the variable that stores the executor PID), check that
/proc/<pid>/cmdline (or ps -p <pid> -o comm=) contains "ad4m-executor" and that
the pid actually listens on :4000, then kill that PID; if no pid-file exists
fall back to using lsof but filter results to PIDs whose command matches
"ad4m-executor" before invoking kill. Ensure you update the teardown code that
currently runs `lsof -ti:4000 | xargs -r kill -9` to this safer,
PID-and-command-verified sequence.

}

setup() {
Expand Down
34 changes: 5 additions & 29 deletions tests/js/tests/multi-user-simple.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import fs from "fs-extra";
import { fileURLToPath } from 'url';
import * as chai from "chai";
import chaiAsPromised from "chai-as-promised";
import { apolloClient, sleep, startExecutor, runHcLocalServices } from "../utils/utils";
import { apolloClient, sleep, startExecutor, runHcLocalServices, gracefulShutdown } from "../utils/utils";
import { ChildProcess } from 'node:child_process';
import fetch from 'node-fetch'
import { LinkQuery } from "@coasys/ad4m";
Expand Down Expand Up @@ -60,20 +60,8 @@ describe("Multi-User Simple integration tests", () => {
})

after(async () => {
if (executorProcess) {
while (!executorProcess?.killed) {
let status = executorProcess?.kill();
console.log("killed executor with", status);
await sleep(500);
}
}
if (localServicesProcess) {
while (!localServicesProcess?.killed) {
let status = localServicesProcess?.kill();
console.log("killed local services with", status);
await sleep(500);
}
}
await gracefulShutdown(executorProcess, "executor");
await gracefulShutdown(localServicesProcess, "local services");
})

describe("Multi-User Configuration", () => {
Expand Down Expand Up @@ -1778,13 +1766,7 @@ describe("Multi-User Simple integration tests", () => {

after(async function() {
this.timeout(20000);
if (node2ExecutorProcess) {
while (!node2ExecutorProcess?.killed) {
let status = node2ExecutorProcess?.kill();
console.log("killed node 2 executor with", status);
await sleep(500);
}
}
await gracefulShutdown(node2ExecutorProcess, "node 2 executor");
});

it("should return all DIDs in 'others()' for each user", async function() {
Expand Down Expand Up @@ -2861,13 +2843,7 @@ describe("Multi-User Simple integration tests", () => {

after(async function() {
this.timeout(20000);
if (node3ExecutorProcess) {
while (!node3ExecutorProcess?.killed) {
let status = node3ExecutorProcess?.kill();
console.log("killed node 3 executor with", status);
await sleep(500);
}
}
await gracefulShutdown(node3ExecutorProcess, "node 3 executor");
});

it("should route signals between remote main agent and local managed user", async function() {
Expand Down
Loading