Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 32 additions & 4 deletions scripts/config.template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,19 @@ aggregator:

# Proof service configuration
proof_service_addr: "http://localhost:50052" # CHANGE THIS - Proof service address for aggregator to connect to

# NUMA configuration for aggregator (optional, uses global numa if not specified)
numa:
cpuset_cpus: "62-123" # CPU cores to use for aggregator
cpuset_mems: "1" # NUMA memory node for aggregator

# =============================================================================
# WORKER CONFIGURATION
# =============================================================================
workers:
# List of worker machines
# Each worker needs: host, user, port, worker_id, index, remote_dir
# Optionally specify numa.cpuset_cpus and numa.cpuset_mems per worker
# CHANGE ALL THE HOST IPs, PORTS, AND WORKER IDs BELOW TO MATCH YOUR SETUP

- host: "192.168.1.11" # CHANGE THIS to your worker 1 IP
Expand All @@ -40,48 +46,69 @@ workers:
worker_id: "worker1" # CHANGE THIS to your worker name
index: 0
remote_dir: "/home/ubuntu/brevis"
numa: # Optional per-worker NUMA configuration
cpuset_cpus: "62-123" # CPU cores to use (uses global numa if not specified)
cpuset_mems: "1" # NUMA memory node (uses global numa if not specified)

- host: "192.168.1.12" # CHANGE THIS to your worker 2 IP
user: "ubuntu"
port: 22 # SSH port (default: 22)
worker_id: "worker2" # CHANGE THIS to your worker name
index: 1
remote_dir: "/home/ubuntu/brevis"
numa:
cpuset_cpus: "62-123"
cpuset_mems: "1"

- host: "192.168.1.13" # CHANGE THIS to your worker 3 IP
user: "ubuntu"
port: 22 # SSH port (default: 22)
worker_id: "worker3" # CHANGE THIS to your worker name
index: 2
remote_dir: "/home/ubuntu/brevis"
numa:
cpuset_cpus: "62-123"
cpuset_mems: "1"

- host: "192.168.1.14" # CHANGE THIS to your worker 4 IP
user: "ubuntu"
port: 22 # SSH port (default: 22)
worker_id: "worker4" # CHANGE THIS to your worker name
index: 3
remote_dir: "/home/ubuntu/brevis"
numa:
cpuset_cpus: "62-123"
cpuset_mems: "1"

- host: "192.168.1.15" # CHANGE THIS to your worker 5 IP
user: "ubuntu"
port: 22 # SSH port (default: 22)
worker_id: "worker5" # CHANGE THIS to your worker name
index: 4
remote_dir: "/home/ubuntu/brevis"
numa:
cpuset_cpus: "62-123"
cpuset_mems: "1"

- host: "192.168.1.16" # CHANGE THIS to your worker 6 IP
user: "ubuntu"
port: 22 # SSH port (default: 22)
worker_id: "worker6" # CHANGE THIS to your worker name
index: 5
remote_dir: "/home/ubuntu/brevis"
numa:
cpuset_cpus: "62-123"
cpuset_mems: "1"

- host: "192.168.1.17" # CHANGE THIS to your worker 7 IP
user: "ubuntu"
port: 22 # SSH port (default: 22)
worker_id: "worker7" # CHANGE THIS to your worker name
index: 6
remote_dir: "/home/ubuntu/brevis"
numa:
cpuset_cpus: "62-123"
cpuset_mems: "1"

# =============================================================================
# PATH CONFIGURATION
Expand Down Expand Up @@ -125,13 +152,14 @@ docker:
worker: ".env.subblock"

# =============================================================================
# NUMA CONFIGURATION
# NUMA CONFIGURATION (Global Defaults)
# =============================================================================
numa:
# NUMA node settings for CPU and memory binding
# Default NUMA node settings for CPU and memory binding
# These are used when per-machine NUMA settings are not specified
# Adjust based on your hardware configuration
cpuset_cpus: "62-123" # CPU cores to use (e.g., "0-31" or "62-123")
cpuset_mems: "1" # NUMA memory node (e.g., "0" or "1")
cpuset_cpus: "62-123" # Default CPU cores to use (e.g., "0-31" or "62-123")
cpuset_mems: "1" # Default NUMA memory node (e.g., "0" or "1")

# =============================================================================
# SSH CONFIGURATION
Expand Down
65 changes: 48 additions & 17 deletions scripts/docker-common-config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ load_yaml_config() {
"export CONTAINER_CACHE_MOUNT=\"" + .paths.container_cache_mount + "\"",
"export LOGS_DIR=\"" + .paths.logs_dir + "\"",
"export DOCKER_PREFIX=\"" + .docker.prefix + "\"",
"export CPUSET_CPUS=\"" + .numa.cpuset_cpus + "\"",
"export CPUSET_MEMS=\"" + .numa.cpuset_mems + "\"",
"export GLOBAL_CPUSET_CPUS=\"" + .numa.cpuset_cpus + "\"",
"export GLOBAL_CPUSET_MEMS=\"" + .numa.cpuset_mems + "\"",
"export SSH_CONNECT_TIMEOUT=\"" + (.ssh.connect_timeout | tostring) + "\"",
"export SSH_CONTROL_PERSIST=\"" + .ssh.control_persist + "\"",
"export SSH_MAX_RETRIES=\"" + (.ssh.max_retries | tostring) + "\"",
Expand All @@ -42,12 +42,34 @@ load_yaml_config() {
"export LOG_DATE_FORMAT=\"" + .performance.log_date_format + "\""
' "$config_file" 2>/dev/null)"

# Load workers array from YAML
# Load aggregator NUMA settings (fallback to global if not specified)
AGG_CPUSET_CPUS=$(yq eval '.aggregator.numa.cpuset_cpus // .numa.cpuset_cpus' "$config_file" 2>/dev/null)
AGG_CPUSET_MEMS=$(yq eval '.aggregator.numa.cpuset_mems // .numa.cpuset_mems' "$config_file" 2>/dev/null)
export AGG_CPUSET_CPUS
export AGG_CPUSET_MEMS

# Load workers array from YAML with per-worker NUMA settings
if command -v yq &> /dev/null 2>&1; then
# Create workers array from YAML
# Create workers array from YAML with NUMA settings
local workers_data
workers_data=$(yq eval '.workers[] | .host + " " + .user + " " + (.port | tostring) + " " + .worker_id + " " + (.index | tostring) + " " + .remote_dir' "$config_file" 2>/dev/null)


# Export variables for yq (compatible with older yq versions)
export YQ_GLOBAL_CPUS="$GLOBAL_CPUSET_CPUS"
export YQ_GLOBAL_MEMS="$GLOBAL_CPUSET_MEMS"

workers_data=$(yq eval '.workers[] |
.host + " " +
.user + " " +
(.port | tostring) + " " +
.worker_id + " " +
(.index | tostring) + " " +
.remote_dir + " " +
(.numa.cpuset_cpus // env(YQ_GLOBAL_CPUS)) + " " +
(.numa.cpuset_mems // env(YQ_GLOBAL_MEMS))' \
"$config_file" 2>/dev/null)

unset YQ_GLOBAL_CPUS YQ_GLOBAL_MEMS

if [[ -n "$workers_data" ]]; then
# Convert to array
WORKERS=()
Expand Down Expand Up @@ -80,16 +102,29 @@ init_config() {
AGG_PORT="${AGG_PORT:-22}"
AGG_REMOTE_DIR="${AGG_REMOTE_DIR:-/home/ubuntu/brevis}"

# --- NUMA Configuration (Global Defaults) ---
GLOBAL_CPUSET_CPUS="${GLOBAL_CPUSET_CPUS:-62-123}"
GLOBAL_CPUSET_MEMS="${GLOBAL_CPUSET_MEMS:-1}"

# Aggregator NUMA (fallback to global)
AGG_CPUSET_CPUS="${AGG_CPUSET_CPUS:-$GLOBAL_CPUSET_CPUS}"
AGG_CPUSET_MEMS="${AGG_CPUSET_MEMS:-$GLOBAL_CPUSET_MEMS}"

# --- Worker Configuration ---
# Ensure WORKERS array is defined to avoid issues with set -u
if [[ -z "${WORKERS+x}" ]]; then
WORKERS=()
fi

if [[ ${#WORKERS[@]} -eq 0 ]]; then
WORKERS=(
"192.168.1.11 ubuntu 22 worker1 0 /home/ubuntu/brevis"
"192.168.1.12 ubuntu 22 worker2 1 /home/ubuntu/brevis"
"192.168.1.13 ubuntu 22 worker3 2 /home/ubuntu/brevis"
"192.168.1.14 ubuntu 22 worker4 3 /home/ubuntu/brevis"
"192.168.1.15 ubuntu 22 worker5 4 /home/ubuntu/brevis"
"192.168.1.16 ubuntu 22 worker6 5 /home/ubuntu/brevis"
"192.168.1.17 ubuntu 22 worker7 6 /home/ubuntu/brevis"
"192.168.1.11 ubuntu 22 worker1 0 /home/ubuntu/brevis $GLOBAL_CPUSET_CPUS $GLOBAL_CPUSET_MEMS"
"192.168.1.12 ubuntu 22 worker2 1 /home/ubuntu/brevis $GLOBAL_CPUSET_CPUS $GLOBAL_CPUSET_MEMS"
"192.168.1.13 ubuntu 22 worker3 2 /home/ubuntu/brevis $GLOBAL_CPUSET_CPUS $GLOBAL_CPUSET_MEMS"
"192.168.1.14 ubuntu 22 worker4 3 /home/ubuntu/brevis $GLOBAL_CPUSET_CPUS $GLOBAL_CPUSET_MEMS"
"192.168.1.15 ubuntu 22 worker5 4 /home/ubuntu/brevis $GLOBAL_CPUSET_CPUS $GLOBAL_CPUSET_MEMS"
"192.168.1.16 ubuntu 22 worker6 5 /home/ubuntu/brevis $GLOBAL_CPUSET_CPUS $GLOBAL_CPUSET_MEMS"
"192.168.1.17 ubuntu 22 worker7 6 /home/ubuntu/brevis $GLOBAL_CPUSET_CPUS $GLOBAL_CPUSET_MEMS"
)
fi

Expand All @@ -103,10 +138,6 @@ init_config() {
# --- Docker Configuration ---
DOCKER_PREFIX="${DOCKER_PREFIX:-sudo docker}"

# --- NUMA Configuration ---
CPUSET_CPUS="${CPUSET_CPUS:-62-123}"
CPUSET_MEMS="${CPUSET_MEMS:-1}"

# --- SSH Configuration ---
SSH_CONNECT_TIMEOUT="${SSH_CONNECT_TIMEOUT:-30}"
SSH_CONTROL_DIR="${SSH_CONTROL_DIR:-${HOME}/.ssh/control}"
Expand Down
40 changes: 22 additions & 18 deletions scripts/docker-common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -218,11 +218,11 @@ get_worker() {
return 1
}

# Parse worker spec: "HOST USER PORT WORKER_ID INDEX REMOTE_DIR"
# Parse worker spec: "HOST USER PORT WORKER_ID INDEX REMOTE_DIR CPUSET_CPUS CPUSET_MEMS"
parse_worker_spec() {
local spec="$1"
read -r host user port wid idx remote_dir <<< "$spec"
echo "$host" "$user" "$port" "$wid" "$idx" "$remote_dir"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$spec"
echo "$host" "$user" "$port" "$wid" "$idx" "$remote_dir" "$cpuset_cpus" "$cpuset_mems"
}

# Build expected workers and indices CSV lists for aggregator configuration
Expand All @@ -231,7 +231,7 @@ build_worker_lists() {
local indices=()

for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
worker_ids+=("$wid")
indices+=("$idx")
done
Expand Down Expand Up @@ -483,6 +483,8 @@ force_kill_all_containers() {
build_docker_run_opts() {
local container_name="$1"
local env_file="$2"
local cpuset_cpus="$3"
local cpuset_mems="$4"

echo "-d \
--name $container_name \
Expand All @@ -494,8 +496,8 @@ build_docker_run_opts() {
--env-file $env_file \
-v ${PERF_DATA_DIR}:${CONTAINER_DATA_MOUNT}:ro \
-v ${PROGRAM_CACHE_FILE}:${CONTAINER_CACHE_MOUNT}:rw \
--cpuset-cpus='${CPUSET_CPUS}' \
--cpuset-mems='${CPUSET_MEMS}'"
--cpuset-cpus='${cpuset_cpus}' \
--cpuset-mems='${cpuset_mems}'"
}

# =============================================================================
Expand All @@ -521,7 +523,7 @@ start_aggregator() {

log "Starting aggregator on ${AGG_USER}@${AGG_HOST}..."

local docker_opts=$(build_docker_run_opts "$CONTAINER_NAME_AGGREGATOR" "$env_file")
local docker_opts=$(build_docker_run_opts "$CONTAINER_NAME_AGGREGATOR" "$env_file" "$AGG_CPUSET_CPUS" "$AGG_CPUSET_MEMS")

ssh_exec "$AGG_USER" "$AGG_HOST" "$AGG_PORT" "cd '$AGG_REMOTE_DIR' && \
$DOCKER_PREFIX run $docker_opts $IMAGE_NAME_AGGREGATOR"
Expand Down Expand Up @@ -640,11 +642,13 @@ start_worker() {
local port="$3"
local wid="$4"
local remote_dir="$5"
local env_file="${6:-$ENV_FILE_WORKER}"
local cpuset_cpus="$6"
local cpuset_mems="$7"
local env_file="${8:-$ENV_FILE_WORKER}"

log "Starting worker $wid on ${user}@${host}..."

local docker_opts=$(build_docker_run_opts "$CONTAINER_NAME_WORKER" "$env_file")
local docker_opts=$(build_docker_run_opts "$CONTAINER_NAME_WORKER" "$env_file" "$cpuset_cpus" "$cpuset_mems")

ssh_exec "$user" "$host" "$port" "cd '$remote_dir' && \
$DOCKER_PREFIX run $docker_opts $IMAGE_NAME_WORKER"
Expand All @@ -670,7 +674,7 @@ stop_all_workers() {
log "Stopping all ${#WORKERS[@]} workers..."

for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
stop_worker "$host" "$user" "$port" "$wid" "$save_logs" "$remote_dir"
apply_worker_delay
done
Expand All @@ -683,8 +687,8 @@ start_all_workers() {
log "Starting all ${#WORKERS[@]} workers..."

for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
start_worker "$host" "$user" "$port" "$wid" "$remote_dir"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
start_worker "$host" "$user" "$port" "$wid" "$remote_dir" "$cpuset_cpus" "$cpuset_mems"
apply_worker_delay
done

Expand Down Expand Up @@ -713,7 +717,7 @@ start_all_workers() {
# Get status of all worker containers
get_all_worker_status() {
for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
log "Worker $wid status on ${user}@${host}:"

# Step 1: Get running containers
Expand Down Expand Up @@ -760,7 +764,7 @@ cleanup_all_workers() {
log "Cleaning up all ${#WORKERS[@]} workers..."

for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
cleanup_worker "$host" "$user" "$port" "$wid"
apply_worker_delay
done
Expand All @@ -786,7 +790,7 @@ force_kill_all_workers() {

local failures=0
for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
if ! force_kill_worker "$host" "$user" "$port" "$wid"; then
((failures++))
fi
Expand Down Expand Up @@ -868,7 +872,7 @@ verify_all_containers_gone() {

# Check workers
for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
if ! is_container_gone "$host" "$user" "$CONTAINER_NAME_WORKER" "$port"; then
error "Worker $wid container still exists on ${user}@${host}"
((failures++))
Expand Down Expand Up @@ -920,7 +924,7 @@ init_all_ssh_connections() {

# Establish connections to all workers (in background)
for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
establish_ssh_connection "$user" "$host" "$port" &
done

Expand All @@ -935,7 +939,7 @@ close_all_ssh_connections() {

# Close worker connections
for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
close_ssh_connection "$user" "$host" "$port"
done
}
Expand Down
4 changes: 2 additions & 2 deletions scripts/docker-multi-control.sh
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ EOF
case "$mode" in
all|workers)
for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
log "Removing worker image on ${user}@${host} (worker $wid)..."
remove_image_with_dependencies "$host" "$user" "$port" "$CONTAINER_NAME_WORKER" "$IMAGE_NAME_WORKER" || true
apply_worker_delay
Expand Down Expand Up @@ -269,7 +269,7 @@ cmd_save_logs() {

# Save worker logs
for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
local worker_log="${remote_dir}/${LOGS_DIR}/subblock-${wid}-manual-${timestamp}.log"
save_container_logs "$host" "$user" "$CONTAINER_NAME_WORKER" "$worker_log" "$port" || true
done
Expand Down
4 changes: 2 additions & 2 deletions scripts/docker-multi-deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ deploy_workers() {

# Deploy to each worker
for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"

log ""
log "Deploying to worker $wid..."
Expand Down Expand Up @@ -277,7 +277,7 @@ verify_deployment() {
# Verify workers
if [[ "$mode" == "all" ]] || [[ "$mode" == "workers" ]]; then
for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
log "Verifying worker $wid image..."
if ssh_exec "$user" "$host" "$port" "$DOCKER_PREFIX images | grep -q '${IMAGE_NAME_WORKER%:*}'"; then
log "✓ Worker $wid image verified: $IMAGE_NAME_WORKER"
Expand Down
2 changes: 1 addition & 1 deletion scripts/docker-multi-reset-chunk-size.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ reset_all_env_files() {

# Update worker envs
for worker_spec in "${WORKERS[@]}"; do
read -r host user port wid idx remote_dir <<< "$worker_spec"
read -r host user port wid idx remote_dir cpuset_cpus cpuset_mems <<< "$worker_spec"
local worker_env="${remote_dir}/${ENV_FILE_WORKER}"
reset_env_chunk_size "$host" "$user" "$worker_env" "$chunk_size" "$port"
done
Expand Down
Loading