Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,16 @@ Every Ansible deployment automatically deploys an observability stack alongside
- Every subnet contains the same set of client types
- `N=1` renames nodes to `{client}_0` with no port changes (useful for canonical naming)
- Example: `NETWORK_DIR=ansible-devnet ./spin-node.sh --node all --subnets 3 --sshKey ~/.ssh/id_ed25519 --useRoot`
17. `--replace-with` comma-separated list of replacement node names, positionally matched 1:1 with `--restart-client`. Swaps client implementations while keeping the same validator slot, keys, and server. Updates `validator-config.yaml`, `validators.yaml`, `annotated_validators.yaml`, and renames `.key` files. Leanpoint is re-synced when replacements occur.
- Example: `--restart-client zeam_0 --replace-with ream_0` → replaces zeam_0 with ream_0
- Example: `--restart-client zeam_0,ream_0 --replace-with ream_1` → ream_1 replaces zeam_0, ream_0 just restarts
- Empty entries skip that position: `--restart-client zeam_0,ream_0 --replace-with ,ream_1` → zeam_0 restarts, ream_0 replaced with ream_1
18. `--logs` enables run logging. When specified:
- Appends UTC-timestamped START/END entries with duration and log file path to `tmp/devnet.log`
- Duplicates console output to a timestamped log file in `tmp/`:
- `tmp/local-run-DD-MM-YYYY-HH-MM.log` for local deployments
- `tmp/ansible-run-DD-MM-YYYY-HH-MM.log` for Ansible deployments
- Example: `NETWORK_DIR=local-devnet ./spin-node.sh --node all --logs`

### Preparing remote servers

Expand Down Expand Up @@ -334,6 +344,16 @@ NETWORK_DIR=local-devnet ./spin-node.sh --restart-client zeam_0 \
2. Data directories are cleared
3. Clients are started with `--checkpoint-sync-url` so they sync from the remote checkpoint instead of genesis

**Replacing a client during restart:**

```sh
# Replace zeam_0 with ream_0 (same validator slot, keys, and server)
NETWORK_DIR=ansible-devnet ./spin-node.sh --restart-client zeam_0 --replace-with ream_0 --useRoot

# Replace first node, just restart second
NETWORK_DIR=local-devnet ./spin-node.sh --restart-client zeam_0,ream_0 --replace-with qlean_0
```

**Deployment modes:**
- **Local** (`NETWORK_DIR=local-devnet`): Uses Docker directly
- **Ansible** (`NETWORK_DIR=ansible-devnet`): Uses Ansible to deploy to remote hosts
Expand Down
2 changes: 1 addition & 1 deletion ansible/playbooks/copy-genesis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@
_assignments: "{{ _av[inventory_hostname] | default([]) }}"
_base: "{{ _assignments | map(attribute='privkey_file') | map('regex_replace', '_sk\\.ssz$', '') | list }}"
set_fact:
node_hash_sig_files: "{{ _base | product(['_sk.ssz', '_pk.ssz', '_sk.json', '_pk.json']) | map('join') | list }}"
node_hash_sig_files: "{{ (_base | product(['_sk.ssz', '_pk.ssz', '_sk.json', '_pk.json']) | map('join') | list) + ['validator-keys-manifest.yaml'] }}"
when: hash_sig_keys_stat.stat.exists

- name: Create hash-sig-keys directory on remote
Expand Down
38 changes: 37 additions & 1 deletion ansible/playbooks/deploy-nodes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@
loop:
- config.yaml
- validators.yaml
- annotated_validators.yaml
- nodes.yaml
- genesis.ssz
- genesis.json
Expand All @@ -174,7 +175,7 @@
_assignments: "{{ _av[node_name] | default([]) }}"
_base: "{{ _assignments | map(attribute='privkey_file') | map('regex_replace', '_sk\\.ssz$', '') | list }}"
set_fact:
node_hash_sig_files: "{{ _base | product(['_sk.ssz', '_pk.ssz', '_sk.json', '_pk.json']) | map('join') | list }}"
node_hash_sig_files: "{{ (_base | product(['_sk.ssz', '_pk.ssz', '_sk.json', '_pk.json']) | map('join') | list) + ['validator-keys-manifest.yaml'] }}"
when: hash_sig_keys_local.stat.exists
tags:
- deploy
Expand Down Expand Up @@ -242,3 +243,38 @@
include_tasks: helpers/deploy-single-node.yml
tags:
- deploy

# When --replace-with is used, sync updated config yamls to all remote hosts
# so every node sees the renamed validator entries, not only the redeployed
# one. Gated behind sync_all_hosts, passed as an extra-var by run-ansible.sh.
- name: Sync updated config files to all hosts after replacement
hosts: all:!local
gather_facts: no
vars:
# Remote genesis directory; falls back to the quickstart default path.
genesis_dir: "{{ remote_genesis_dir | default('/opt/lean-quickstart/genesis') }}"
# NOTE(review): expects local_genesis_dir_path to have been set as a
# localhost fact by an earlier play — confirm before running standalone.
local_genesis_dir: "{{ hostvars['localhost']['local_genesis_dir_path'] }}"
tasks:
# Push the four renamed config yamls; force: yes overwrites stale copies.
- name: Sync config yamls to all hosts
copy:
src: "{{ local_genesis_dir }}/{{ item }}"
dest: "{{ genesis_dir }}/{{ item }}"
mode: '0644'
force: yes
loop:
- validators.yaml
- annotated_validators.yaml
- nodes.yaml
- validator-config.yaml
when: sync_all_hosts | default(false) | bool
tags:
- deploy
- sync

# The keys manifest lives under hash-sig-keys/ and is synced separately.
# NOTE(review): assumes the remote hash-sig-keys directory already exists
# on every host — verify (copy does not create intermediate directories).
- name: Sync validator-keys-manifest to all hosts
copy:
src: "{{ local_genesis_dir }}/hash-sig-keys/validator-keys-manifest.yaml"
dest: "{{ genesis_dir }}/hash-sig-keys/validator-keys-manifest.yaml"
mode: '0644'
force: yes
when: sync_all_hosts | default(false) | bool
tags:
- deploy
- sync
4 changes: 4 additions & 0 deletions ansible/playbooks/stop-nodes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,7 @@
msg: "Container {{ node_name }} not found (may already be stopped)"
when: container_check.stdout != node_name

# Optionally wipe the node's on-disk data (opt-in via clean_data=true).
# `raw` is used so it also works on hosts without Python provisioned.
- name: Clean node data directory
  # Quote the full path and terminate option parsing with `--` so whitespace
  # in remote_data_dir/node_name cannot split the argument or be read as an
  # rm option.
  raw: rm -rf -- "{{ remote_data_dir | default('/opt/lean-quickstart/data') }}/{{ node_name }}"
  when: clean_data | default(false) | bool

1 change: 1 addition & 0 deletions ansible/roles/observability/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ cadvisor_port: 9098
node_exporter_port: 9100
remote_write_url: "http://46.225.10.32:9090/api/v1/write"
loki_push_url: "http://46.225.10.32:3100/loki/api/v1/push"
pushgateway_url: "http://46.225.10.32:9091"
2 changes: 1 addition & 1 deletion client-cmds/nlean-cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# NLEAN_REPO should point to this repository when lean-quickstart is outside this workspace.
# Default assumes sibling checkouts: <workspace>/nlean and <workspace>/lean-quickstart.
nlean_repo="${NLEAN_REPO:-$scriptDir/../nlean}"
nlean_docker_image="${NLEAN_DOCKER_IMAGE:-ghcr.io/nleaneth/nlean:latest}"
nlean_docker_image="${NLEAN_DOCKER_IMAGE:-ghcr.io/nleaneth/nlean:devnet3}"
nlean_network_name="${NLEAN_NETWORK_NAME:-devnet0}"
log_level="${NLEAN_LOG_LEVEL:-}"
enable_metrics="${enableMetrics:-false}"
Expand Down
17 changes: 17 additions & 0 deletions parse-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,15 @@ while [[ $# -gt 0 ]]; do
dryRun=true
shift
;;
--replace-with)
# Comma-separated replacement node names, positionally matched 1:1 with
# the --restart-client list; an empty position means "restart only".
# NOTE(review): no guard that a value follows the flag — a trailing
# `--replace-with` sets replaceWith to empty. Confirm acceptable.
replaceWith="$2"
shift
shift
;;
--logs)
# Enable run logging: spin-node.sh duplicates console output into a
# timestamped file under tmp/ and appends START/END lines to tmp/devnet.log.
enableLogs=true
shift
;;
*) # unknown option
shift # past argument
;;
Expand All @@ -134,6 +143,12 @@ then
exit
fi;

# --replace-with is only meaningful when paired positionally with
# --restart-client; without it there is nothing to replace, so warn and
# drop the flag instead of failing the run.
if [[ -n "$replaceWith" && -z "$restartClient" ]]; then
  # Diagnostics go to stderr so they don't pollute captured stdout.
  echo "Warning: --replace-with requires --restart-client. Ignoring --replace-with." >&2
  replaceWith=""
fi

# When using --restart-client with checkpoint sync, set default checkpoint URL if not provided
if [[ -n "$restartClient" ]] && [[ ! -n "$checkpointSyncUrl" ]]; then
checkpointSyncUrl="https://leanpoint.leanroadmap.org/lean/v0/states/finalized"
Expand Down Expand Up @@ -163,3 +178,5 @@ echo "restartClient = ${restartClient:-<not set>}"
# Report the effective values of the remaining parsed flags; defaults are
# shown when a flag was not supplied on the command line.
printf '%s\n' \
  "skipLeanpoint = ${skipLeanpoint:-false}" \
  "skipNemo = ${skipNemo:-false}" \
  "dryRun = ${dryRun:-false}" \
  "replaceWith = ${replaceWith:-<not set>}" \
  "enableLogs = ${enableLogs:-false}"
5 changes: 5 additions & 0 deletions run-ansible.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ coreDumps="$9" # Core dump configuration: "all", node names, or client types
skipGenesis="${10}" # Set to "true" to skip genesis generation (e.g. when restarting with checkpoint sync)
checkpointSyncUrl="${11}" # URL for checkpoint sync (when restarting with --restart-client)
dryRun="${12}" # Set to "true" to run Ansible with --check --diff (no changes applied)
syncAllHosts="${13}" # Set to "true" to sync config yamls to all hosts (used after --replace-with)

# Determine SSH user: use root if --useRoot flag is set, otherwise use current user
if [ "$useRoot" == "true" ]; then
Expand Down Expand Up @@ -131,6 +132,10 @@ if [ -n "$checkpointSyncUrl" ]; then
EXTRA_VARS="$EXTRA_VARS checkpoint_sync_url=$checkpointSyncUrl"
fi

# Forward the sync_all_hosts switch to Ansible only when explicitly enabled.
case "$syncAllHosts" in
  true) EXTRA_VARS="$EXTRA_VARS sync_all_hosts=true" ;;
esac

# Determine deployment mode (docker/binary) - read default from group_vars/all.yml
# Default to 'docker' if not specified in group_vars
GROUP_VARS_FILE="$ANSIBLE_DIR/inventory/group_vars/all.yml"
Expand Down
157 changes: 154 additions & 3 deletions spin-node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ if [ "$scriptDir" == "." ]; then
scriptDir="$currentDir"
fi

# Save original args before parse-env.sh shifts them
# NOTE(review): "$*" flattens the argument list with spaces, so arguments
# that contained whitespace are not reconstructable from the logged string.
# Fine for the devnet.log audit line, but never re-execute this string.
_original_args="$*"

# 0. parse env and args
source "$(dirname $0)/parse-env.sh"

Expand Down Expand Up @@ -68,6 +71,23 @@ if [ "$deployment_mode" == "ansible" ] && ([ "$validatorConfig" == "genesis_boot
echo "Using Ansible deployment: configDir=$configDir, validator config=$validator_config_file"
fi

# Set up logging if --logs flag is enabled
# When active, all subsequent stdout/stderr is mirrored into a timestamped
# log file under tmp/, and START/END bookkeeping lines (with duration and
# the log file path) are appended to tmp/devnet.log.
if [ "$enableLogs" == "true" ]; then
_log_dir="$scriptDir/tmp"
mkdir -p "$_log_dir"
# Epoch seconds (UTC) at start; the EXIT trap uses it to report duration.
_log_start=$(date -u +%s)
if [ "$deployment_mode" == "ansible" ]; then
_log_prefix="ansible-run"
else
_log_prefix="local-run"
fi
_log_file="$_log_dir/${_log_prefix}-$(date -u '+%d-%m-%Y-%H-%M').log"
echo "$(date -u '+%Y-%m-%d %H:%M:%S') START spin-node.sh $_original_args" >> "$_log_dir/devnet.log"
# EXIT trap appends the END line. Quoting is deliberate: the single-quoted
# segments defer $(date)/$((…)) evaluation until exit time, while
# '"$_log_file"' and '"$_log_dir"' splice the current values in now.
trap 'echo "$(date -u '\''+%Y-%m-%d %H:%M:%S'\'') END spin-node.sh ($(( $(date -u +%s) - _log_start ))s) -> '"$_log_file"'" >> "'"$_log_dir"'/devnet.log"' EXIT
# Duplicate all further console output (stdout + stderr) into the log file.
exec > >(tee -a "$_log_file") 2>&1
echo "Logging to $_log_file"
fi

# If --subnets N is specified, expand the validator config template into a new
# file with N nodes per client (same IP, unique incremented ports and keys).
# This must run after configDir/validator_config_file are resolved so the
Expand Down Expand Up @@ -373,6 +393,115 @@ if [[ -n "$restartClient" ]]; then
echo "Restarting with checkpoint sync: ${spin_nodes[*]} from $checkpointSyncUrl"
cleanData=true # Clear data when restarting with checkpoint sync
node_present=true

# --- Handle --replace-with: swap client implementations ---
# Uses parallel arrays (bash 3.x compatible, no associative arrays)
# Replaces the client implementation for selected validator slots while
# keeping each slot's keys and server assignment. Order matters: stop the
# old container and clean its data first (while inventory still resolves
# the old name), then rename config entries and key files, then let the
# normal restart path bring the replacement up under its new name.
if [[ -n "$replaceWith" ]]; then
# Positions in --replace-with pair 1:1 with --restart-client entries; an
# empty position means "restart only, no replacement".
IFS=',' read -r -a replace_nodes <<< "$replaceWith"

# Build replacement pairs as parallel arrays
replace_old_names=()
replace_new_names=()
has_replacements=false
i=0
for old_name in "${requested_nodes[@]}"; do
new_name=""
if [ $i -lt ${#replace_nodes[@]} ]; then
new_name=$(echo "${replace_nodes[$i]}" | xargs) # trim whitespace
fi
# Skip empty entries and no-op renames (new name identical to old).
if [ -n "$new_name" ] && [ "$new_name" != "$old_name" ]; then
replace_old_names+=("$old_name")
replace_new_names+=("$new_name")
has_replacements=true
echo "Will replace: $old_name → $new_name"
fi
i=$((i + 1))
done

# Warn about extra --replace-with entries beyond --restart-client count
if [ ${#replace_nodes[@]} -gt ${#requested_nodes[@]} ]; then
echo "Warning: --replace-with has more entries (${#replace_nodes[@]}) than --restart-client (${#requested_nodes[@]}). Extra entries ignored."
fi

if [ "$has_replacements" = true ]; then
# 1. Stop old containers and clean data BEFORE config changes (inventory still resolves to old names)
echo "Stopping old containers and cleaning data before replacement..."
for idx in "${!replace_old_names[@]}"; do
old_name="${replace_old_names[$idx]}"
if [ "$deployment_mode" == "ansible" ]; then
echo "Stopping $old_name and cleaning remote data via Ansible..."
# Positional args: cleanData=true (3rd), "stop" is the 8th arg
# (presumably the playbook action — confirm against run-ansible.sh),
# coreDumps="" (9th), skipGenesis="true" (10th), checkpointSyncUrl=""
# (11th). Best-effort: a failed stop is warned about, not fatal.
"$scriptDir/run-ansible.sh" "$configDir" "$old_name" "true" "$validatorConfig" "$validator_config_file" "$sshKeyFile" "$useRoot" "stop" "" "true" "" || {
echo "Warning: Failed to stop $old_name via Ansible, continuing..."
}
else
echo "Stopping local container $old_name..."
if [ -n "$dockerWithSudo" ]; then
sudo docker rm -f "$old_name" 2>/dev/null || true
else
docker rm -f "$old_name" 2>/dev/null || true
fi
# Remove old local data directory (different clients have different data structures)
old_data_dir="$dataDir/$old_name"
if [ -d "$old_data_dir" ]; then
rm -rf "$old_data_dir"
echo " Removed data dir $old_name"
fi
fi
done

# 2. Update config files (rename old → new)
# NOTE(review): node names are interpolated unquoted into yq key paths
# below (".$new_name"), so this assumes names are plain identifiers such
# as "zeam_0" — verify upstream validation if names may contain dots,
# dashes, or whitespace.
for idx in "${!replace_old_names[@]}"; do
old_name="${replace_old_names[$idx]}"
new_name="${replace_new_names[$idx]}"
echo "Updating config files: $old_name → $new_name"

# validator-config.yaml: rename .validators[].name
yq eval -i "(.validators[] | select(.name == \"$old_name\") | .name) = \"$new_name\"" "$validator_config_file"

# validators.yaml: rename top-level key (two steps: copy then delete, single expression doesn't work)
validators_file="$configDir/validators.yaml"
if [ -f "$validators_file" ]; then
yq eval -i ".$new_name = .$old_name" "$validators_file"
yq eval -i "del(.$old_name)" "$validators_file"
fi

# annotated_validators.yaml: rename top-level key
annotated_file="$configDir/annotated_validators.yaml"
if [ -f "$annotated_file" ]; then
yq eval -i ".$new_name = .$old_name" "$annotated_file"
yq eval -i "del(.$old_name)" "$annotated_file"
fi

# Rename key file (overwrite if destination exists from a previous run)
if [ -f "$configDir/$old_name.key" ]; then
mv -f "$configDir/$old_name.key" "$configDir/$new_name.key"
echo " Renamed $old_name.key → $new_name.key"
fi
done

# 3. Update spin_nodes array with new names
for i in "${!spin_nodes[@]}"; do
old="${spin_nodes[$i]}"
for idx in "${!replace_old_names[@]}"; do
if [ "$old" = "${replace_old_names[$idx]}" ]; then
spin_nodes[$i]="${replace_new_names[$idx]}"
break
fi
done
done

# Re-read nodes from updated config (needed for aggregator and downstream logic)
# NOTE(review): the unquoted $(yq …) relies on word-splitting — names with
# whitespace would break, consistent with the identifier assumption above.
nodes=($(yq eval '.validators[].name' "$validator_config_file"))

# Ensure inventory is regenerated on next run-ansible.sh call
# (the stop call may have regenerated it with old names)
touch "$validator_config_file"

echo "Updated spin_nodes: ${spin_nodes[*]}"
echo "Config files updated successfully."
fi
fi

# Parse comma-separated or space-separated node names or handle single node/all
elif [[ "$node" == "all" ]]; then
# Spin all nodes
Expand Down Expand Up @@ -442,6 +571,7 @@ if [ "$deployment_mode" == "ansible" ]; then
fi

# Determine skip_genesis for Ansible (true when restarting with checkpoint sync)
# deploy-nodes.yml syncs config files to the target host, so copy-genesis to all hosts is not needed
ansible_skip_genesis="false"
[[ "$restart_with_checkpoint_sync" == "true" ]] && ansible_skip_genesis="true"

Expand All @@ -459,17 +589,25 @@ if [ "$deployment_mode" == "ansible" ]; then
exit 0
fi

# When --replace-with already cleaned data in the stop step, don't pass clean_data to deploy
# (the old node name no longer exists in config, so clean-node-data.yml would fail resolving it)
ansible_clean_data="$cleanData"
# Explicit `if` rather than `[[ … ]] && x=…`: the short-circuit form leaves
# a non-zero exit status when the guard is false, which would abort the
# script if `set -e` were ever enabled.
if [[ "${has_replacements:-false}" == "true" ]]; then
  ansible_clean_data=""
fi

# Call separate Ansible execution script
# If Ansible deployment fails, exit immediately (don't fall through to local deployment)
if [ "$dryRun" == "true" ]; then
echo "[DRY RUN] Would deploy via Ansible — running playbook with --check --diff"
fi
if ! "$scriptDir/run-ansible.sh" "$configDir" "$ansible_node_arg" "$cleanData" "$validatorConfig" "$validator_config_file" "$sshKeyFile" "$useRoot" "" "$coreDumps" "$ansible_skip_genesis" "$ansible_checkpoint_url" "$dryRun"; then
# Ask run-ansible.sh to sync config yamls to every host after a client swap.
# Explicit `if` rather than `[[ … ]] && x=…`: the short-circuit form leaves
# a non-zero exit status when the guard is false (a foot-gun under `set -e`
# and for callers inspecting $?).
ansible_sync_all_hosts=""
if [[ "${has_replacements:-false}" == "true" ]]; then
  ansible_sync_all_hosts="true"
fi

if ! "$scriptDir/run-ansible.sh" "$configDir" "$ansible_node_arg" "$ansible_clean_data" "$validatorConfig" "$validator_config_file" "$sshKeyFile" "$useRoot" "" "$coreDumps" "$ansible_skip_genesis" "$ansible_checkpoint_url" "$dryRun" "$ansible_sync_all_hosts"; then
echo "❌ Ansible deployment failed. Exiting."
exit 1
fi

if [ -z "$skipLeanpoint" ]; then
if [ -z "$skipLeanpoint" ] && { [ "$restart_with_checkpoint_sync" != "true" ] || [ "${has_replacements:-false}" = "true" ]; }; then
# Sync leanpoint upstreams to tooling server and restart remote container (no 5th arg = remote)
if ! "$scriptDir/sync-leanpoint-upstreams.sh" "$validator_config_file" "$scriptDir" "$sshKeyFile" "$useRoot"; then
echo "Warning: leanpoint sync failed. If the tooling server requires a specific SSH key, run with: --sshKey <path-to-key>"
Expand All @@ -484,6 +622,18 @@ if [ "$deployment_mode" == "ansible" ]; then
fi
fi

# Push the devnet's genesis time to the Prometheus Pushgateway so dashboards
# can compute slot/epoch progress. Best-effort: any failure only warns.
_pushgateway_url="${PUSHGATEWAY_URL:-http://46.225.10.32:9091}"
_genesis_config="$configDir/config.yaml"
if [ -f "$_genesis_config" ]; then
  # Single awk replaces the grep|awk pipe and stops at the first match:
  # multiple GENESIS_TIME lines would otherwise yield a multi-line value
  # and a malformed metric body.
  _genesis_time=$(awk '/GENESIS_TIME:/ {print $2; exit}' "$_genesis_config")
  if [ -n "$_genesis_time" ]; then
    echo "lean_genesis_time $_genesis_time" | curl -s --data-binary @- \
      "$_pushgateway_url/metrics/job/lean-quickstart" || \
      echo "Warning: Failed to push lean_genesis_time to Pushgateway." >&2
  fi
fi

# Ansible deployment succeeded, exit normally
exit 0
fi
Expand Down Expand Up @@ -707,8 +857,9 @@ if [ -n "$enableMetrics" ] && [ "$enableMetrics" == "true" ]; then
fi

# Deploy leanpoint: locally (local devnet) or sync to tooling server (Ansible), unless --skip-leanpoint
# Skip leanpoint during checkpoint sync restart (node list hasn't changed)
local_leanpoint_deployed=0
if [ -z "$skipLeanpoint" ]; then
if [ -z "$skipLeanpoint" ] && { [ "$restart_with_checkpoint_sync" != "true" ] || [ "${has_replacements:-false}" = "true" ]; }; then
if "$scriptDir/sync-leanpoint-upstreams.sh" "$validator_config_file" "$scriptDir" "$sshKeyFile" "$useRoot" "$dataDir"; then
local_leanpoint_deployed=1
else
Expand Down
Loading