diff --git a/README.md b/README.md
index 08f6fc2..953bdec 100644
--- a/README.md
+++ b/README.md
@@ -239,6 +239,16 @@ Every Ansible deployment automatically deploys an observability stack alongside
     - Every subnet contains the same set of client types
     - `N=1` renames nodes to `{client}_0` with no port changes (useful for canonical naming)
     - Example: `NETWORK_DIR=ansible-devnet ./spin-node.sh --node all --subnets 3 --sshKey ~/.ssh/id_ed25519 --useRoot`
+17. `--replace-with` comma-separated list of replacement node names, positionally matched 1:1 with `--restart-client`. Swaps client implementations while keeping the same validator slot, keys, and server. Updates `validator-config.yaml`, `validators.yaml`, `annotated_validators.yaml`, and renames `.key` files. Leanpoint is re-synced when replacements occur.
+    - Example: `--restart-client zeam_0 --replace-with ream_0` → replaces zeam_0 with ream_0
+    - Example: `--restart-client zeam_0,ream_0 --replace-with ream_1` → ream_1 replaces zeam_0, ream_0 just restarts
+    - Empty entries skip that position: `--restart-client zeam_0,ream_0 --replace-with ,ream_1` → zeam_0 restarts, ream_0 replaced with ream_1
+18. `--logs` enables run logging. When specified:
+    - Appends UTC-timestamped START/END entries with duration and log file path to `tmp/devnet.log`
+    - Duplicates console output to a timestamped log file in `tmp/`:
+      - `tmp/local-run-DD-MM-YYYY-HH-MM.log` for local deployments
+      - `tmp/ansible-run-DD-MM-YYYY-HH-MM.log` for Ansible deployments
+    - Example: `NETWORK_DIR=local-devnet ./spin-node.sh --node all --logs`
 
 ### Preparing remote servers
 
@@ -334,6 +344,16 @@ NETWORK_DIR=local-devnet ./spin-node.sh --restart-client zeam_0 \
 2. Data directories are cleared
 3. Clients are started with `--checkpoint-sync-url` so they sync from the remote checkpoint instead of genesis
 
+**Replacing a client during restart:**
+
+```sh
+# Replace zeam_0 with ream_0 (same validator slot, keys, and server)
+NETWORK_DIR=ansible-devnet ./spin-node.sh --restart-client zeam_0 --replace-with ream_0 --useRoot
+
+# Replace first node, just restart second
+NETWORK_DIR=local-devnet ./spin-node.sh --restart-client zeam_0,ream_0 --replace-with qlean_0
+```
+
 **Deployment modes:**
 - **Local** (`NETWORK_DIR=local-devnet`): Uses Docker directly
 - **Ansible** (`NETWORK_DIR=ansible-devnet`): Uses Ansible to deploy to remote hosts
diff --git a/ansible/playbooks/copy-genesis.yml b/ansible/playbooks/copy-genesis.yml
index 758d152..02ed759 100644
--- a/ansible/playbooks/copy-genesis.yml
+++ b/ansible/playbooks/copy-genesis.yml
@@ -159,7 +159,7 @@
         _assignments: "{{ _av[inventory_hostname] | default([]) }}"
         _base: "{{ _assignments | map(attribute='privkey_file') | map('regex_replace', '_sk\\.ssz$', '') | list }}"
       set_fact:
-        node_hash_sig_files: "{{ _base | product(['_sk.ssz', '_pk.ssz', '_sk.json', '_pk.json']) | map('join') | list }}"
+        node_hash_sig_files: "{{ (_base | product(['_sk.ssz', '_pk.ssz', '_sk.json', '_pk.json']) | map('join') | list) + ['validator-keys-manifest.yaml'] }}"
       when: hash_sig_keys_stat.stat.exists
 
     - name: Create hash-sig-keys directory on remote
diff --git a/ansible/playbooks/deploy-nodes.yml b/ansible/playbooks/deploy-nodes.yml
index 34fc60f..1c1ae9a 100644
--- a/ansible/playbooks/deploy-nodes.yml
+++ b/ansible/playbooks/deploy-nodes.yml
@@ -152,6 +152,7 @@
       loop:
         - config.yaml
         - validators.yaml
+        - annotated_validators.yaml
         - nodes.yaml
         - genesis.ssz
         - genesis.json
@@ -174,7 +175,7 @@
         _assignments: "{{ _av[node_name] | default([]) }}"
         _base: "{{ _assignments | map(attribute='privkey_file') | map('regex_replace', '_sk\\.ssz$', '') | list }}"
       set_fact:
-        node_hash_sig_files: "{{ _base | product(['_sk.ssz', '_pk.ssz', '_sk.json', '_pk.json']) | map('join') | list }}"
+        node_hash_sig_files: "{{ (_base | product(['_sk.ssz', '_pk.ssz', '_sk.json', '_pk.json']) | map('join') | list) + ['validator-keys-manifest.yaml'] }}"
       when: hash_sig_keys_local.stat.exists
       tags:
         - deploy
@@ -242,3 +243,38 @@
       include_tasks: helpers/deploy-single-node.yml
       tags:
         - deploy
+
+# When --replace-with is used, sync updated config yamls to all remote hosts
+- name: Sync updated config files to all hosts after replacement
+  hosts: all:!local
+  gather_facts: no
+  vars:
+    genesis_dir: "{{ remote_genesis_dir | default('/opt/lean-quickstart/genesis') }}"
+    local_genesis_dir: "{{ hostvars['localhost']['local_genesis_dir_path'] }}"
+  tasks:
+    - name: Sync config yamls to all hosts
+      copy:
+        src: "{{ local_genesis_dir }}/{{ item }}"
+        dest: "{{ genesis_dir }}/{{ item }}"
+        mode: '0644'
+        force: yes
+      loop:
+        - validators.yaml
+        - annotated_validators.yaml
+        - nodes.yaml
+        - validator-config.yaml
+      when: sync_all_hosts | default(false) | bool
+      tags:
+        - deploy
+        - sync
+
+    - name: Sync validator-keys-manifest to all hosts
+      copy:
+        src: "{{ local_genesis_dir }}/hash-sig-keys/validator-keys-manifest.yaml"
+        dest: "{{ genesis_dir }}/hash-sig-keys/validator-keys-manifest.yaml"
+        mode: '0644'
+        force: yes
+      when: sync_all_hosts | default(false) | bool
+      tags:
+        - deploy
+        - sync
diff --git a/ansible/playbooks/stop-nodes.yml b/ansible/playbooks/stop-nodes.yml
index af706a3..6d96bc3 100644
--- a/ansible/playbooks/stop-nodes.yml
+++ b/ansible/playbooks/stop-nodes.yml
@@ -110,3 +110,10 @@
         msg: "Container {{ node_name }} not found (may already be stopped)"
       when: container_check.stdout != node_name
 
+    # Removes a single node's data directory; only runs when clean_data is truthy.
+    # The node_name check keeps an empty name from collapsing the path to the
+    # shared data root, and the quoted path with "--" keeps rm from misparsing it.
+    - name: Clean node data directory
+      raw: rm -rf -- "{{ remote_data_dir | default('/opt/lean-quickstart/data') }}/{{ node_name }}"
+      when: (clean_data | default(false) | bool) and (node_name | default('') | trim | length > 0)
+
diff --git a/ansible/roles/observability/defaults/main.yml b/ansible/roles/observability/defaults/main.yml
index 7dc86fa..64e5f89 100644
--- 
a/ansible/roles/observability/defaults/main.yml
+++ b/ansible/roles/observability/defaults/main.yml
@@ -10,3 +10,4 @@ cadvisor_port: 9098
 node_exporter_port: 9100
 remote_write_url: "http://46.225.10.32:9090/api/v1/write"
 loki_push_url: "http://46.225.10.32:3100/loki/api/v1/push"
+pushgateway_url: "http://46.225.10.32:9091"
diff --git a/client-cmds/nlean-cmd.sh b/client-cmds/nlean-cmd.sh
index 4f96865..1cf0bde 100755
--- a/client-cmds/nlean-cmd.sh
+++ b/client-cmds/nlean-cmd.sh
@@ -6,7 +6,7 @@
 # NLEAN_REPO should point to this repository when lean-quickstart is outside this workspace.
 # Default assumes sibling checkouts: /nlean and /lean-quickstart.
 nlean_repo="${NLEAN_REPO:-$scriptDir/../nlean}"
-nlean_docker_image="${NLEAN_DOCKER_IMAGE:-ghcr.io/nleaneth/nlean:latest}"
+nlean_docker_image="${NLEAN_DOCKER_IMAGE:-ghcr.io/nleaneth/nlean:devnet3}"
 nlean_network_name="${NLEAN_NETWORK_NAME:-devnet0}"
 log_level="${NLEAN_LOG_LEVEL:-}"
 enable_metrics="${enableMetrics:-false}"
diff --git a/parse-env.sh b/parse-env.sh
index 2689be7..6dfc0d0 100755
--- a/parse-env.sh
+++ b/parse-env.sh
@@ -121,6 +121,21 @@ while [[ $# -gt 0 ]]; do
       dryRun=true
       shift
       ;;
+    --replace-with)
+      # Require a value: a bare or trailing --replace-with would otherwise consume
+      # the next flag (e.g. --logs) as a node name and silently drop that flag.
+      if [[ $# -lt 2 || "$2" == --* ]]; then
+        echo "Error: --replace-with requires a comma-separated list of node names."
+        exit 1
+      fi
+      replaceWith="$2"
+      shift
+      shift
+      ;;
+    --logs)
+      enableLogs=true
+      shift
+      ;;
     *)    # unknown option
       shift # past argument
       ;;
@@ -134,6 +149,12 @@ then
   exit
 fi;
 
+# Validate --replace-with requires --restart-client
+if [[ -n "$replaceWith" && -z "$restartClient" ]]; then
+  echo "Warning: --replace-with requires --restart-client. Ignoring --replace-with."
+  replaceWith=""
+fi
+
 # When using --restart-client with checkpoint sync, set default checkpoint URL if not provided
 if [[ -n "$restartClient" ]] && [[ ! -n "$checkpointSyncUrl" ]]; then
   checkpointSyncUrl="https://leanpoint.leanroadmap.org/lean/v0/states/finalized"
@@ -163,3 +184,5 @@ echo "restartClient = ${restartClient:-}"
 echo "skipLeanpoint = ${skipLeanpoint:-false}"
 echo "skipNemo = ${skipNemo:-false}"
 echo "dryRun = ${dryRun:-false}"
+echo "replaceWith = ${replaceWith:-}"
+echo "enableLogs = ${enableLogs:-false}"
diff --git a/run-ansible.sh b/run-ansible.sh
index 502b2b1..75d8267 100755
--- a/run-ansible.sh
+++ b/run-ansible.sh
@@ -30,6 +30,7 @@ coreDumps="$9" # Core dump configuration: "all", node names, or client types
 skipGenesis="${10}" # Set to "true" to skip genesis generation (e.g. when restarting with checkpoint sync)
 checkpointSyncUrl="${11}" # URL for checkpoint sync (when restarting with --restart-client)
 dryRun="${12}" # Set to "true" to run Ansible with --check --diff (no changes applied)
+syncAllHosts="${13}" # Set to "true" to sync config yamls to all hosts (used after --replace-with)
 
 # Determine SSH user: use root if --useRoot flag is set, otherwise use current user
 if [ "$useRoot" == "true" ]; then
@@ -131,6 +132,10 @@ if [ -n "$checkpointSyncUrl" ]; then
   EXTRA_VARS="$EXTRA_VARS checkpoint_sync_url=$checkpointSyncUrl"
 fi
 
+if [ "$syncAllHosts" == "true" ]; then
+  EXTRA_VARS="$EXTRA_VARS sync_all_hosts=true"
+fi
+
 # Determine deployment mode (docker/binary) - read default from group_vars/all.yml
 # Default to 'docker' if not specified in group_vars
 GROUP_VARS_FILE="$ANSIBLE_DIR/inventory/group_vars/all.yml"
diff --git a/spin-node.sh b/spin-node.sh
index dcd21ee..db4987a 100755
--- a/spin-node.sh
+++ b/spin-node.sh
@@ -7,6 +7,9 @@ if [ "$scriptDir" == "." ]; then
   scriptDir="$currentDir"
 fi
 
+# Save original args before parse-env.sh shifts them
+_original_args="$*"
+
 # 0. 
parse env and args
 source "$(dirname $0)/parse-env.sh"
 
@@ -68,6 +71,23 @@ if [ "$deployment_mode" == "ansible" ] && ([ "$validatorConfig" == "genesis_boot
   echo "Using Ansible deployment: configDir=$configDir, validator config=$validator_config_file"
 fi
 
+# Set up logging if --logs flag is enabled (the EXIT trap is single-quoted so the log paths expand at exit time instead of being spliced into the trap source)
+if [ "$enableLogs" == "true" ]; then
+  _log_dir="$scriptDir/tmp"
+  mkdir -p "$_log_dir"
+  _log_start=$(date -u +%s)
+  if [ "$deployment_mode" == "ansible" ]; then
+    _log_prefix="ansible-run"
+  else
+    _log_prefix="local-run"
+  fi
+  _log_file="$_log_dir/${_log_prefix}-$(date -u '+%d-%m-%Y-%H-%M').log"
+  echo "$(date -u '+%Y-%m-%d %H:%M:%S') START spin-node.sh $_original_args" >> "$_log_dir/devnet.log"
+  trap 'echo "$(date -u "+%Y-%m-%d %H:%M:%S") END spin-node.sh ($(( $(date -u +%s) - _log_start ))s) -> $_log_file" >> "$_log_dir/devnet.log"' EXIT
+  exec > >(tee -a "$_log_file") 2>&1
+  echo "Logging to $_log_file"
+fi
+
 # If --subnets N is specified, expand the validator config template into a new
 # file with N nodes per client (same IP, unique incremented ports and keys).
 # This must run after configDir/validator_config_file are resolved so the
@@ -373,6 +393,122 @@ if [[ -n "$restartClient" ]]; then
   echo "Restarting with checkpoint sync: ${spin_nodes[*]} from $checkpointSyncUrl"
   cleanData=true # Clear data when restarting with checkpoint sync
   node_present=true
+
+  # --- Handle --replace-with: swap client implementations ---
+  # Uses parallel arrays (bash 3.x compatible, no associative arrays)
+  if [[ -n "$replaceWith" ]]; then
+    IFS=',' read -r -a replace_nodes <<< "$replaceWith"
+
+    # Build replacement pairs as parallel arrays
+    replace_old_names=()
+    replace_new_names=()
+    has_replacements=false
+    i=0
+    for old_name in "${requested_nodes[@]}"; do
+      new_name=""
+      if [ $i -lt ${#replace_nodes[@]} ]; then
+        new_name=$(echo "${replace_nodes[$i]}" | xargs) # trim whitespace
+      fi
+      if [ -n "$new_name" ] && [ "$new_name" != "$old_name" ]; then
+        # Refuse to rename onto an existing (or already claimed) name: the key copy and
+        # mv -f below would silently overwrite that node's validators and .key file.
+        if [ -n "$(yq eval ".validators[] | select(.name == \"$new_name\") | .name" "$validator_config_file")" ] ||
+          [[ " ${replace_new_names[*]} " == *" $new_name "* ]]; then
+          echo "Error: --replace-with target '$new_name' already exists or is listed twice. Aborting before any change."
+          exit 1
+        fi
+        replace_old_names+=("$old_name")
+        replace_new_names+=("$new_name")
+        has_replacements=true
+        echo "Will replace: $old_name → $new_name"
+      fi
+      i=$((i + 1))
+    done
+
+    # Warn about extra --replace-with entries beyond --restart-client count
+    if [ ${#replace_nodes[@]} -gt ${#requested_nodes[@]} ]; then
+      echo "Warning: --replace-with has more entries (${#replace_nodes[@]}) than --restart-client (${#requested_nodes[@]}). Extra entries ignored."
+    fi
+
+    if [ "$has_replacements" = true ]; then
+      # 1. Stop old containers and clean data BEFORE config changes (inventory still resolves to old names)
+      echo "Stopping old containers and cleaning data before replacement..."
+      for idx in "${!replace_old_names[@]}"; do
+        old_name="${replace_old_names[$idx]}"
+        if [ "$deployment_mode" == "ansible" ]; then
+          echo "Stopping $old_name and cleaning remote data via Ansible..."
+          "$scriptDir/run-ansible.sh" "$configDir" "$old_name" "true" "$validatorConfig" "$validator_config_file" "$sshKeyFile" "$useRoot" "stop" "" "true" "" || {
+            echo "Warning: Failed to stop $old_name via Ansible, continuing..."
+          }
+        else
+          echo "Stopping local container $old_name..."
+          if [ -n "$dockerWithSudo" ]; then
+            sudo docker rm -f "$old_name" 2>/dev/null || true
+          else
+            docker rm -f "$old_name" 2>/dev/null || true
+          fi
+          # Remove old local data directory (different clients have different data structures)
+          old_data_dir="$dataDir/$old_name"
+          if [ -d "$old_data_dir" ]; then
+            rm -rf -- "${old_data_dir:?}"
+            echo "  Removed data dir $old_name"
+          fi
+        fi
+      done
+
+      # 2. Update config files (rename old → new)
+      for idx in "${!replace_old_names[@]}"; do
+        old_name="${replace_old_names[$idx]}"
+        new_name="${replace_new_names[$idx]}"
+        echo "Updating config files: $old_name → $new_name"
+
+        # validator-config.yaml: rename .validators[].name
+        yq eval -i "(.validators[] | select(.name == \"$old_name\") | .name) = \"$new_name\"" "$validator_config_file"
+
+        # validators.yaml: rename top-level key (two steps: copy then delete, single expression doesn't work; bracket notation keeps names containing '-' or '.' valid)
+        validators_file="$configDir/validators.yaml"
+        if [ -f "$validators_file" ]; then
+          yq eval -i ".[\"$new_name\"] = .[\"$old_name\"]" "$validators_file"
+          yq eval -i "del(.[\"$old_name\"])" "$validators_file"
+        fi
+
+        # annotated_validators.yaml: rename top-level key
+        annotated_file="$configDir/annotated_validators.yaml"
+        if [ -f "$annotated_file" ]; then
+          yq eval -i ".[\"$new_name\"] = .[\"$old_name\"]" "$annotated_file"
+          yq eval -i "del(.[\"$old_name\"])" "$annotated_file"
+        fi
+
+        # Rename key file (overwrite if destination exists from a previous run)
+        if [ -f "$configDir/$old_name.key" ]; then
+          mv -f "$configDir/$old_name.key" "$configDir/$new_name.key"
+          echo "  Renamed $old_name.key → $new_name.key"
+        fi
+      done
+
+      # 3. Update spin_nodes array with new names
+      for i in "${!spin_nodes[@]}"; do
+        old="${spin_nodes[$i]}"
+        for idx in "${!replace_old_names[@]}"; do
+          if [ "$old" = "${replace_old_names[$idx]}" ]; then
+            spin_nodes[$i]="${replace_new_names[$idx]}"
+            break
+          fi
+        done
+      done
+
+      # Re-read nodes from updated config (needed for aggregator and downstream logic)
+      nodes=($(yq eval '.validators[].name' "$validator_config_file"))
+
+      # Ensure inventory is regenerated on next run-ansible.sh call
+      # (the stop call may have regenerated it with old names)
+      touch "$validator_config_file"
+
+      echo "Updated spin_nodes: ${spin_nodes[*]}"
+      echo "Config files updated successfully."
+    fi
+  fi
+
 # Parse comma-separated or space-separated node names or handle single node/all
 elif [[ "$node" == "all" ]]; then
   # Spin all nodes
@@ -442,6 +578,7 @@ if [ "$deployment_mode" == "ansible" ]; then
   fi
 
   # Determine skip_genesis for Ansible (true when restarting with checkpoint sync)
+  # deploy-nodes.yml syncs config files to the target host, so copy-genesis to all hosts is not needed
   ansible_skip_genesis="false"
   [[ "$restart_with_checkpoint_sync" == "true" ]] && ansible_skip_genesis="true"
 
@@ -459,17 +596,25 @@ if [ "$deployment_mode" == "ansible" ]; then
     exit 0
   fi
 
+  # When --replace-with already cleaned data in the stop step, don't pass clean_data to deploy
+  # (the old node name no longer exists in config, so clean-node-data.yml would fail resolving it)
+  ansible_clean_data="$cleanData"
+  [[ "${has_replacements:-false}" = "true" ]] && ansible_clean_data=""
+
   # Call separate Ansible execution script
   # If Ansible deployment fails, exit immediately (don't fall through to local deployment)
   if [ "$dryRun" == "true" ]; then
     echo "[DRY RUN] Would deploy via Ansible — running playbook with --check --diff"
   fi
-  if ! "$scriptDir/run-ansible.sh" "$configDir" "$ansible_node_arg" "$cleanData" "$validatorConfig" "$validator_config_file" "$sshKeyFile" "$useRoot" "" "$coreDumps" "$ansible_skip_genesis" "$ansible_checkpoint_url" "$dryRun"; then
+  ansible_sync_all_hosts=""
+  [[ "${has_replacements:-false}" = "true" ]] && ansible_sync_all_hosts="true"
+
+  if ! "$scriptDir/run-ansible.sh" "$configDir" "$ansible_node_arg" "$ansible_clean_data" "$validatorConfig" "$validator_config_file" "$sshKeyFile" "$useRoot" "" "$coreDumps" "$ansible_skip_genesis" "$ansible_checkpoint_url" "$dryRun" "$ansible_sync_all_hosts"; then
     echo "❌ Ansible deployment failed. Exiting."
     exit 1
   fi
 
-  if [ -z "$skipLeanpoint" ]; then
+  if [ -z "$skipLeanpoint" ] && { [ "$restart_with_checkpoint_sync" != "true" ] || [ "${has_replacements:-false}" = "true" ]; }; then
     # Sync leanpoint upstreams to tooling server and restart remote container (no 5th arg = remote)
     if ! "$scriptDir/sync-leanpoint-upstreams.sh" "$validator_config_file" "$scriptDir" "$sshKeyFile" "$useRoot"; then
       echo "Warning: leanpoint sync failed. If the tooling server requires a specific SSH key, run with: --sshKey "
@@ -484,6 +629,18 @@ if [ "$deployment_mode" == "ansible" ]; then
     fi
   fi
 
+  # Push genesis time metric to Pushgateway if available (-f makes HTTP errors reach the warning, --max-time bounds an unreachable endpoint)
+  _pushgateway_url="${PUSHGATEWAY_URL:-http://46.225.10.32:9091}"
+  _genesis_config="$configDir/config.yaml"
+  if [ -f "$_genesis_config" ]; then
+    _genesis_time=$(awk '/^[[:space:]]*GENESIS_TIME:/ {print $2; exit}' "$_genesis_config")
+    if [ -n "$_genesis_time" ]; then
+      echo "lean_genesis_time $_genesis_time" | curl -sf --max-time 10 --data-binary @- \
+        "$_pushgateway_url/metrics/job/lean-quickstart" || \
+        echo "Warning: Failed to push lean_genesis_time to Pushgateway."
+    fi
+  fi
+
   # Ansible deployment succeeded, exit normally
   exit 0
 fi
@@ -707,8 +864,9 @@ if [ -n "$enableMetrics" ] && [ "$enableMetrics" == "true" ]; then
 fi
 
 # Deploy leanpoint: locally (local devnet) or sync to tooling server (Ansible), unless --skip-leanpoint
+# Skip leanpoint during checkpoint sync restart (node list hasn't changed)
 local_leanpoint_deployed=0
-if [ -z "$skipLeanpoint" ]; then
+if [ -z "$skipLeanpoint" ] && { [ "$restart_with_checkpoint_sync" != "true" ] || [ "${has_replacements:-false}" = "true" ]; }; then
   if "$scriptDir/sync-leanpoint-upstreams.sh" "$validator_config_file" "$scriptDir" "$sshKeyFile" "$useRoot" "$dataDir"; then
     local_leanpoint_deployed=1
   else