Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
230 changes: 230 additions & 0 deletions rerun-failed-smoke.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
#!/usr/bin/env bash
set -euo pipefail

# rerun-failed-smoke.sh -- give the failed and dead jobs of a daily smoke run
# exactly one more attempt.
#
# Usage:
#   rerun-failed-smoke.sh [RUN_NAME...]
#   rerun-failed-smoke.sh --check RUN_NAME   # only print branch, fail+dead count, sha1
#
# When no RUN_NAME is given, run names are read from logs/smoke-runs-YYYY-MM-DD
# (written by run-daily-smoke.sh when it schedules runs).
#
# Environment:
#   OVERRIDE_YAML  override file passed to teuthology-suite
#                  (default: /home/ubuntu/override.yaml)

# Absolute directory of this script; logs are kept in a subdirectory next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOG_DIR="$SCRIPT_DIR/logs"
# Keep a caller-supplied OVERRIDE_YAML; otherwise fall back to the default path.
: "${OVERRIDE_YAML:=/home/ubuntu/override.yaml}"
# One log file per invocation, named after the start time.
LOG_FILE="$LOG_DIR/rerun-smoke-$(date +%Y%m%d-%H%M%S).log"

mkdir -p "$LOG_DIR"
# Work from the script directory from here on.
cd "$SCRIPT_DIR"

# Print a timestamped message to stdout and append the same line to $LOG_FILE.
# All arguments are joined into one message.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s\n' "[${stamp}] $*" | tee -a "$LOG_FILE"
}

# Get the branch from a run name
# (e.g. ...-smoke-main-distro-... or ...-smoke-tentacle-distro-...).
# Arguments: $1 - run name
# Outputs:   the branch name on stdout; "main" when the run name carries no
#            "-smoke-<branch>-distro-" marker.
get_branch_from_run_name() {
  local run_name="$1"
  # Preferred form: a branch without hyphens (main, tentacle, ...).
  local strict_re='-smoke-([^-]+)-distro-'
  # Fallback for hyphenated branches (e.g. wip-foo). Tried only when the
  # strict form does not match, so names that matched before are unaffected.
  local loose_re='-smoke-(.+)-distro-'

  if [[ "$run_name" =~ $strict_re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  elif [[ "$run_name" =~ $loose_re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  else
    printf '%s\n' "main"
  fi
}

# Count fail+dead jobs of a run and look up the sha1 the run was built from.
# Arguments: $1 - run name
# Outputs:   stdout line 1 = number of jobs with status fail or dead
#                            (0 for an empty run name or a run without jobs,
#                            -1 when the job query itself failed);
#            stdout line 2 = sha1 (or suite_sha1) found in the run archive,
#                            omitted when none was found.
# Returns:   0 on success, 1 when the job query failed.
count_fail_dead_and_get_sha() {
  local run_name="$1"
  # stderr is discarded on purpose: callers parse stdout line by line and
  # failures are reported through the "-1" sentinel and the exit status.
  RUN_NAME="$run_name" python3 <<'PYEOF' 2>/dev/null
import os
import sys

# Per-job files that may record the sha1, in lookup order.
CANDIDATE_FILES = ('config.yaml', 'info.yaml', 'orig.config.yaml')


def find_sha1(run_archive):
    """Return the first sha1/suite_sha1 found in a numeric job dir, else None."""
    import yaml
    if not os.path.isdir(run_archive):
        return None
    for job_id in sorted(os.listdir(run_archive)):
        if not job_id.isdigit():
            continue
        for file_name in CANDIDATE_FILES:
            path = os.path.join(run_archive, job_id, file_name)
            if not os.path.isfile(path):
                continue
            try:
                with open(path) as f:
                    data = yaml.safe_load(f)
            except Exception:
                # Unreadable or invalid file: try the next candidate.
                continue
            if isinstance(data, dict):
                found = data.get('sha1') or data.get('suite_sha1')
                if found:
                    return found
    return None


run_name = os.environ.get('RUN_NAME', '')
if not run_name:
    print('0')
    sys.exit(0)

# Step 1: the job query. Only a failure here is reported as -1.
try:
    from teuthology.config import config
    from teuthology.report import ResultsReporter
    jobs = ResultsReporter().get_jobs(run_name, fields=['status'])
    count = sum(1 for j in (jobs or []) if j.get('status') in ('fail', 'dead'))
except Exception:
    print('-1')
    sys.exit(1)

print(count)
if not jobs:
    sys.exit(0)

# Step 2: best-effort sha1 lookup. A failure here must not emit a second
# line, because callers treat line 2 as the sha1.
try:
    archive_base = getattr(config, 'archive_base', '') or ''
    sha1 = find_sha1(os.path.join(archive_base, run_name))
except Exception:
    sha1 = None
if sha1:
    print(sha1)
PYEOF
}

# --check RUN_NAME: print branch, fail+dead count, and sha1 then exit (no rerun).
# Exits 1 on a missing RUN_NAME or when the job query fails, 0 otherwise.
if [[ "${1:-}" == "--check" ]]; then
  run_name="${2:-}"
  if [[ -z "$run_name" ]]; then
    echo "Usage: $0 --check RUN_NAME" >&2
    exit 1
  fi
  branch=$(get_branch_from_run_name "$run_name")
  echo "Run: $run_name"
  echo "Branch: $branch"
  # The helper returns non-zero when the query fails. Without `|| true` the
  # script's `set -e` would abort right here, before anything is reported.
  out=$(count_fail_dead_and_get_sha "$run_name") || true
  fail_dead=$(head -n 1 <<<"$out")
  sha1=$(sed -n '2p' <<<"$out")
  # Anything but a non-negative integer (e.g. "-1" or empty output) means the
  # job query did not succeed.
  if [[ ! "$fail_dead" =~ ^[0-9]+$ ]]; then
    echo "ERROR: Could not query jobs for run $run_name (server unreachable or run missing)" >&2
    exit 1
  fi
  echo "Fail+dead count: $fail_dead"
  if [[ -n "$sha1" ]]; then
    echo "SHA1 (from run): $sha1"
  else
    echo "SHA1 (from run): (not found in archive)"
  fi
  exit 0
fi

#######################################
# Rerun a single run if it has any fail or dead jobs (once only), then wait
# for the rerun to finish.
# Globals:   OVERRIDE_YAML (read; split on whitespace into extra arguments),
#            LOG_FILE (appended to)
# Arguments: $1 - name of the original run
# Returns:   0 when nothing needed rerunning, when the rerun completed, or
#            when the rerun was scheduled but its name could not be parsed;
#            1 when the job query, the sha lookup, the scheduling or the
#            rerun itself failed.
#######################################
do_rerun_for_run() {
  local run_name="$1"
  local branch
  branch=$(get_branch_from_run_name "$run_name")
  log "Run: $run_name (branch=$branch)"

  local out fail_dead shaman_id
  # A failed query is detected from the output below, not the exit status.
  out=$(count_fail_dead_and_get_sha "$run_name") || true
  fail_dead=$(head -n 1 <<<"$out")
  shaman_id=$(sed -n '2p' <<<"$out")

  # Only a non-negative integer is a valid count. "-1" and empty output both
  # mean the query failed; an empty value must not be mistaken for zero.
  if [[ ! "$fail_dead" =~ ^[0-9]+$ ]]; then
    log "WARNING: Could not query jobs for run $run_name (server unreachable or run missing), skipping"
    return 1
  fi
  if (( 10#$fail_dead == 0 )); then
    log "No fail/dead jobs for $run_name, skipping rerun"
    return 0
  fi

  log "Found $fail_dead fail/dead job(s); scheduling one-time rerun..."

  # The archive value ends up on the command line, so accept only something
  # that looks like a commit hash; otherwise fall back to the latest build.
  if [[ -n "$shaman_id" && ! "$shaman_id" =~ ^[0-9a-fA-F]+$ ]]; then
    log "WARNING: Ignoring malformed sha1 from run archive: $shaman_id"
    shaman_id=""
  fi

  # Use the original run's Ceph/suite SHA (from archive); only fall back to latest build if missing
  if [[ -z "$shaman_id" ]]; then
    local tmp_err
    if ! tmp_err=$(mktemp); then
      log "ERROR: Could not create a temporary file"
      return 1
    fi
    # Invoked by relative path, so this depends on the current directory.
    if ! shaman_id=$(python3 getUpstreamBuildDetails.py \
      --branch "$branch" \
      --platform ubuntu-jammy-default,centos-9-default \
      --arch x86_64 2>"$tmp_err"); then
      log "ERROR: No sha1 in run archive and failed to get upstream build for branch $branch:"
      tee -a "$LOG_FILE" <"$tmp_err"
      rm -f -- "$tmp_err"
      return 1
    fi
    rm -f -- "$tmp_err"
    if [[ -z "$shaman_id" ]]; then
      log "ERROR: Upstream build lookup for branch $branch returned no sha1"
      return 1
    fi
    log "Using latest ceph/suite sha for branch $branch (run archive had no sha1): $shaman_id"
  else
    log "Using original run ceph/suite sha: $shaman_id"
  fi

  # OVERRIDE_YAML is split on whitespace, so several override files may be
  # listed in it; paths containing spaces are not supported.
  local -a overrides=()
  IFS=$' \t\n' read -r -a overrides <<<"${OVERRIDE_YAML:-}"

  # Rerun only fail and dead jobs; use same SHA for build and suite.
  # Suite and rerun-statuses (fail,dead) come from run metadata / defaults.
  # The command is an argument array rather than an eval'ed string, so the
  # run name and branch are always passed literally.
  local -a cmd=(
    teuthology-suite
    --rerun "$run_name"
    -c "$branch"
    -m openstack
    --ceph-repo https://github.com/ceph/ceph
    --priority 50
    --force-priority
    --sha1 "$shaman_id"
    --suite-sha1 "$shaman_id"
  )
  if (( ${#overrides[@]} > 0 )); then
    cmd+=("${overrides[@]}")
  fi

  log "Running: $(printf '%q ' "${cmd[@]}")"
  local suite_output
  if ! suite_output=$(mktemp); then
    log "ERROR: Could not create a temporary file"
    return 1
  fi
  if ! "${cmd[@]}" >"$suite_output" 2>&1; then
    log "ERROR: Rerun scheduling failed for $run_name"
    cat -- "$suite_output" >>"$LOG_FILE"
    rm -f -- "$suite_output"
    return 1
  fi
  cat -- "$suite_output" >>"$LOG_FILE"

  # Take the rerun's name from the first "Job scheduled with name ..." line.
  local rerun_name="" line
  local name_re='Job scheduled with name ([^[:space:]]+)'
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $name_re ]]; then
      rerun_name=${BASH_REMATCH[1]}
      break
    fi
  done <"$suite_output"
  rm -f -- "$suite_output"

  if [[ -z "$rerun_name" ]]; then
    log "WARNING: Could not extract rerun name from output"
    return 0
  fi
  log "Rerun scheduled: $rerun_name (one-time; no automatic retry if this fails)"

  log "Waiting for rerun to complete..."
  if ! teuthology-wait --run "$rerun_name" >>"$LOG_FILE" 2>&1; then
    log "WARNING: Rerun $rerun_name finished with failures (no further automatic retries)"
    return 1
  fi
  log "Rerun $rerun_name completed"
  return 0
}

# Resolve list of run names: from args or from today's smoke-runs file
run_names=()
if [[ $# -ge 1 ]]; then
  run_names=("$@")
  log "Using run name(s) from arguments: ${run_names[*]}"
else
  runs_file="${LOG_DIR}/smoke-runs-$(date '+%Y-%m-%d')"
  if [[ ! -f "$runs_file" ]]; then
    log "No run name given and no file $runs_file (run run-daily-smoke.sh first or pass RUN_NAME)"
    exit 1
  fi
  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Drop carriage returns (file may have CRLF line endings).
    line=${line//$'\r'/}
    if [[ -z "$line" ]]; then
      continue
    fi
    # Avoid duplicate run names (e.g. same day manual + cron). Names are
    # compared exactly, one by one.
    is_dup=0
    for seen in ${run_names[@]+"${run_names[@]}"}; do
      if [[ "$seen" == "$line" ]]; then
        is_dup=1
        break
      fi
    done
    if (( ! is_dup )); then
      run_names+=("$line")
    fi
  done <"$runs_file"
  # The `-` default keeps an empty array from tripping `set -u` on older bash.
  log "Using run name(s) from $runs_file: ${run_names[*]-}"
fi

log "=========================================="
log "Rerun failed/dead smoke jobs (one-time)"
log "=========================================="

if (( ${#run_names[@]} == 0 )); then
  log "No run names to process"
fi

# The exit code is 1 if any run failed to be checked, scheduled or rerun.
exit_code=0
for run_name in ${run_names[@]+"${run_names[@]}"}; do
  if ! do_rerun_for_run "$run_name"; then
    exit_code=1
  fi
  log ""
done

log "=========================================="
log "Rerun smoke script finished (exit_code=$exit_code)"
log "=========================================="
exit "$exit_code"
37 changes: 32 additions & 5 deletions run-daily-smoke.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,13 @@ run_smoke_for_branch() {
return 1
fi

log "Using shaman build id for branch $branch: $shaman_id"
log "Using shaman build id (ceph sha) for branch $branch: $shaman_id"
rm -f "$tmp_err"


# Use same SHA for suite so QA tests match the installed Ceph build (avoids
# e.g. ImportError when suite expects symbols not in the build's Python bindings)
local suite_sha1="$shaman_id"

# Upload shaman_id to remote server
sshpass -p "admin" ssh -o StrictHostKeyChecking=no cloud-user@10.0.196.233 \
"sudo mkdir -p /data/scheduler/cron && echo '${shaman_id}' | sudo tee /data/scheduler/cron/${branch}-$(date "+%Y-%m-%d") > /dev/null" 2>&1 | tee -a "$LOG_FILE"
Expand All @@ -75,12 +79,13 @@ run_smoke_for_branch() {
local suite="smoke"
local seed=8446

log "Starting smoke suite for branch $branch with seed=$seed"
log "Starting smoke suite for branch $branch with seed=$seed (ceph sha=$shaman_id, suite sha=$suite_sha1)"

# Capture timestamp
local timestamp=$(date "+%Y-%m-%d_%H:%M:%S")

# Build and run command
# Build and run command: use same SHA for both --sha1 (Ceph build) and --suite-sha1 (QA)
# to avoid build/suite mismatch (e.g. RBD_LOCK_MODE_EXCLUSIVE_TRANSIENT).
local cmd="teuthology-suite \
--suite \"$suite\" \
--machine-type openstack \
Expand All @@ -90,6 +95,7 @@ run_smoke_for_branch() {
--force-priority \
--seed $seed \
--sha1 $shaman_id \
--suite-sha1 $suite_sha1 \
$OVERRIDE_YAML"

log "Running command: $cmd"
Expand Down Expand Up @@ -118,7 +124,12 @@ run_smoke_for_branch() {
fi

log "Using run name: $run_name"


# Record run name for rerun-failed-smoke.sh (rerun fail/dead jobs once)
local runs_file="$LOG_DIR/smoke-runs-$(date '+%Y-%m-%d')"
echo "$run_name" >> "$runs_file"
log "Recorded run name to $runs_file"

# Wait for run to be registered
log "Waiting 10 seconds for run to be registered on server..."
sleep 10
Expand Down Expand Up @@ -221,6 +232,22 @@ else
fi
log ""

# Give fail/dead jobs one rerun, driven by the logs/smoke-runs-YYYY-MM-DD file.
# NOTE(review): the date is recomputed here, so this presumably assumes the
# whole script finishes on the day it started -- confirm.
log "=========================================="
log "Checking for fail/dead jobs and rerunning once if needed..."
log "=========================================="
runs_file="${LOG_DIR}/smoke-runs-$(date '+%Y-%m-%d')"
if [[ ! -f "$runs_file" ]]; then
  log "No smoke-runs file found ($runs_file), skipping rerun check"
elif "$SCRIPT_DIR/rerun-failed-smoke.sh" >> "$LOG_FILE" 2>&1; then
  # The rerun script's own output goes to the log file only.
  log "✓ Rerun check completed"
else
  log "✗ Rerun check had errors (see log)"
fi
log ""

log "=========================================="
log "Daily smoke suite execution completed"
log "=========================================="
Loading