diff --git a/.github/workflows/deployment-test.yaml b/.github/workflows/deployment-test.yaml
new file mode 100644
index 000000000..b796e138e
--- /dev/null
+++ b/.github/workflows/deployment-test.yaml
@@ -0,0 +1,1391 @@
+name: Deployment Test (Azure)
+
+# Cloud deployment-test gate. Drives
+# `deployments/scripts/run-deployment-test.sh` end-to-end against an ephemeral
+# cloud cluster (Azure today; other providers to follow). Three modes, listed
+# from cheapest to most expensive to set up:
+#
+#   1. `init-only` (~30s, no cloud setup): terraform init + validate + fmt
+#      against the Azure example module. Downloads providers and checks HCL
+#      syntax; makes ZERO Azure API calls. Use it to shake out the workflow
+#      shape before any cloud-side setup exists.
+#   2. `auth-check` (~2 min, requires OIDC + Azure App Reg): adds terraform
+#      plan. First step that actually touches Azure — confirms the federated-
+#      identity → service-principal → RBAC chain.
+#   3. `full-deployment` (~45 min, requires everything in #2): runs
+#      `deployments/scripts/run-deployment-test.sh --provider azure`
+#      end-to-end. The Postgres password is generated per run by the tf-apply
+#      job, so no POSTGRES_PASSWORD secret has to be configured.
+#
+# Triggers:
+#   - `workflow_dispatch` — once this file lands on the default branch, the
+#     "Run workflow" button in Actions becomes available for all three modes.
+#   - `pull_request` — `init-only` runs automatically on every PR event listed
+#     under `types` below, as long as the PR touches this workflow, the
+#     wrapper script, or the Azure terraform module. The full-deployment path
+#     is gated behind a PR label (see below) so it doesn't burn Azure quota
+#     on every push.
+#
+# PR-label trigger (works pre-merge when the dispatcher isn't registered yet):
+#   - `ci:azure-deployment` → full-deployment runs on PR events while the
+#     label is present (the next push, or the `labeled` event itself).
+#   auth-check is workflow_dispatch only — it's a developer-driven smoke for
+#   the OIDC chain, not something we want to run automatically per PR.
+#
+# Scheduled trigger (PRIMARY mode of operation on main):
+#   - Daily at 00:00 UTC = 5pm PDT (16:00 PST during winter — GitHub cron
+#     is UTC, doesn't adjust for DST). github.event_name='schedule' runs
+#     build-images + full-deployment end-to-end on main, the same path
+#     the PR-label gate exercises. Schedule events fire only from the
+#     repo's default branch (main) — they don't run for forks or
+#     feature branches.
+#
+# Slack notification (failure-only, schedule-only):
+#   - notify-slack-on-azure-deployment-test-failure posts to the channel
+#     named by `vars.CI_SLACK_CHANNEL` (fallback `osmo-slack-test`) using
+#     `OSMO_SLACK_BOT_TOKEN` (xoxb- bot token with chat:write scope) via
+#     Slack `chat.postMessage`. Override the variable at repo/org level to
+#     redirect the noise (e.g. to #osmo-oncall once this gate goes
+#     prod-ready).
+#   - Fires only on scheduled-run failures. PR-label and workflow_dispatch
+#     runs surface their own status interactively.
+#   - If the secret is unset or the API returns non-ok, the step logs a
+#     warning and exits 0 — the gate's overall status is unaffected.
+
+on:
+  workflow_dispatch:
+    inputs:
+      mode:
+        description: 'What to run'
+        type: choice
+        required: true
+        default: init-only
+        options:
+          - init-only
+          - auth-check
+          - full-deployment
+  pull_request:
+    branches:
+      - main
+    types:
+      - opened
+      - synchronize
+      - reopened
+      - labeled
+    paths:
+      - '.github/workflows/deployment-test.yaml'
+      - 'deployments/scripts/run-deployment-test.sh'
+      - 'deployments/terraform/azure/**'
+  schedule:
+    # Daily at 00:00 UTC = 5pm PDT (16:00 PST during winter — GitHub cron
+    # is UTC, doesn't track DST). Schedule fires only on main, not on
+    # feature branches.
+    - cron: '0 0 * * *'
+
+# OIDC federation to Azure — no static Azure credentials in this workflow.
+# `id-token: write` lets the runner mint a JWT that Azure trusts via the
+# Federated Identity Credential on the App Registration. The federated
+# credential is bound to the `internal-ci` GitHub environment (subject =
+# `repo:NVIDIA/OSMO:environment:internal-ci`), so the auth-check and
+# full-deployment jobs must declare `environment: internal-ci` for the
+# subject claim to match.
+# Environment-scoped Variables (vars.AZURE_*) also resolve only inside jobs
+# that declare that environment.
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  # Cheapest mode — no Azure setup needed. terraform init downloads the
+  # azurerm provider plugin from the Terraform Registry (HTTPS, no Azure
+  # API call). terraform validate + fmt are purely local.
+  init-only:
+    # Bare expression under `>-` on purpose. Wrapping the condition in
+    # `${{ }}` inside a `>` block scalar leaves a trailing newline outside
+    # the expression, which turns the value into a non-empty string that is
+    # always truthy — the job would then also run on schedule events and on
+    # every other dispatch mode.
+    if: >-
+      github.event_name == 'pull_request'
+      || github.event.inputs.mode == 'init-only'
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    defaults:
+      run:
+        working-directory: deployments/terraform/azure/example
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: 1.9.8
+
+      - name: terraform init (no Azure auth required)
+        run: terraform init -input=false
+
+      - name: terraform validate
+        run: terraform validate -no-color
+
+      # fmt is informational only — formatting drift in the existing Azure
+      # example is out of scope for this PR and the run-deployment-test
+      # wrapper doesn't care about cosmetic formatting. `|| true` keeps a
+      # formatting diff from failing the job.
+      - name: terraform fmt -check (informational)
+        run: terraform fmt -check -recursive -no-color || true
+
+  # First step that actually talks to Azure — terraform plan reads the
+  # resource group via the azurerm_resource_group data source. Requires
+  # the full OIDC + App Reg + RBAC setup. Provisions nothing.
+  auth-check:
+    # Manual-dispatch only: runs when the `mode` input is `auth-check`.
+    if: github.event.inputs.mode == 'auth-check'
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    # Must match the environment the federated credential is bound to.
+    environment: internal-ci
+    defaults:
+      run:
+        working-directory: deployments/terraform/azure/example
+    env:
+      # azurerm provider settings: authenticate via the runner's OIDC token.
+      ARM_USE_OIDC: 'true'
+      ARM_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }}
+      ARM_TENANT_ID: ${{ vars.AZURE_TENANT_ID }}
+      ARM_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: 1.9.8
+
+      - name: terraform init
+        run: terraform init -input=false
+
+      - name: terraform plan (against osmo-deployment-ci-rg, plan-only)
+        env:
+          RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }}
+          AZURE_REGION: ${{ vars.AZURE_REGION || 'eastus2' }}
+        run: |
+          # The module declares postgres_password without a default, so plan
+          # needs some value for it. A placeholder is enough: the real value
+          # only matters to `terraform apply`, which auth-check never runs.
+          terraform plan \
+            -input=false \
+            -var "subscription_id=${ARM_SUBSCRIPTION_ID}" \
+            -var "resource_group_name=${RESOURCE_GROUP}" \
+            -var "azure_region=${AZURE_REGION}" \
+            -var "postgres_password=auth-check-placeholder-not-applied" \
+            -no-color
+
+  # Build OSMO service + backend images from THIS PR's source and push them
+  # to ghcr.io so the deployment-test below verifies the actual diff, not
+  # whatever's currently published at nvcr.io/nvidia/osmo:latest. Without
+  # this job the gate is meaningless for service-code PRs (it always tests
+  # the published `latest`, never the proposed change). Sequenced before
+  # full-deployment via `needs:`.
+  build-images:
+    # Runs for: the nightly schedule, a manual `full-deployment` dispatch, or
+    # a PR carrying the `ci:azure-deployment` label.
+    #
+    # Bare expression under `>-` on purpose. Wrapping the condition in
+    # `${{ }}` inside a `>` block scalar leaves a trailing newline outside
+    # the expression, which makes the whole value a non-empty (always
+    # truthy) string — every unlabeled PR run would then build images and,
+    # through `needs:`, provision Azure resources.
+    if: >-
+      github.event_name == 'schedule'
+      || github.event.inputs.mode == 'full-deployment'
+      || (github.event_name == 'pull_request'
+      && contains(github.event.pull_request.labels.*.name, 'ci:azure-deployment'))
+    runs-on: ubuntu-latest
+    timeout-minutes: 90
+    permissions:
+      contents: read
+      packages: write
+    outputs:
+      image_registry: ${{ steps.tag.outputs.registry }}
+      image_tag: ${{ steps.tag.outputs.tag }}
+    steps:
+      # rules_oci + ~10 service images on a stock GHA runner needs ~25 GB
+      # of free disk; default ubuntu-latest is ~14 GB free. Same recipe
+      # as pr-checks.yaml's ci-public.
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /usr/local/.ghcup /opt/hostedtoolcache/CodeQL || true
+          sudo docker image prune --all --force || true
+          df -h
+
+      - uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      # Same setup-bazel pin + external-cache manifest as pr-checks.yaml.
+      # disk-cache is keyed per-workflow so we don't share cache state with
+      # ci-public/ci-internal (different bazel targets, different shape).
+      - name: Setup Bazel
+        uses: bazel-contrib/setup-bazel@4fd964a13a440a8aeb0be47350db2fc640f19ca8
+        with:
+          bazelisk-cache: true
+          bazelisk-version: 1.27.0
+          disk-cache: ${{ github.workflow }}-images
+          repository-cache: true
+          external-cache: |
+            manifest:
+              osmo_python_deps: src/locked_requirements.txt
+              osmo_tests_python_deps: src/tests/locked_requirements.txt
+              osmo_mypy_deps: bzl/mypy/locked_requirements.txt
+              pylint_python_deps: bzl/linting/locked_requirements.txt
+              io_bazel_rules_go: src/runtime/go.mod
+              bazel_gazelle: src/runtime/go.sum
+
+      # GHCR auth for rules_oci's `oci_push` (reads ~/.docker/config.json).
+      # GITHUB_TOKEN gets packages:write for this repo automatically.
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # Tag layout:
+      #   ghcr.io/<owner-lowercase>/osmo-ci/<image>:pr-<pr-number|run-id>-<attempt>-amd64
+      # The `-amd64` suffix is appended by rules_oci's per-arch oci_push;
+      # we expose the FULL tag (with suffix) so downstream uses match the
+      # actual remote tag. Non-PR events have no PR number, so the run id
+      # takes its place.
+      - id: tag
+        run: |
+          PR_NUM="${{ github.event.pull_request.number || github.run_id }}"
+          ATTEMPT="${{ github.run_attempt }}"
+          OWNER_LC=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
+          TAG_BASE="pr-${PR_NUM}-${ATTEMPT}"
+          echo "registry=ghcr.io/${OWNER_LC}/osmo-ci" >> "$GITHUB_OUTPUT"
+          echo "tag_base=${TAG_BASE}" >> "$GITHUB_OUTPUT"
+          echo "tag=${TAG_BASE}-amd64" >> "$GITHUB_OUTPUT"
+
+      # Minimal --no-gpu image set: 8 service images + client + init-container
+      # + the two backend-operator images (12 pushes in total).
+      # The public repo has no //ci:push_images orchestrator (that's GitLab-CI
+      # only — it lives in the internal repo's `ci/` dir). Iterate the
+      # per-target oci_push rules directly. Each accepts --repository and
+      # --tag at `bazel run` time, so we don't need to mutate the constants
+      # repo to redirect from nvcr.io to ghcr.io.
+      - name: Build and push OSMO images
+        env:
+          REMOTE_CACHE: ${{ secrets.BAZEL_REMOTE_CACHE_URL }}
+          REG: ${{ steps.tag.outputs.registry }}
+          TAG: ${{ steps.tag.outputs.tag }}
+        run: |
+          set -euo pipefail
+          # Optional remote cache: an empty array expands to no arguments.
+          CACHE_FLAG=()
+          if [[ -n "${REMOTE_CACHE:-}" ]]; then
+            CACHE_FLAG=(--remote_cache="$REMOTE_CACHE")
+            echo "::notice::Using bazel remote cache"
+          else
+            echo "::warning::BAZEL_REMOTE_CACHE_URL not set — cold build will be slow (~60 min)"
+          fi
+
+          # push_one <bazel-target> <image-name>: build the target and push
+          # it to $REG/<image-name>:$TAG inside a collapsible log group.
+          push_one() {
+            local target="$1" image="$2"
+            echo "::group::$image → $REG/$image:$TAG"
+            echo "▶ $(date -u +%H:%M:%S) bazel run $target"
+            bazel run --config=ci "${CACHE_FLAG[@]}" "$target" -- \
+              --repository "$REG/$image" \
+              --tag "$TAG"
+            echo "::endgroup::"
+          }
+
+          # SERVICE_IMAGES (per chart's deployment templates)
+          push_one //src/service/core:service_push_x86_64 service
+          push_one //src/service/logger:logger_push_x86_64 logger
+          push_one //src/service/agent:agent_service_push_x86_64 agent
+          push_one //src/service/authz_sidecar:authz_sidecar_push_x86_64 authz-sidecar
+          push_one //src/service/router:router_push_x86_64 router
+          push_one //src/service/worker:worker_push_x86_64 worker
+          push_one //src/service/delayed_job_monitor:delayed_job_monitor_push_x86_64 delayed-job-monitor
+          # web-ui uses sh_binary + docker buildx (not oci_push); same flag shape
+          push_one //src/ui:build_push_web_ui_x86_64 web-ui
+          # BACKEND_IMAGES the chart's backend_images.{init,client} reference
+          push_one //src/cli:cli_push_x86_64 client
+          push_one //src/runtime:init_push_x86_64 init-container
+          # backend-operator chart deploys these two: without them, the
+          # operator install hits ImagePullBackOff and helm `--wait` times
+          # out with `context deadline exceeded`. backend-test-runner is
+          # only spawned at test-run time (not at install) and stays at
+          # nvcr.io defaults unless --backend-test-runner-* overrides flow
+          # in — skip for now to keep the build minimal.
+          push_one //src/operator:backend_listener_push_x86_64 backend-listener
+          push_one //src/operator:backend_worker_push_x86_64 backend-worker
+
+      # GitHub Container Registry creates packages as PRIVATE on first push.
+      # Subsequent pushes inherit visibility. AKS would hit ImagePullBackOff
+      # without auth, which is why the full-deployment job pre-creates an
+      # imagePullSecret using GITHUB_TOKEN. (Setting packages to public is
+      # an admin-only API call requiring admin:packages PAT scope — out of
+      # this workflow's permissions surface.)
+      #
+      # Keep the image list below in sync with the push_one calls above.
+      - name: Step summary
+        run: |
+          {
+            echo "### OSMO images built from source"
+            echo ""
+            echo "- Registry: \`${{ steps.tag.outputs.registry }}\`"
+            echo "- Tag: \`${{ steps.tag.outputs.tag }}\`"
+            echo "- Source SHA: \`${{ github.sha }}\`"
+            echo ""
+            echo "Packages pushed:"
+            for img in service logger agent authz-sidecar router worker delayed-job-monitor web-ui client init-container backend-listener backend-worker; do
+              echo " - \`${{ steps.tag.outputs.registry }}/$img:${{ steps.tag.outputs.tag }}\`"
+            done
+          } >> "$GITHUB_STEP_SUMMARY"
+
+  # ── Stage 1: terraform apply ─────────────────────────────────────────────
+  # Provisions AKS + Postgres flex + Managed Redis in `vars.AZURE_REGION`.
+  # Uploads the resulting tfstate + tfvars as artifacts so the `tf-destroy`
+  # job at the end can clean up regardless of what fails in between.
+  # POSTGRES_PASSWORD is generated here and written into the tfvars file
+  # that's uploaded as part of the `tf-state-<run_id>` artifact. The
+  # deploy/oetf jobs download that artifact and grep the password out —
+  # cross-job job-outputs don't work for masked values (GitHub filters
+  # them out, so the receiving job sees an empty string).
+ tf-apply: + needs: build-images + if: ${{ needs.build-images.result == 'success' }} + runs-on: ubuntu-latest + timeout-minutes: 30 + environment: internal-ci + env: + ARM_USE_OIDC: true + ARM_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }} + ARM_TENANT_ID: ${{ vars.AZURE_TENANT_ID }} + ARM_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v4 + + - name: azure login (OIDC) + uses: azure/login@v2 + with: + client-id: ${{ vars.AZURE_CLIENT_ID }} + tenant-id: ${{ vars.AZURE_TENANT_ID }} + subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }} + + - uses: hashicorp/setup-terraform@v3 + with: + terraform_version: 1.9.8 + + - name: install kubectl + run: | + set -euo pipefail + KUBECTL_VERSION=v1.31.0 + curl -fsSLo /tmp/kubectl \ + "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl" + curl -fsSL "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl.sha256" \ + | awk '{print $1" /tmp/kubectl"}' | sha256sum -c - + sudo install -m 0755 /tmp/kubectl /usr/local/bin/kubectl + + - name: environment snapshot + env: + AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} + AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }} + AZURE_REGION: ${{ vars.AZURE_REGION || 'eastus2' }} + AZURE_CLUSTER_NAME: ${{ vars.AZURE_CLUSTER_NAME || 'osmo-deployment-test' }} + run: | + echo "::group::az identity"; az account show -o table || true; echo "::endgroup::" + echo "::group::tool versions"; terraform version; az version 2>&1 | head -5; echo "::endgroup::" + echo "::group::target RG"; az group show --name "$AZURE_RESOURCE_GROUP" -o table || \ + echo "(RG not found)"; echo "::endgroup::" + echo "::group::env (non-secret)" + echo "AZURE_SUBSCRIPTION_ID=$AZURE_SUBSCRIPTION_ID" + echo "AZURE_RESOURCE_GROUP=$AZURE_RESOURCE_GROUP" + echo "AZURE_REGION=$AZURE_REGION" + echo "AZURE_CLUSTER_NAME=$AZURE_CLUSTER_NAME" + echo "::endgroup::" + + - name: generate per-run postgres password + 
id: gen_pg + run: | + PG_PASS="$(openssl rand -base64 32 | tr -d '/=+' | head -c 32)Aa1!" + echo "::add-mask::$PG_PASS" + echo "value=$PG_PASS" >> "$GITHUB_OUTPUT" + + # Single source of truth for the TF inputs the apply + destroy steps + # use. Stored in $RUNNER_TEMP (per-job; this job uploads as artifact + # for the destroy job to download). Non-default values: + # - aks_private_cluster_enabled=false GHA runners are public-net. + # - node_instance_type=Standard_D8s_v3 D4s_v3 left K8_CPU=0 after + # Azure daemons + OSMO sidecars. + # - node_group_min_size=3 headroom for scenario tests. + - name: build TF var file + env: + AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} + AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }} + AZURE_REGION: ${{ vars.AZURE_REGION || 'eastus2' }} + AZURE_CLUSTER_NAME: ${{ vars.AZURE_CLUSTER_NAME || 'osmo-deployment-test' }} + PG_PASS: ${{ steps.gen_pg.outputs.value }} + run: | + cat > "$RUNNER_TEMP/azure.tfvars" </dev/null || true) + if [ -z "$existing" ]; then + echo "::error::resource group '$AZURE_RESOURCE_GROUP' not found (or SP lacks read access)." + exit 1 + elif [ "$existing" != "$AZURE_REGION" ]; then + echo "::error::RG '$AZURE_RESOURCE_GROUP' lives in '$existing' but workflow expects '$AZURE_REGION'." + exit 1 + fi + echo "::notice::RG $AZURE_RESOURCE_GROUP confirmed in $AZURE_REGION" + + # If a prior run was killed mid-destroy, resources may exist in the + # RG without matching TF state — `terraform apply` would then fail + # with "Resource already exists, import into state". Wipe leftovers. 
+      # Deletes EVERYTHING currently in the resource group, then polls until
+      # the group is empty.
+      # NOTE(review): the 30-minute polling budget below equals this job's
+      # `timeout-minutes`, so a slow cleanup is cut off by the job timeout
+      # before the "cleanup timed out" branch can report — confirm the
+      # intended budget.
+      - name: TEMP — pre-apply cleanup (delete leftover resources in RG)
+        env:
+          AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }}
+        run: |
+          set -euo pipefail
+          echo "▶ $(date -u +%H:%M:%S) checking for leftover resources in $AZURE_RESOURCE_GROUP"
+          IDS=$(az resource list --resource-group "$AZURE_RESOURCE_GROUP" --query '[].id' -o tsv || true)
+          if [ -z "$IDS" ]; then
+            echo "::notice::resource group is clean — nothing to delete"
+            exit 0
+          fi
+          echo "::warning::found $(echo "$IDS" | wc -l) leftover resource(s) from a prior partial run"
+          echo "$IDS"
+
+          # fire_deletes <newline-separated ids> <max output lines per call>:
+          # start one background `az resource delete --no-wait` per id and
+          # wait for the submissions (not the deletions) to return.
+          fire_deletes() {
+            local ids="$1" budget="$2"
+            while IFS= read -r id; do
+              [ -z "$id" ] && continue
+              az resource delete --ids "$id" --no-wait 2>&1 | head -"$budget" &
+            done <<< "$ids"
+            wait
+          }
+
+          echo "▶ $(date -u +%H:%M:%S) firing async deletes (--no-wait)"
+          fire_deletes "$IDS" 2
+
+          echo "▶ $(date -u +%H:%M:%S) polling until RG is empty (max 30 min)"
+          deadline=$(( $(date +%s) + 1800 ))
+          last_refire=$(date +%s)
+          while [ "$(date +%s)" -lt "$deadline" ]; do
+            ids_now=$(az resource list --resource-group "$AZURE_RESOURCE_GROUP" --query '[].id' -o tsv || true)
+            # `grep -c .` counts non-empty lines; it exits 1 on zero matches,
+            # hence `|| true`.
+            count=$(echo -n "$ids_now" | grep -c . || true)
+            echo " $(date -u +%H:%M:%S) remaining: $count"
+            [ "$count" = "0" ] && break
+
+            # Re-submit the deletes every 5 minutes for whatever is left.
+            now=$(date +%s)
+            if [ $(( now - last_refire )) -ge 300 ]; then
+              echo " $(date -u +%H:%M:%S) ↻ re-firing deletes on $count remaining resource(s)"
+              fire_deletes "$ids_now" 1
+              last_refire=$now
+            fi
+            sleep 30
+          done
+
+          if [ "$count" != "0" ]; then
+            echo "::error::cleanup timed out — $count resource(s) still present"
+            az resource list --resource-group "$AZURE_RESOURCE_GROUP" -o table
+            exit 1
+          fi
+          echo "::notice::cleanup complete"
+
+      - name: TEMP — terraform apply (provision AKS + Postgres + Redis)
+        working-directory: deployments/terraform/azure/example
+        env:
+          AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }}
+          AZURE_REGION: ${{ vars.AZURE_REGION || 'eastus2' }}
+          AZURE_CLUSTER_NAME: ${{ vars.AZURE_CLUSTER_NAME || 'osmo-deployment-test' }}
+        run: |
+          set -euo pipefail
+
+          # Stash state file inside the workspace so upload-artifact can find
+          # it. Runs from an EXIT trap so it also happens when init/apply
+          # fails part-way: with `set -e` a failed apply aborts the script
+          # immediately, and the `if: always()` upload step below would
+          # otherwise find no tf-state/ directory to upload. The trap does
+          # not call `exit`, so the script's own exit status is preserved.
+          stash_state() {
+            mkdir -p "$GITHUB_WORKSPACE/tf-state"
+            cp terraform.tfstate "$GITHUB_WORKSPACE/tf-state/" 2>/dev/null || true
+            cp .terraform.lock.hcl "$GITHUB_WORKSPACE/tf-state/" 2>/dev/null || true
+            cp "$RUNNER_TEMP/azure.tfvars" "$GITHUB_WORKSPACE/tf-state/" 2>/dev/null || true
+          }
+          trap stash_state EXIT
+
+          echo "::notice::terraform apply starting — expected ~10–15 min (AKS dominates)"
+          echo "::group::terraform init"
+          terraform init -input=false -no-color
+          echo "::endgroup::"
+          echo "::group::terraform apply (streaming)"
+          terraform apply -input=false -auto-approve -no-color -var-file="$RUNNER_TEMP/azure.tfvars"
+          echo "::endgroup::"
+          echo "::group::resources provisioned (terraform state list)"
+          terraform state list || true
+          echo "::endgroup::"
+          # Only reached when apply succeeded.
+          {
+            echo "### TEMP terraform apply ✅"
+            echo ""
+            echo "- AKS: \`${AZURE_CLUSTER_NAME}\` in \`${AZURE_RESOURCE_GROUP}\` (${AZURE_REGION})"
+            echo "- Postgres flex: \`${AZURE_CLUSTER_NAME}-postgres\`"
+            echo "- Redis: \`${AZURE_CLUSTER_NAME}-redis\`"
+            echo "- finished at: $(date -u +%H:%M:%SZ)"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      # Upload terraform state (and the tfvars file) so the tf-destroy job
+      # can download and replay the same plan. `if: always()` so a partial
+      # apply still uploads whatever state exists.
+      - name: upload terraform state + tfvars (for tf-destroy)
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: tf-state-${{ github.run_id }}
+          path: tf-state/
+          retention-days: 7
+          if-no-files-found: warn
+          # upload-artifact@v4 excludes dotfiles by default — that'd drop
+          # `.terraform.lock.hcl`, which deploy-osmo + tf-destroy need to
+          # `terraform init` against the same provider versions tf-apply
+          # used.
+          include-hidden-files: true
+
+  # ── Stage 2: deploy OSMO chart + verify-hello ────────────────────────────
+  # Refreshes kubectl creds against the freshly-applied AKS, pre-creates a
+  # GHCR pull secret, then invokes the wrapper with SKIP_OETF=1 so only
+  # bootstrap + deploy stages run.
+  deploy-osmo:
+    needs: [build-images, tf-apply]
+    if: ${{ needs.tf-apply.result == 'success' }}
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    environment: internal-ci
+    env:
+      ARM_USE_OIDC: true
+      ARM_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }}
+      ARM_TENANT_ID: ${{ vars.AZURE_TENANT_ID }}
+      ARM_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+      RUN_DIR: ${{ github.workspace }}/runs/deployment-test-azure
+      OSMO_IMAGE_REGISTRY: ${{ needs.build-images.outputs.image_registry }}
+      OSMO_IMAGE_TAG: ${{ needs.build-images.outputs.image_tag }}
+      # Name of the image pull secret this job pre-creates in each namespace.
+      NGC_SECRET_NAME: ghcr-pull
+    permissions:
+      id-token: write
+      contents: read
+      packages: read
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: azure login (OIDC)
+        uses: azure/login@v2
+        with:
+          client-id: ${{ vars.AZURE_CLIENT_ID }}
+          tenant-id: ${{ vars.AZURE_TENANT_ID }}
+          subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+
+      # deploy-osmo-minimal.sh (called by the wrapper's stage_deploy) does
+      # an unconditional `command -v terraform` preflight check, even
+      # though --skip-terraform tells it not to actually run terraform.
+      # Install it to satisfy that check.
+      - uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: 1.9.8
+
+      # Both binaries are checksum-verified before being installed.
+      - name: install kubectl + helm
+        run: |
+          set -euo pipefail
+          KUBECTL_VERSION=v1.31.0
+          curl -fsSLo /tmp/kubectl \
+            "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
+          curl -fsSL "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl.sha256" \
+            | awk '{print $1"  /tmp/kubectl"}' | sha256sum -c -
+          sudo install -m 0755 /tmp/kubectl /usr/local/bin/kubectl
+
+          HELM_VERSION=v3.16.2
+          HELM_SHA256=9318379b847e333460d33d291d4c088156299a26cd93d570a7f5d0c36e50b5bb
+          curl -fsSLo /tmp/helm.tgz "https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz"
+          echo "${HELM_SHA256}  /tmp/helm.tgz" | sha256sum -c -
+          tar -xzf /tmp/helm.tgz -C /tmp linux-amd64/helm
+          sudo install -m 0755 /tmp/linux-amd64/helm /usr/local/bin/helm
+
+      # GitHub Actions filters secret/masked values out of cross-job
+      # outputs, so we can't propagate POSTGRES_PASSWORD via
+      # `needs.tf-apply.outputs.*` — the receiving job sees an empty
+      # string. Workaround: download the tfvars file from the tf-state
+      # artifact tf-apply uploaded and grep the password out.
+      - name: download tf-state artifact (for POSTGRES_PASSWORD)
+        uses: actions/download-artifact@v4
+        with:
+          name: tf-state-${{ github.run_id }}
+          path: tf-state-download/
+
+      - name: extract POSTGRES_PASSWORD from tfvars
+        id: pg
+        run: |
+          set -euo pipefail
+          # `|| true`: under `set -e -o pipefail` a grep with no match would
+          # abort the script right here, before the explicit error below
+          # could be printed.
+          PG_PASS=$(grep '^postgres_password' tf-state-download/azure.tfvars | sed 's/^[^"]*"\(.*\)".*/\1/' || true)
+          if [ -z "$PG_PASS" ]; then
+            echo "::error::POSTGRES_PASSWORD not found in tf-state-download/azure.tfvars"
+            exit 1
+          fi
+          echo "::add-mask::$PG_PASS"
+          echo "value=$PG_PASS" >> "$GITHUB_OUTPUT"
+
+      # deploy-osmo-minimal.sh shells out to `terraform output` to read
+      # connection strings (postgres FQDN, redis endpoint, etc.) for the
+      # chart's helm values, even with --skip-terraform. Without these
+      # three things the call fails with "Module not installed":
+      #   1. terraform.tfstate present in the working dir (state)
+      #   2. .terraform.lock.hcl present (pinned provider versions)
+      #   3. `terraform init` to download providers + modules locally
+      - name: stage tfstate + terraform init
+        working-directory: deployments/terraform/azure/example
+        run: |
+          set -euo pipefail
+          echo "::group::tf-state-download contents"
+          ls -la "$GITHUB_WORKSPACE/tf-state-download/"
+          echo "::endgroup::"
+          for f in terraform.tfstate .terraform.lock.hcl; do
+            if [ ! -f "$GITHUB_WORKSPACE/tf-state-download/$f" ]; then
+              echo "::error::$f missing from tf-state artifact — tf-apply upload step lost it"
+              exit 1
+            fi
+            cp "$GITHUB_WORKSPACE/tf-state-download/$f" .
+          done
+          terraform init -input=false -no-color
+
+      # Wire kubectl to the freshly-applied AKS, then pre-create a GHCR
+      # docker-registry secret in every OSMO namespace. The chart's deploy
+      # script (deploy-k8s.sh) skips its own kubectl-create-secret path
+      # when the named secret exists, avoiding the need to leak NGC_API_KEY.
+      - name: wire kubectl + pre-create GHCR pull secret
+        env:
+          AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }}
+          AZURE_CLUSTER_NAME: ${{ vars.AZURE_CLUSTER_NAME || 'osmo-deployment-test' }}
+          GHCR_USERNAME: ${{ github.actor }}
+          GHCR_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          echo "▶ az aks get-credentials"
+          az aks get-credentials \
+            --resource-group "$AZURE_RESOURCE_GROUP" \
+            --name "$AZURE_CLUSTER_NAME" \
+            --overwrite-existing --admin
+          kubectl cluster-info | head -3
+
+          # create --dry-run | apply keeps namespace creation idempotent.
+          echo "▶ ensuring OSMO namespaces exist"
+          for ns in osmo-minimal osmo-operator osmo-workflows; do
+            kubectl create namespace "$ns" --dry-run=client -o yaml | kubectl apply -f -
+          done
+
+          # Chart-generated workflow task pods set `runtimeClassName: nvidia`.
+          # On CPU-only deploys (--no-gpu), without this stub k8s rejects them.
+          echo "▶ applying nvidia RuntimeClass stub (CPU-mode shim)"
+          printf '%s\n' \
+            'apiVersion: node.k8s.io/v1' \
+            'kind: RuntimeClass' \
+            'metadata:' \
+            '  name: nvidia' \
+            'handler: runc' \
+            | kubectl apply -f -
+
+          echo "▶ creating GHCR pull secret '$NGC_SECRET_NAME' in each namespace"
+          for ns in osmo-minimal osmo-operator osmo-workflows; do
+            kubectl create secret docker-registry "$NGC_SECRET_NAME" \
+              --docker-server=ghcr.io \
+              --docker-username="$GHCR_USERNAME" \
+              --docker-password="$GHCR_PASSWORD" \
+              --namespace "$ns" \
+              --dry-run=client -o yaml \
+              | kubectl apply -f -
+          done
+
+      # SKIP_OETF + SKIP_TEARDOWN: later jobs run the tests and the destroy.
+      - name: deploy OSMO (chart install + verify-hello)
+        id: deploy_osmo
+        env:
+          AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+          AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }}
+          AZURE_REGION: ${{ vars.AZURE_REGION || 'eastus2' }}
+          AZURE_CLUSTER_NAME: ${{ vars.AZURE_CLUSTER_NAME || 'osmo-deployment-test' }}
+          POSTGRES_PASSWORD: ${{ steps.pg.outputs.value }}
+          SKIP_OETF: "1"
+          SKIP_TEARDOWN: "1"
+        run: |
+          set -o pipefail
+          echo "::notice::deploy stage starting — chart install + verify-hello, expected ~5–15 min"
+          mkdir -p "$RUN_DIR"
+          bash deployments/scripts/run-deployment-test.sh --provider azure
+          echo "▶ $(date -u +%H:%M:%S) deploy stage done"
+
+      # Best-effort summary (`set +e`): every lookup falls back to "-"
+      # instead of failing the step. Queries the `osmo-minimal` namespace —
+      # no namespace named `osmo` is created anywhere in this workflow.
+      # NOTE(review): assumes the chart release is installed into
+      # `osmo-minimal` — confirm against deploy-osmo-minimal.sh.
+      - name: deploy result summary
+        if: always() && steps.deploy_osmo.conclusion != 'skipped'
+        env:
+          AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }}
+          AZURE_CLUSTER_NAME: ${{ vars.AZURE_CLUSTER_NAME || 'osmo-deployment-test' }}
+        run: |
+          set +e
+          chart_version="$(helm list -n osmo-minimal --output json 2>/dev/null \
+            | python3 -c 'import json,sys; rs=json.load(sys.stdin); print(rs[0].get("chart","-") if rs else "-")' 2>/dev/null || echo "-")"
+          pod_summary="$(kubectl get pods -n osmo-minimal --no-headers 2>/dev/null \
+            | awk '{print $3}' | sort | uniq -c | awk '{printf "%s×%s ", $1, $2}' || echo "-")"
+          icon='✅'; verify_text='passed'
+          if [ "${{ steps.deploy_osmo.outcome }}" != "success" ]; then icon='❌'; verify_text='failed (see step logs)'; fi
+          {
+            echo "### Deploy stage ${icon}"
+            echo ""
+            echo "- chart: \`${chart_version}\`"
+            echo "- image: \`${OSMO_IMAGE_REGISTRY:-?}/*:${OSMO_IMAGE_TAG:-?}\`"
+            echo "- pods: ${pod_summary:-?}"
+            echo "- verify-hello: ${verify_text}"
+            if [ -f "$RUN_DIR/deployment-test-result.json" ]; then
+              # Collapsible section holding the wrapper's raw result JSON.
+              echo ""
+              echo "<details><summary>wrapper result JSON</summary>"
+              echo ""
+              echo '```json'
+              cat "$RUN_DIR/deployment-test-result.json"
+              echo '```'
+              echo "</details>"
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: upload deploy logs
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: deploy-osmo-${{ github.run_id }}
+          path: runs/deployment-test-azure/**
+          retention-days: 14
+          if-no-files-found: warn
+
+  # ── Stage 3: OETF smoke tests ────────────────────────────────────────────
+  # Refreshes kubectl creds against the AKS cluster the deploy job left
+  # running, then invokes the wrapper with SKIP_DEPLOY=1 so only bootstrap
+  # + oetf-smoke stages run. The wrapper sets up its own kubectl
+  # port-forward to osmo-gateway and runs `bazel run //test/oetf:run`.
+  oetf:
+    needs: [build-images, tf-apply, deploy-osmo]
+    if: ${{ needs.deploy-osmo.result == 'success' }}
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    environment: internal-ci
+    env:
+      ARM_USE_OIDC: true
+      ARM_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }}
+      ARM_TENANT_ID: ${{ vars.AZURE_TENANT_ID }}
+      ARM_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+      RUN_DIR: ${{ github.workspace }}/runs/deployment-test-azure
+      OSMO_IMAGE_REGISTRY: ${{ needs.build-images.outputs.image_registry }}
+      OSMO_IMAGE_TAG: ${{ needs.build-images.outputs.image_tag }}
+      # OETF lives at /test/oetf in the public repo; the wrapper's
+      # REPO_ROOT computation assumes external/ submodule wrapping and
+      # overshoots on a standalone checkout, so override explicitly.
+      OETF_REPO_ROOT: ${{ github.workspace }}
+      # OETF tag set. Only remaining hole vs the broad `kind` tag is
+      # router-connectivity (Azure CoreDNS, not OETF). task-runtime-environment
+      # was unblocked by #1128.
+      # 8 tests: smoke api + smoke ws + 2 positive scenarios + 4 negative.
+      OETF_TAGS: api,websocket,logger,task-env,negative
+    permissions:
+      id-token: write
+      contents: read
+      # Needed so this job's GITHUB_TOKEN can pull the private GHCR images
+      # (see "refresh GHCR pull secret" below).
+      packages: read
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: azure login (OIDC)
+        uses: azure/login@v2
+        with:
+          client-id: ${{ vars.AZURE_CLIENT_ID }}
+          tenant-id: ${{ vars.AZURE_TENANT_ID }}
+          subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+
+      # deploy-osmo-minimal.sh has an unconditional `command -v terraform`
+      # preflight check that the wrapper's stage_oetf path also trips
+      # (via stage_bootstrap → reachability check that exits if any
+      # required tool is missing). Install it.
+      - uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: 1.9.8
+
+      # Checksum-verified before install.
+      - name: install kubectl
+        run: |
+          set -euo pipefail
+          KUBECTL_VERSION=v1.31.0
+          curl -fsSLo /tmp/kubectl \
+            "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
+          curl -fsSL "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl.sha256" \
+            | awk '{print $1"  /tmp/kubectl"}' | sha256sum -c -
+          sudo install -m 0755 /tmp/kubectl /usr/local/bin/kubectl
+
+      # bazel is needed for `bazel run //test/oetf:run` inside the wrapper's
+      # oetf-smoke stage. disk-cache key shared with the build-images job so
+      # OETF target builds can hit the cache.
+      - name: Setup Bazel
+        uses: bazel-contrib/setup-bazel@4fd964a13a440a8aeb0be47350db2fc640f19ca8
+        with:
+          bazelisk-cache: true
+          bazelisk-version: 1.27.0
+          disk-cache: ${{ github.workflow }}-images
+          repository-cache: true
+          external-cache: |
+            manifest:
+              osmo_python_deps: src/locked_requirements.txt
+              osmo_tests_python_deps: src/tests/locked_requirements.txt
+              osmo_mypy_deps: bzl/mypy/locked_requirements.txt
+              pylint_python_deps: bzl/linting/locked_requirements.txt
+              io_bazel_rules_go: src/runtime/go.mod
+              bazel_gazelle: src/runtime/go.sum
+
+      - name: refresh kubectl creds for AKS
+        env:
+          AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }}
+          AZURE_CLUSTER_NAME: ${{ vars.AZURE_CLUSTER_NAME || 'osmo-deployment-test' }}
+        run: |
+          set -euo pipefail
+          az aks get-credentials \
+            --resource-group "$AZURE_RESOURCE_GROUP" \
+            --name "$AZURE_CLUSTER_NAME" \
+            --overwrite-existing --admin
+          kubectl cluster-info | head -3
+          kubectl get pods -n osmo-minimal -o wide | head -20
+
+      # The `ghcr-pull` secret in the cluster was created by deploy-osmo
+      # from that job's GITHUB_TOKEN, and a GITHUB_TOKEN stops working once
+      # its job has finished. Any image pull from the (private) GHCR
+      # packages during the OETF run — e.g. a task pod scheduled onto a node
+      # that has not cached the image — would then be rejected. Re-create
+      # the secret with this job's token; create --dry-run | apply makes it
+      # an in-place update.
+      # NOTE(review): namespace list and secret name mirror deploy-osmo's
+      # "pre-create GHCR pull secret" step — keep them in sync.
+      - name: refresh GHCR pull secret
+        env:
+          PULL_SECRET_NAME: ghcr-pull
+          GHCR_USERNAME: ${{ github.actor }}
+          GHCR_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          for ns in osmo-minimal osmo-operator osmo-workflows; do
+            kubectl create secret docker-registry "$PULL_SECRET_NAME" \
+              --docker-server=ghcr.io \
+              --docker-username="$GHCR_USERNAME" \
+              --docker-password="$GHCR_PASSWORD" \
+              --namespace "$ns" \
+              --dry-run=client -o yaml \
+              | kubectl apply -f -
+          done
+
+      # See deploy-osmo for why we re-derive POSTGRES_PASSWORD from the
+      # tf-state artifact instead of consuming a job output.
+      - name: download tf-state artifact (for POSTGRES_PASSWORD)
+        uses: actions/download-artifact@v4
+        with:
+          name: tf-state-${{ github.run_id }}
+          path: tf-state-download/
+
+      - name: extract POSTGRES_PASSWORD from tfvars
+        id: pg
+        run: |
+          set -euo pipefail
+          # `|| true`: under `set -e -o pipefail` a grep with no match would
+          # abort the script right here, before the explicit error below
+          # could be printed.
+          PG_PASS=$(grep '^postgres_password' tf-state-download/azure.tfvars | sed 's/^[^"]*"\(.*\)".*/\1/' || true)
+          if [ -z "$PG_PASS" ]; then
+            echo "::error::POSTGRES_PASSWORD not found in tf-state-download/azure.tfvars"
+            exit 1
+          fi
+          echo "::add-mask::$PG_PASS"
+          echo "value=$PG_PASS" >> "$GITHUB_OUTPUT"
+
+      # The wrapper's stage_oetf_smoke applies a profile-pool=default
+      # workaround for #1114's `pool=` vs `pools=` query-param mismatch,
+      # but it only runs that workaround when `command -v osmo` finds
+      # the CLI. In the old monolithic job the deploy stage installed
+      # osmo into ~/.local/bin earlier in the same runner; in the split,
+      # this is a fresh runner — osmo isn't there. Without the
+      # workaround, smoke:api-checks fails with "No pool selected!".
+      # Install osmo CLI here (idempotent; common.sh's installer downloads
+      # the latest GA release from github.com/NVIDIA/OSMO/releases).
+      - name: install osmo CLI (for profile-pool workaround)
+        run: |
+          set -euo pipefail
+          source deployments/scripts/common.sh
+          install_osmo_cli_if_missing
+          # Make ~/.local/bin visible to the following steps.
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      # SKIP_DEPLOY + SKIP_TEARDOWN: only the test stage runs in this job.
+      - name: run OETF smoke tests
+        id: run_oetf
+        env:
+          AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+          AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }}
+          AZURE_REGION: ${{ vars.AZURE_REGION || 'eastus2' }}
+          AZURE_CLUSTER_NAME: ${{ vars.AZURE_CLUSTER_NAME || 'osmo-deployment-test' }}
+          POSTGRES_PASSWORD: ${{ steps.pg.outputs.value }}
+          SKIP_DEPLOY: "1"
+          SKIP_TEARDOWN: "1"
+        run: |
+          set -o pipefail
+          echo "::notice::OETF stage starting — bazel run //test/oetf:run with tags=$OETF_TAGS"
+          mkdir -p "$RUN_DIR"
+          bash deployments/scripts/run-deployment-test.sh --provider azure
+          echo "▶ $(date -u +%H:%M:%S) OETF stage done"
+
+      # Renders $RUN_DIR/oetf-result.json as a markdown table in the step
+      # summary. Best-effort (`set +e`): a missing result file produces a
+      # warning section instead of a failure.
+      - name: OETF result summary
+        if: always() && steps.run_oetf.conclusion != 'skipped'
+        env:
+          RUN_DIR: ${{ github.workspace }}/runs/deployment-test-azure
+        run: |
+          set +e
+          oetf_json="$RUN_DIR/oetf-result.json"
+          if [ ! -f "$oetf_json" ]; then
+            { echo "### OETF stage ⚠️"; echo ""; echo "_no result JSON found at \`$oetf_json\` — wrapper likely died before OETF ran_"; } >> "$GITHUB_STEP_SUMMARY"
+            exit 0
+          fi
+          python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY"
+          import json, os, pathlib
+          data = json.loads(pathlib.Path(os.environ["RUN_DIR"], "oetf-result.json").read_text())
+          total = data.get("total", 0)
+          passed = data.get("passed", 0)
+          failed = data.get("failed", 0)
+          errored = data.get("errored", 0)
+          skipped = data.get("skipped", 0)
+          status_icon = "✅" if (failed == 0 and errored == 0) else "❌"
+          row_icon = {"pass": "✅", "fail": "❌", "error": "⚠️", "skip": "⏭️"}
+          print(f"### OETF stage {status_icon}")
+          print()
+          print(f"- tags: `{data.get('tags','-')}`")
+          print(f"- url: `{data.get('url','-')}`")
+          print(f"- totals: ✅ {passed} passed · ❌ {failed} failed · ⚠️ {errored} errored · ⏭️ {skipped} skipped (of {total})")
+          print()
+          print("| | Target | Time | Message |")
+          print("|---|---|---:|---|")
+          for r in data.get("results", []):
+              # Single-line, length-capped, pipe-escaped so the message
+              # cannot break the markdown table.
+              msg = (r.get("message") or "").strip().replace("\n", " ")
+              if len(msg) > 200:
+                  msg = msg[:200] + "…"
+              msg = msg.replace("|", "\\|")
+              print(f"| {row_icon.get(r.get('status'),'?')} | `{r.get('target','?')}` | {r.get('time',0):.1f}s | {msg} |")
+          PY
+
+      - name: upload OETF logs
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: oetf-${{ github.run_id }}
+          path: runs/deployment-test-azure/**
+          retention-days: 14
+          if-no-files-found: warn
+
+  # ── Stage 4: terraform destroy + cluster diagnostics ─────────────────────
+  # Always runs as long as tf-apply succeeded — we don't want to leak AKS
+  # + Postgres + Redis after a verification run. Downloads the tfstate
+  # artifact tf-apply uploaded, captures a final cluster snapshot before
+  # destroy, then tears everything down.
# Stage 4 job entry (lives under the workflow's top-level `jobs:` map).
  tf-destroy:
    # Listing every upstream job makes this job wait for all of them.
    # `always()` lets it run even when one of them failed or was skipped;
    # the only hard requirement is that tf-apply itself succeeded.
    needs: [build-images, tf-apply, deploy-osmo, oetf]
    if: ${{ always() && needs.tf-apply.result == 'success' }}
    runs-on: ubuntu-latest
    timeout-minutes: 30
    environment: internal-ci
    env:
      ARM_USE_OIDC: true
      ARM_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }}
      ARM_TENANT_ID: ${{ vars.AZURE_TENANT_ID }}
      ARM_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }}
      # Diagnostics are written under this directory and uploaded by the
      # final step of the job.
      RUN_DIR: ${{ github.workspace }}/runs/deployment-test-azure
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v4

      - name: azure login (OIDC)
        uses: azure/login@v2
        with:
          client-id: ${{ vars.AZURE_CLIENT_ID }}
          tenant-id: ${{ vars.AZURE_TENANT_ID }}
          subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }}

      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: 1.9.8

      # kubectl and helm are only used by the diagnostics step below.
      # Both downloads are checksum-verified before being installed:
      # kubectl against the published .sha256 file, helm against the
      # digest pinned in HELM_SHA256.
      - name: install kubectl + helm
        run: |
          set -euo pipefail
          KUBECTL_VERSION=v1.31.0
          curl -fsSLo /tmp/kubectl \
            "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
          curl -fsSL "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl.sha256" \
            | awk '{print $1" /tmp/kubectl"}' | sha256sum -c -
          sudo install -m 0755 /tmp/kubectl /usr/local/bin/kubectl

          HELM_VERSION=v3.16.2
          HELM_SHA256=9318379b847e333460d33d291d4c088156299a26cd93d570a7f5d0c36e50b5bb
          curl -fsSLo /tmp/helm.tgz "https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz"
          echo "${HELM_SHA256} /tmp/helm.tgz" | sha256sum -c -
          tar -xzf /tmp/helm.tgz -C /tmp linux-amd64/helm
          sudo install -m 0755 /tmp/linux-amd64/helm /usr/local/bin/helm

      - name: download tf-state artifact
        uses: actions/download-artifact@v4
        with:
          name: tf-state-${{ github.run_id }}
          path: tf-state-download/

      # Every copy is best-effort (`2>/dev/null || true`): a file missing
      # from the artifact does not fail this step. The trailing `ls` only
      # records in the log which of the two key files actually landed.
      # NOTE(review): if terraform.tfstate is absent, the destroy step below
      # runs against an empty state — confirm that is the intended fallback.
      - name: stage tfstate + tfvars for destroy
        run: |
          set -euo pipefail
          cp tf-state-download/terraform.tfstate deployments/terraform/azure/example/ 2>/dev/null || true
          cp tf-state-download/.terraform.lock.hcl deployments/terraform/azure/example/ 2>/dev/null || true
          cp tf-state-download/azure.tfvars "$RUNNER_TEMP/azure.tfvars" 2>/dev/null || true
          ls -la deployments/terraform/azure/example/terraform.tfstate "$RUNNER_TEMP/azure.tfvars" || true

      # Capture a snapshot of cluster + OSMO state BEFORE terraform destroys
      # everything. Self-contained: re-mints kubectl context up front in
      # case anything along the way mangled the kubeconfig.
      #
      # The script runs with `set +e` and always ends in `exit 0`, so no
      # individual kubectl/helm failure can fail the step. If the cluster is
      # unreachable it emits a warning and exits early, before the step
      # summary is written.
      - name: dump cluster + OSMO diagnostics (always)
        if: always()
        timeout-minutes: 5
        env:
          AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }}
          AZURE_CLUSTER_NAME: ${{ vars.AZURE_CLUSTER_NAME || 'osmo-deployment-test' }}
        run: |
          set +e
          DIAG="$RUN_DIR/diagnostics"
          mkdir -p "$DIAG"

          echo "▶ refreshing kubectl context"
          az aks get-credentials \
            --resource-group "$AZURE_RESOURCE_GROUP" \
            --name "$AZURE_CLUSTER_NAME" \
            --overwrite-existing --admin > "$DIAG/az_creds.log" 2>&1 || true
          kubectl cluster-info > "$DIAG/cluster-info.txt" 2>&1 || \
            { echo "::warning::kubectl can't reach the cluster — skipping in-cluster diagnostics"; exit 0; }

          echo "::group::pods (all namespaces)"
          kubectl get pods -A -o wide | tee "$DIAG/pods.txt"
          echo "::endgroup::"

          echo "::group::events (last 200, sorted by lastTimestamp)"
          kubectl get events -A --sort-by='.lastTimestamp' 2>/dev/null | tail -200 | tee "$DIAG/events.txt"
          echo "::endgroup::"

          echo "::group::non-Running pods + describe"
          kubectl get pods -A --field-selector=status.phase!=Running -o wide | tee "$DIAG/non-running.txt"
          kubectl get pods -A --field-selector=status.phase!=Running \
            -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' \
            | while read -r ns pod; do
                [[ -z "$ns" || -z "$pod" ]] && continue
                kubectl describe pod "$pod" -n "$ns" > "$DIAG/describe-${ns}-${pod}.txt" 2>&1
                kubectl logs "$pod" -n "$ns" --all-containers --tail=200 --prefix \
                  > "$DIAG/logs-${ns}-${pod}.log" 2>&1
              done
          echo "::endgroup::"

          echo "::group::image refs on running pods"
          kubectl get pods -A -o jsonpath='{range .items[*]}{.metadata.namespace}/{.metadata.name}{"\t"}{range .spec.containers[*]}{.image}{","}{end}{"\n"}{end}' \
            | sort | tee "$DIAG/image-refs.txt"
          echo "::endgroup::"

          echo "::group::OSMO pod logs (tail 500)"
          for ns in osmo-minimal osmo-operator osmo-workflows; do
            kubectl get pods -n "$ns" --no-headers -o custom-columns=NAME:.metadata.name 2>/dev/null \
              | while read -r pod; do
                  [[ -z "$pod" ]] && continue
                  kubectl logs "$pod" -n "$ns" --tail=500 --all-containers --prefix --timestamps \
                    > "$DIAG/podlog-${ns}-${pod}.log" 2>&1
                done
          done
          echo "::endgroup::"

          echo "::group::helm releases + values"
          helm list -A -o yaml > "$DIAG/helm-releases.yaml" 2>&1
          while IFS='|' read -r r ns; do
            [[ -z "$r" ]] && continue
            helm status "$r" -n "$ns" > "$DIAG/helm-status-${r}.txt" 2>&1
            helm get values "$r" -n "$ns" > "$DIAG/helm-values-${r}.yaml" 2>&1
          done < <(helm list -A -o json 2>/dev/null | jq -r '.[] | "\(.name)|\(.namespace)"')
          echo "::endgroup::"

          {
            echo "### Cluster diagnostic snapshot"
            echo ""
            echo "Captured under \`$DIAG\` (uploaded as part of the \`tf-destroy-${GITHUB_RUN_ID}\` artifact)."
            echo ""
            echo "#### Pods not Running"
            if [ -s "$DIAG/non-running.txt" ] && [ "$(wc -l < "$DIAG/non-running.txt")" -gt 1 ]; then
              echo '```'
              head -20 "$DIAG/non-running.txt"
              echo '```'
            else
              echo "_(all pods Running)_"
            fi
            echo ""
            echo "#### Image refs (first 30)"
            echo '```'
            head -30 "$DIAG/image-refs.txt"
            echo '```'
            echo ""
            echo "#### Last 30 cluster events"
            echo '```'
            tail -30 "$DIAG/events.txt"
            echo '```'
          } >> "$GITHUB_STEP_SUMMARY"

          # Never fail — diagnostics are best-effort, must not block teardown.
          exit 0

      # A failing `terraform destroy` is downgraded to a ::warning:: by the
      # `|| echo`, so the step (and therefore the job) stays green. What is
      # left behind is surfaced through the REMAINING resource count in the
      # step summary instead. `terraform init` is NOT guarded: if it fails,
      # `set -e` aborts the step before destroy is attempted.
      - name: TEMP — terraform destroy
        if: always()
        working-directory: deployments/terraform/azure/example
        env:
          AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }}
        run: |
          set -euo pipefail
          echo "::notice::terraform destroy starting — expected ~10–15 min"

          echo "::group::terraform init (refresh provider)"
          terraform init -input=false -no-color
          echo "::endgroup::"

          echo "::group::terraform destroy (streaming)"
          terraform destroy -input=false -auto-approve -no-color \
            -var-file="$RUNNER_TEMP/azure.tfvars" \
            || echo "::warning::terraform destroy failed — orphan resources in $AZURE_RESOURCE_GROUP may remain"
          echo "::endgroup::"

          REMAINING=$(az resource list --resource-group "$AZURE_RESOURCE_GROUP" --query 'length(@)' -o tsv || echo "?")
          echo " $REMAINING resource(s) still in $AZURE_RESOURCE_GROUP"

          icon='✅'
          [ "$REMAINING" != "0" ] && icon='⚠️'
          {
            echo "### Destroy stage ${icon}"
            echo ""
            echo "- resources remaining in \`${AZURE_RESOURCE_GROUP}\`: ${REMAINING}"
            echo "- finished at: $(date -u +%H:%M:%SZ)"
            if [ "$REMAINING" != "0" ]; then
              echo ""
              echo "Next run's pre-apply cleanup step will wipe these."
            fi
          } >> "$GITHUB_STEP_SUMMARY"

      # Uploads everything under RUN_DIR (the diagnostics/ tree written
      # above); an empty directory only produces a warning.
      - name: upload destroy logs + diagnostics
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: tf-destroy-${{ github.run_id }}
          path: runs/deployment-test-azure/**
          retention-days: 14
          if-no-files-found: warn


  # ── Slack failure-notification (schedule-only) ───────────────────────────
  #
  # Channel comes from `vars.CI_SLACK_CHANNEL` (fallback `osmo-slack-test`)
  # and the auth comes from `OSMO_SLACK_BOT_TOKEN` — same `chat.postMessage`
  # plumbing testbot.yaml + update-distroless-images.yaml use.
#
# ─────────────────────────────────────────────────────────────────────────

  notify-slack-on-azure-deployment-test-failure:
    needs: [build-images, tf-apply, deploy-osmo, oetf, tf-destroy]
    # always() so this evaluates even when an upstream `needs:` failed.
    # Fires only on scheduled-run failures — PR-label and workflow_dispatch
    # runs surface their own status interactively.
    #
    # Written as a bare expression in a `>-` (strip) folded scalar on
    # purpose. `if: >` followed by `${{ … }}` keeps a trailing newline, so
    # GitHub sees "<expr>\n" — a template string, not an expression — and
    # the resulting non-empty string is always truthy. That made the job
    # run (and post "FAILED") for every event. All continuation lines sit
    # at the same indent so the scalar folds to a single line.
    if: >-
      always()
      && github.event_name == 'schedule'
      && (needs.build-images.result == 'failure'
      || needs.tf-apply.result == 'failure'
      || needs.deploy-osmo.result == 'failure'
      || needs.oetf.result == 'failure'
      || needs.tf-destroy.result == 'failure')
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      # Collects everything the Slack message links to. Outputs:
      #   author, subject, short_sha  — HEAD commit metadata
      #   compare_url, compare_label  — range since the last green scheduled run
      #   artifact_url, artifact_label — deep link to this run's log artifact
      # Every lookup is best-effort; a failed API call degrades the message
      # but must not prevent it from being sent.
      - name: Gather context (commit metadata + commits since previous green run)
        id: ctx
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          SHA: ${{ github.sha }}
          SERVER_URL: ${{ github.server_url }}
          RUN_ID: ${{ github.run_id }}
        run: |
          # GitHub's default shell is `bash -e {0}`. Turn errexit off
          # explicitly: a failed curl/jq below should leave a field empty,
          # not abort the step and silently skip the Slack post.
          set +e
          set -uo pipefail

          # 1) HEAD commit metadata — author display name + first-line subject.
          #    Daily cron runs land on whatever's on main at fire time. Embed
          #    both so the on-call doesn't have to click through to identify
          #    whose change is suspect.
          commit_resp=$(curl -sS -H "Authorization: Bearer $GH_TOKEN" \
            -H 'Accept: application/vnd.github+json' \
            "https://api.github.com/repos/${REPO}/commits/${SHA}")
          author=$(jq -r '.commit.author.name // "unknown"' <<<"$commit_resp")
          author="${author:-unknown}"
          subject=$(jq -r '.commit.message // ""' <<<"$commit_resp" | head -1)
          # Trim subject to ≤ 120 chars so the Slack block doesn't sprawl.
          if [[ ${#subject} -gt 120 ]]; then subject="${subject:0:117}..."; fi

          # 2) Find the most recent successful scheduled run BEFORE this
          #    one, then build a compare link spanning every commit that
          #    landed since. Daily cron on a busy repo can easily span 10+
          #    commits — a single "current SHA" link is misleading.
          #    Fall back to a plain "recent commits on main" view when this
          #    is the first scheduled run (no prior green to compare against).
          wf_runs=$(curl -sS -H "Authorization: Bearer $GH_TOKEN" \
            -H 'Accept: application/vnd.github+json' \
            "https://api.github.com/repos/${REPO}/actions/workflows/deployment-test.yaml/runs?event=schedule&status=success&per_page=2")
          prev_sha=$(jq -r --arg this "$RUN_ID" \
            '[.workflow_runs[]? | select((.id | tostring) != $this)] | .[0].head_sha // empty' \
            <<<"$wf_runs")
          if [[ -n "$prev_sha" && "$prev_sha" != "$SHA" ]]; then
            compare_url="${SERVER_URL}/${REPO}/compare/${prev_sha}...${SHA}"
            # Count commits in the range (best-effort).
            compare_resp=$(curl -sS -H "Authorization: Bearer $GH_TOKEN" \
              -H 'Accept: application/vnd.github+json' \
              "https://api.github.com/repos/${REPO}/compare/${prev_sha}...${SHA}")
            commit_count=$(jq -r '.total_commits // 0' <<<"$compare_resp")
            compare_label="${commit_count:-?} commits since last green run"
          else
            compare_url="${SERVER_URL}/${REPO}/commits/${GITHUB_REF_NAME:-main}"
            compare_label="Recent commits on main"
          fi

          # 3) Resolve the artifact ID for THIS run so the Slack button
          #    deep-links directly to the artifact's download page. GitHub
          #    has no `#artifacts` anchor on the run page — links with that
          #    fragment land at the top of the page with no scroll. The
          #    working URL shape is:
          #      https://github.com/<owner>/<repo>/actions/runs/<run_id>/artifacts/<artifact_id>
          #    which renders the artifact's download flow directly.
          #
          #    The split pipeline uploads several artifacts per run. Prefer
          #    the ones that hold logs — `tf-destroy-<run_id>` (cluster
          #    diagnostics), then `oetf-<run_id>` — over whatever the API
          #    happens to list first, which can be the `tf-state-<run_id>`
          #    hand-off bundle. Fall back to any non-expired artifact, then
          #    to the run page when none is found (e.g. job aborted before
          #    the always() upload step ran).
          artifacts_resp=$(curl -sS -H "Authorization: Bearer $GH_TOKEN" \
            -H 'Accept: application/vnd.github+json' \
            "https://api.github.com/repos/${REPO}/actions/runs/${RUN_ID}/artifacts?per_page=10")
          artifact_json=$(jq -c --arg run "$RUN_ID" '
            [.artifacts[]? | select(.expired == false)] as $live
            | ( ([$live[] | select(.name == "tf-destroy-\($run)")] | .[0])
                // ([$live[] | select(.name == "oetf-\($run)")] | .[0])
                // $live[0]
                // empty )' <<<"$artifacts_resp")
          artifact_id=$(jq -r '.id // empty' <<<"${artifact_json:-null}")
          artifact_name=$(jq -r '.name // empty' <<<"${artifact_json:-null}")
          if [[ -n "$artifact_id" ]]; then
            artifact_url="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}/artifacts/${artifact_id}"
            artifact_label="Download ${artifact_name}"
          else
            artifact_url="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
            artifact_label="(no artifact yet — open run page)"
          fi

          # 4) Persist outputs (escape multi-line values).
          {
            echo "author<<__GHA_EOF__"; echo "$author"; echo "__GHA_EOF__"
            echo "subject<<__GHA_EOF__"; echo "$subject"; echo "__GHA_EOF__"
            echo "short_sha=${SHA:0:7}"
            echo "compare_url=$compare_url"
            echo "compare_label=$compare_label"
            echo "artifact_url=$artifact_url"
            echo "artifact_label=$artifact_label"
          } >> "$GITHUB_OUTPUT"

      # Builds a Block Kit payload with jq and posts it via chat.postMessage.
      # Every failure path (missing token, transport error, ok != true) logs
      # a ::warning:: and exits 0 so Slack trouble never changes run status.
      - name: Post failure notification to Slack
        env:
          OSMO_SLACK_BOT_TOKEN: ${{ secrets.OSMO_SLACK_BOT_TOKEN }}
          # `vars.CI_SLACK_CHANNEL` lets the channel be overridden at the
          # repo/org level without editing this file. Default `osmo-slack-test`
          # while the gate proves itself; flip to e.g. #osmo-oncall once it's
          # trusted. Note: the org-level `vars.TESTBOT_SLACK_CHANNEL` is NOT
          # what we want here — it points at #osmo-code-reviews (testbot's
          # PR-review channel), which is the wrong audience for deploy-gate
          # failures.
          SLACK_CHANNEL: ${{ vars.CI_SLACK_CHANNEL || 'osmo-slack-test' }}
          BI_RESULT: ${{ needs.build-images.result }}
          APPLY_RESULT: ${{ needs.tf-apply.result }}
          DEPLOY_RESULT: ${{ needs.deploy-osmo.result }}
          OETF_RESULT: ${{ needs.oetf.result }}
          DESTROY_RESULT: ${{ needs.tf-destroy.result }}
          REPO: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
          RUN_ATTEMPT: ${{ github.run_attempt }}
          AUTHOR: ${{ steps.ctx.outputs.author }}
          SUBJECT: ${{ steps.ctx.outputs.subject }}
          SHORT_SHA: ${{ steps.ctx.outputs.short_sha }}
          FULL_SHA: ${{ github.sha }}
          SERVER_URL: ${{ github.server_url }}
          REF_NAME: ${{ github.ref_name }}
          WORKFLOW: ${{ github.workflow }}
          COMPARE_URL: ${{ steps.ctx.outputs.compare_url }}
          COMPARE_LABEL: ${{ steps.ctx.outputs.compare_label }}
          ARTIFACT_URL: ${{ steps.ctx.outputs.artifact_url }}
          ARTIFACT_LABEL: ${{ steps.ctx.outputs.artifact_label }}
          EVENT: ${{ github.event_name }}
        run: |
          # errexit off (the default shell is `bash -e`): an unparsable
          # Slack response must end in a warning, not a failed step.
          set +e
          set -uo pipefail
          if [[ -z "${OSMO_SLACK_BOT_TOKEN:-}" ]]; then
            echo "::warning::OSMO_SLACK_BOT_TOKEN secret not set — skipping Slack notification."
            exit 0
          fi

          run_url="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
          if [[ -n "${RUN_ATTEMPT:-}" && "${RUN_ATTEMPT}" != "1" ]]; then
            run_url="${run_url}/attempts/${RUN_ATTEMPT}"
          fi
          commit_url="${SERVER_URL}/${REPO}/commit/${FULL_SHA}"
          workflow_url="${SERVER_URL}/${REPO}/blob/${REF_NAME}/.github/workflows/deployment-test.yaml"
          # artifact_url comes from the "Gather context" step which already
          # resolved the per-run artifact ID via the GH API. Falls back to
          # the run page when no artifact exists (job died before upload).
          artifact_url="${ARTIFACT_URL}"
          artifact_label="${ARTIFACT_LABEL}"
          header_text=":x: OSMO Azure deployment-test FAILED"
          trigger_label="Daily schedule (00:00 UTC = 5pm PDT)"

          payload=$(jq -n \
            --arg channel "$SLACK_CHANNEL" \
            --arg header_text "$header_text" \
            --arg trigger_label "$trigger_label" \
            --arg branch "$REF_NAME" \
            --arg short_sha "$SHORT_SHA" \
            --arg author "$AUTHOR" \
            --arg subject "$SUBJECT" \
            --arg bi "$BI_RESULT" \
            --arg apply "$APPLY_RESULT" \
            --arg deploy "$DEPLOY_RESULT" \
            --arg oetf "$OETF_RESULT" \
            --arg destroy "$DESTROY_RESULT" \
            --arg workflow "$WORKFLOW" \
            --arg run_url "$run_url" \
            --arg commit_url "$commit_url" \
            --arg workflow_url "$workflow_url" \
            --arg artifact_url "$artifact_url" \
            --arg artifact_label "$artifact_label" \
            --arg compare_url "$COMPARE_URL" \
            --arg compare_label "$COMPARE_LABEL" \
            --arg run_id "$RUN_ID" \
            '{
              channel: $channel,
              text: "\($header_text) — \($workflow) run #\($run_id) (branch \($branch))",
              blocks: [
                { type: "header",
                  text: { type: "plain_text", text: $header_text } },
                { type: "section",
                  fields: [
                    { type: "mrkdwn", text: "*build-images*\n`\($bi)`" },
                    { type: "mrkdwn", text: "*tf-apply*\n`\($apply)`" },
                    { type: "mrkdwn", text: "*deploy-osmo*\n`\($deploy)`" },
                    { type: "mrkdwn", text: "*oetf*\n`\($oetf)`" },
                    { type: "mrkdwn", text: "*tf-destroy*\n`\($destroy)`" },
                    { type: "mrkdwn", text: "*Trigger*\n\($trigger_label)" }
                  ] },
                { type: "section",
                  text: { type: "mrkdwn",
                          text: "*Branch:* `\($branch)` • *Tested commit:* <\($commit_url)|`\($short_sha)`> by *\($author)*\n>\($subject)" } },
                { type: "context",
                  elements: [
                    { type: "mrkdwn",
                      text: "Daily cron can span many commits since the last green run. Use the *\($compare_label)* button to see everything that landed in between — narrowing blame from a single SHA to the actual contributing change is usually faster from the compare view." }
                  ] },
                { type: "actions",
                  elements: [
                    { type: "button",
                      text: { type: "plain_text", text: "View run + logs" },
                      url: $run_url,
                      style: "danger" },
                    { type: "button",
                      text: { type: "plain_text", text: $artifact_label },
                      url: $artifact_url },
                    { type: "button",
                      text: { type: "plain_text", text: $compare_label },
                      url: $compare_url },
                    { type: "button",
                      text: { type: "plain_text", text: "Workflow file" },
                      url: $workflow_url }
                  ] },
                { type: "context",
                  elements: [
                    { type: "mrkdwn",
                      text: ":bulb: First-look investigation: open *Download artifacts* → unzip → check `deployment-test-result.json` (which wrapper stage failed) and `diagnostics/` (cluster state at teardown)." }
                  ] }
              ]
            }')

          echo "::group::Slack payload (preview)"
          echo "$payload" | jq -C . | head -80
          echo "::endgroup::"

          # Same `chat.postMessage` call pattern that
          # update-distroless-images.yaml uses (lines 210–224). Stay resilient:
          # we never want a Slack outage to turn a passed deploy into a
          # failed run, so log + continue rather than fail.
          if ! response=$(
            curl -fsSL \
              -H "Authorization: Bearer $OSMO_SLACK_BOT_TOKEN" \
              -H 'Content-Type: application/json; charset=utf-8' \
              -d "$payload" \
              https://slack.com/api/chat.postMessage
          ); then
            echo "::warning::Slack POST failed (network/transport) — message not delivered."
            exit 0
          fi
          ok=$(jq -r '.ok' <<<"$response")
          if [[ "$ok" != "true" ]]; then
            echo "::warning::Slack chat.postMessage returned ok=$ok — message not delivered."
            echo " Full response: $response"
            exit 0
          fi
          ts=$(jq -r '.ts // ""' <<<"$response")
          ch=$(jq -r '.channel // ""' <<<"$response")
          echo "::notice::Slack notification posted to channel $ch (ts=$ts)."
diff --git a/ci/deployment-test/azure-overrides.yaml b/ci/deployment-test/azure-overrides.yaml new file mode 100644 index 000000000..d12d586c4 --- /dev/null +++ b/ci/deployment-test/azure-overrides.yaml @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Helm values overlay layered on top of charts/service/values.yaml by the +# deployment-test wrapper's Azure path (run-deployment-test.sh: azure args). +# Layered via deploy-osmo-minimal.sh --helm-values. +# +# Why this exists: the chart's default `osmo-ctrl` sidecar requests 1 vCPU +# at scheduling time. OSMO's resource validator subtracts that from each +# node's allocatable to compute the K8_CPU placeholder used in +# `USER_CPU LE K8_CPU` strict-LE rules. On a 3-node Std_D4s_v3 cluster +# (allocatable ~3 vCPU/node) after Azure system daemons + OSMO services, +# K8_CPU drops below 1.0 and every cpu=1 task is rejected. +# +# We can't do this with --helm-set because helm REPLACES list elements +# wholesale rather than merging; `--set …containers[0].resources.requests +# .cpu=100m` would wipe the container's `name` and the rest of `resources`. +# Layering a full values file keeps the merge clean. + +services: + configs: + podTemplates: + default_ctrl: + spec: + containers: + - name: osmo-ctrl + resources: + limits: + cpu: "{{USER_CPU}}" + memory: "{{USER_MEMORY}}" + ephemeral-storage: "{{USER_STORAGE}}" + requests: + # Reduced from chart default of "1" to 100m. The chart's + # limit still tracks USER_CPU so the task gets its full + # CPU budget at runtime; only the scheduler-side reservation + # shrinks. See run-deployment-test.sh stage_deploy() azure + # branch for the full rationale. 
+ cpu: "100m" + memory: "1Gi" + ephemeral-storage: "1Gi" diff --git a/deployments/scripts/run-deployment-test.sh b/deployments/scripts/run-deployment-test.sh new file mode 100755 index 000000000..858d4597f --- /dev/null +++ b/deployments/scripts/run-deployment-test.sh @@ -0,0 +1,700 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +############################################################################### +# OSMO Deployment-Script Test Gate (D4) +# +# End-to-end test wrapper that exercises deploy-osmo-minimal.sh, verify.sh, +# and the per-provider helper scripts on a real ephemeral cluster. Designed +# to run from a GitLab CI nightly schedule, a release-cut manual trigger, or +# a future Kargo verification stage --- the interface (flags + env vars + +# categorized exit code) is the stable contract. +# +# Invariants (see plan §D4.1): +# 1. Stateless CLI: only --provider / --chart-version / --image-tag. +# Note: --chart-version and --image-tag are accepted by THIS wrapper but +# passed through to deploy-osmo-minimal.sh as OSMO_CHART_VERSION / +# OSMO_IMAGE_TAG env vars (deploy-k8s.sh:59-60), not as CLI flags. +# 2. Self-contained: ephemeral cluster + DB + Redis, torn down on EXIT. +# 3. Identity-agnostic: no cloud creds, Vault, or Kargo tokens needed. +# 4. 
Reproducible: no $RANDOM, no wall-clock dependencies in test logic. +# 5. Bounded: 45-min hard timeout; every kubectl wait has --timeout. +# 6. Structured output: JSON result + per-stage logs in $RUN_DIR. +# 7. Idempotent teardown: --destroy + kind delete + docker prune. +# 8. Categorized exit codes: +# 0 = pass +# 1 = cluster-bootstrap failure +# 2 = deploy-script OR verify failure (verify.sh runs inside +# deploy-osmo-minimal.sh; we let the deploy script own its +# port-forward-watchdog → verify.sh sequencing rather than +# splitting them across stages) +# 4 = OETF smoke failure +# 5 = teardown failure +# +# Usage: +# run-deployment-test.sh [--provider byo-kind|microk8s] +# [--chart-version VERSION] +# [--image-tag TAG] +# +# Env vars (read but never required): +# PROVIDER, OSMO_CHART_VERSION, OSMO_IMAGE_TAG, RUN_DIR +# +# OSMO_DEPLOY_DEMO is FORBIDDEN in CI: this script will abort if set. +############################################################################### + +set -euo pipefail + +# ── CI guardrail: demo mode must never be active in the test gate ──────────── +# Demo mode (D1) tolerates verify-script failures. Letting that opt-out leak +# into the nightly gate would silently hide exactly the regressions D4 exists +# to catch. Fail fast. +if [[ -n "${OSMO_DEPLOY_DEMO:-}" ]]; then + echo "FATAL: OSMO_DEPLOY_DEMO is set; forbidden in the deployment-test gate." >&2 + exit 2 +fi + +# ── Defaults / CLI parsing ─────────────────────────────────────────────────── +PROVIDER="${PROVIDER:-byo-kind}" +CHART_VERSION="${OSMO_CHART_VERSION:-}" +IMAGE_TAG="${OSMO_IMAGE_TAG:-}" + +# Azure provider params (read from env or set via CLI; required when --provider azure). 
+AZURE_SUBSCRIPTION_ID="${AZURE_SUBSCRIPTION_ID:-}" +AZURE_RESOURCE_GROUP="${AZURE_RESOURCE_GROUP:-}" +AZURE_REGION="${AZURE_REGION:-eastus2}" +AZURE_CLUSTER_NAME="${AZURE_CLUSTER_NAME:-}" +ENVIRONMENT="${ENVIRONMENT:-dev}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-}" +STORAGE_BACKEND="${STORAGE_BACKEND:-}" + +# Where //test_infra/oetf lives. In the OUTER osmo repo it is a sibling of +# external/ (NOT inside it). When this script is invoked from an external/ +# worktree (e.g. /tmp/osmo-d4-azure), $REPO_ROOT resolves to /tmp/ and OETF +# is unreachable. Setting OETF_REPO_ROOT lets the caller point at the outer +# checkout (e.g. /home/jiaenr/osmo) without changing the run-from-external +# convention. +OETF_REPO_ROOT="${OETF_REPO_ROOT:-}" + +# Operational knobs (env-only, never required): +# SKIP_DEPLOY=1 → skip stage_deploy (chart install + verify-hello). +# Bootstrap still runs (kubectl creds, reachability). +# Used by the CI gate to split deploy and OETF into +# separate, individually-summarised GHA steps. 
+# SKIP_OETF=1 → skip stage_oetf_smoke entirely (returns 0) +# SKIP_TEARDOWN=1 → skip the deploy --destroy + KIND delete in cleanup() +# (use when --provider azure / aws and you want to keep +# the cloud infra alive for inspection) +SKIP_DEPLOY="${SKIP_DEPLOY:-0}" +SKIP_OETF="${SKIP_OETF:-0}" +SKIP_TEARDOWN="${SKIP_TEARDOWN:-0}" + +while [[ $# -gt 0 ]]; do + case "$1" in + --provider) PROVIDER="$2"; shift 2 ;; + --chart-version) CHART_VERSION="$2"; shift 2 ;; + --image-tag) IMAGE_TAG="$2"; shift 2 ;; + # Azure pass-through + --subscription-id) AZURE_SUBSCRIPTION_ID="$2"; shift 2 ;; + --resource-group) AZURE_RESOURCE_GROUP="$2"; shift 2 ;; + --region) AZURE_REGION="$2"; shift 2 ;; + --cluster-name) AZURE_CLUSTER_NAME="$2"; shift 2 ;; + --environment) ENVIRONMENT="$2"; shift 2 ;; + --postgres-password) POSTGRES_PASSWORD="$2"; shift 2 ;; + --storage-backend) STORAGE_BACKEND="$2"; shift 2 ;; + --oetf-repo-root) OETF_REPO_ROOT="$2"; shift 2 ;; + --skip-deploy) SKIP_DEPLOY=1; shift ;; + --skip-oetf) SKIP_OETF=1; shift ;; + --skip-teardown) SKIP_TEARDOWN=1; shift ;; + -h|--help) + grep '^#' "$0" | sed 's/^# \{0,1\}//' + exit 0 ;; + *) + echo "FATAL: unknown argument: $1" >&2 + exit 2 ;; + esac +done + +# ── Path setup ─────────────────────────────────────────────────────────────── +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# external/deployments/scripts/ → external/deployments/ → external/ → repo root +REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." 
&& pwd)" +DEPLOY_SCRIPT="$SCRIPT_DIR/deploy-osmo-minimal.sh" +KIND_CONFIG="$REPO_ROOT/ci/deployment-test/kind-config.yaml" + +RUN_DIR="${RUN_DIR:-$REPO_ROOT/runs/deployment-test-${PROVIDER}}" +mkdir -p "$RUN_DIR" + +DEPLOY_LOG="$RUN_DIR/deploy.log" +OETF_LOG="$RUN_DIR/oetf.log" +TEARDOWN_LOG="$RUN_DIR/teardown.log" +RESULT_JSON="$RUN_DIR/deployment-test-result.json" +JUNIT_XML="$RUN_DIR/junit.xml" + +KIND_CLUSTER_NAME="osmo-deployment-test" +OSMO_NAMESPACE="osmo-minimal" +HARD_TIMEOUT_SECONDS=2700 # 45 minutes + +# Per-stage state for the final JSON. +declare -a STAGE_NAMES=() +declare -a STAGE_EXIT_CODES=() +declare -a STAGE_DURATIONS=() +OVERALL_EXIT_CODE=0 +FAILED_STAGE="" + +log_info() { printf '[%s] [INFO] %s\n' "$(date -u +%H:%M:%S)" "$*"; } +log_error() { printf '[%s] [ERROR] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2; } + +# ── Result + teardown helpers ──────────────────────────────────────────────── +record_stage() { + # record_stage + STAGE_NAMES+=("$1") + STAGE_EXIT_CODES+=("$2") + STAGE_DURATIONS+=("$3") +} + +# Map an exit code to its semantic stage name (plan §D4.1 invariant 8). 
+exit_code_category() {
+  case "$1" in
+    0) echo "pass" ;;
+    1) echo "cluster-bootstrap" ;;
+    2) echo "deploy-script-or-verify" ;;
+    4) echo "oetf-smoke" ;;
+    5) echo "teardown" ;;
+    *) echo "unknown" ;;
+  esac
+}
+
+# Write the machine-readable run summary to $RESULT_JSON: provider, versions,
+# one object per recorded stage, and the overall verdict / exit code.
+# Values are interpolated as-is (no JSON escaping is performed).
+emit_result_json() {
+  local overall="pass"
+  [[ "$OVERALL_EXIT_CODE" -ne 0 ]] && overall="fail"
+
+  {
+    printf '{\n'
+    printf ' "provider": "%s",\n' "$PROVIDER"
+    printf ' "chart_version": "%s",\n' "$CHART_VERSION"
+    printf ' "image_tag": "%s",\n' "$IMAGE_TAG"
+    printf ' "stages": [\n'
+    local i
+    for i in "${!STAGE_NAMES[@]}"; do
+      # Every element but the last is followed by a comma.
+      local sep=","
+      [[ "$i" -eq $(( ${#STAGE_NAMES[@]} - 1 )) ]] && sep=""
+      printf ' {"name": "%s", "exit_code": %s, "duration_seconds": %s}%s\n' \
+        "${STAGE_NAMES[$i]}" "${STAGE_EXIT_CODES[$i]}" "${STAGE_DURATIONS[$i]}" "$sep"
+    done
+    printf ' ],\n'
+    printf ' "overall": "%s",\n' "$overall"
+    printf ' "exit_code": %s,\n' "$OVERALL_EXIT_CODE"
+    printf ' "failed_stage": "%s"\n' "$FAILED_STAGE"
+    printf '}\n'
+  } > "$RESULT_JSON"
+}
+
+# Write $JUNIT_XML with one <testcase> per recorded stage; a stage whose
+# recorded exit code is non-zero carries a <failure> child naming the code
+# and its category.
+emit_junit_xml() {
+  # Minimal JUnit XML so GitLab CI's reports.junit: surfaces stages as cases.
+  local total="${#STAGE_NAMES[@]}"
+  local failures=0
+  local i
+  for i in "${!STAGE_NAMES[@]}"; do
+    [[ "${STAGE_EXIT_CODES[$i]}" -ne 0 ]] && failures=$((failures + 1))
+  done
+
+  {
+    # The format strings must carry the markup and one %s per argument;
+    # with empty formats printf emits no XML and drops every argument.
+    printf '<?xml version="1.0" encoding="UTF-8"?>\n'
+    printf '<testsuite name="deployment-test" tests="%s" failures="%s">\n' "$total" "$failures"
+    for i in "${!STAGE_NAMES[@]}"; do
+      local name="${STAGE_NAMES[$i]}"
+      local code="${STAGE_EXIT_CODES[$i]}"
+      local duration="${STAGE_DURATIONS[$i]}"
+      printf '  <testcase classname="deployment-test.%s" name="%s" time="%s">' \
+        "$PROVIDER" "$name" "$duration"
+      if [[ "$code" -ne 0 ]]; then
+        printf '<failure message="stage %s failed with exit code %s (%s)"/>' \
+          "$name" "$code" "$(exit_code_category "$code")"
+      fi
+      printf '</testcase>\n'
+    done
+    printf '</testsuite>\n'
+  } > "$JUNIT_XML"
+}
+
+# EXIT trap: tears the deployment down (unless SKIP_TEARDOWN=1), records the
+# teardown stage, emits the JSON + JUnit reports, and exits with the overall
+# exit code.
+cleanup() {
+  local rc=$?
+  # If we're here because a stage already set OVERALL_EXIT_CODE, preserve it;
+  # otherwise infer from $rc (e.g. ERR-on-set -e from an unguarded command).
+  if [[ "$OVERALL_EXIT_CODE" -eq 0 && "$rc" -ne 0 ]]; then
+    OVERALL_EXIT_CODE="$rc"
+    FAILED_STAGE="${FAILED_STAGE:-unknown}"
+  fi
+
+  # Best-effort: silence the watchdog before its sleep elapses. Safe to call
+  # even if WATCHDOG_PID is unset/already-dead (stop_watchdog tolerates both).
+  if declare -F stop_watchdog >/dev/null 2>&1; then
+    stop_watchdog
+  fi
+
+  local td_start td_end td_rc=0
+  td_start=$SECONDS
+  log_info "Teardown: starting (preserving exit code $OVERALL_EXIT_CODE)"
+
+  if [[ "$SKIP_TEARDOWN" == "1" ]]; then
+    log_info "SKIP_TEARDOWN=1 — skipping deploy --destroy and infra cleanup"
+  else
+    # Best-effort destroy via the same orchestrator the test exercises.
+    # --destroy is idempotent (plan §D4.1 invariant 7), so it is safe to
+    # run even when stage 1 only got halfway through cluster creation.
+    #
+    # NOTE: deploy-osmo-minimal.sh's accepted providers are azure|aws|microk8s|byo
+    # (deploy-osmo-minimal.sh:450-457). Our wrapper's `byo-kind` taxonomy must
+    # translate to `byo` at this boundary.
+    local deploy_provider="$PROVIDER"
+    [[ "$PROVIDER" == "byo-kind" ]] && deploy_provider="byo"
+    local destroy_args=(--provider "$deploy_provider" --destroy --non-interactive)
+    # For cloud providers, preserve the externally-managed terraform infra.
+    # Without --skip-terraform, deploy-osmo-minimal.sh --destroy would run
+    # `terraform destroy` and delete the cluster + postgres + redis that
+    # the operator provisioned out-of-band.
+    if [[ "$PROVIDER" == "azure" || "$PROVIDER" == "aws" ]]; then
+      destroy_args+=(--skip-terraform)
+    fi
+    if [[ -x "$DEPLOY_SCRIPT" ]]; then
+      bash "$DEPLOY_SCRIPT" "${destroy_args[@]}" \
+        >>"$TEARDOWN_LOG" 2>&1 || td_rc=$?
+    fi
+
+    if [[ "$PROVIDER" == "byo-kind" ]]; then
+      # Even if the deploy script never ran or partial-failed, ensure the
+      # KIND cluster, sidecar containers, and unused images are removed
+      # so the runner returns to a clean state.
+      kind delete cluster --name "$KIND_CLUSTER_NAME" >>"$TEARDOWN_LOG" 2>&1 || true
+      docker rm -f osmo-test-postgres osmo-test-redis >>"$TEARDOWN_LOG" 2>&1 || true
+      docker system prune -af --filter "until=2h" >>"$TEARDOWN_LOG" 2>&1 || true
+    fi
+  fi
+
+  td_end=$SECONDS
+  record_stage "teardown" "$td_rc" "$((td_end - td_start))"
+
+  # A teardown failure is only the controlling exit code when no earlier
+  # stage already failed --- keep the original signal so triage points at
+  # the real regression.
+  if [[ "$OVERALL_EXIT_CODE" -eq 0 && "$td_rc" -ne 0 ]]; then
+    OVERALL_EXIT_CODE=5
+    FAILED_STAGE="teardown"
+  fi
+
+  emit_result_json
+  emit_junit_xml
+
+  log_info "Teardown: complete; overall exit code = $OVERALL_EXIT_CODE (failed_stage=${FAILED_STAGE:-none})"
+  exit "$OVERALL_EXIT_CODE"
+}
+trap cleanup EXIT
+
+# ── Hard 45-minute timeout ───────────────────────────────────────────────────
+# Background watchdog process signals the main script if a stage hangs past
+# the bounded duration invariant. We send SIGTERM to the main shell ($$) only
+# --- not to the whole process group (`kill -- -$$`) --- because this script
+# is not guaranteed to be a session leader (CI runners frequently exec it
+# inside an existing group). SIGTERM gives the EXIT trap a chance to run
+# teardown.
+MAIN_PID=$$
+(
+  # Sleep in the background and `wait` on it: a TERM from stop_watchdog then
+  # interrupts the wait at once and the handler reaps the sleep. A foreground
+  # sleep would outlive the killed subshell and keep the inherited
+  # stdout/stderr open until the full timeout elapsed.
+  sleep_pid=""
+  trap '[[ -n "$sleep_pid" ]] && kill "$sleep_pid" 2>/dev/null; exit 0' TERM
+  sleep "$HARD_TIMEOUT_SECONDS" &
+  sleep_pid=$!
+  wait "$sleep_pid"
+  log_error "Hard timeout (${HARD_TIMEOUT_SECONDS}s) reached; aborting"
+  kill -TERM "$MAIN_PID" 2>/dev/null || true
+) &
+WATCHDOG_PID=$!
+disown "$WATCHDOG_PID" 2>/dev/null || true
+
+# Stop the watchdog; both commands are best-effort so an already-exited
+# watchdog is not an error.
+stop_watchdog() {
+  kill "$WATCHDOG_PID" 2>/dev/null || true
+  wait "$WATCHDOG_PID" 2>/dev/null || true
+}
+
+# ── Stage runner ─────────────────────────────────────────────────────────────
+# run_stage <name> <fail_exit_code> <command> [args...]
+# Runs the command, records its result and duration, and on failure exits the
+# script with <fail_exit_code> (the EXIT trap then performs teardown).
+run_stage() {
+  local name="$1"
+  local fail_code="$2"
+  shift 2
+
+  log_info "Stage start: $name"
+  local start=$SECONDS
+  local rc=0
+
+  # Capture the command's own status. Inside `if ! cmd; then`, `$?` is the
+  # status of the negated pipeline (always 0 there), so the raw code would
+  # be lost; `cmd || rc=$?` keeps it.
+  "$@" || rc=$?
+  if [[ "$rc" -ne 0 ]]; then
+    log_error "Stage failed: $name (raw rc=$rc → categorized $fail_code)"
+    record_stage "$name" "$fail_code" "$((SECONDS - start))"
+    OVERALL_EXIT_CODE="$fail_code"
+    FAILED_STAGE="$name"
+    stop_watchdog
+    exit "$fail_code"
+  fi
+
+  record_stage "$name" 0 "$((SECONDS - start))"
+  log_info "Stage pass: $name ($((SECONDS - start))s)"
+}
+
+# ── Stage implementations ────────────────────────────────────────────────────
+
+stage_bootstrap_byo_kind() {
+  log_info "Creating KIND cluster '$KIND_CLUSTER_NAME' (config=$KIND_CONFIG)"
+  kind create cluster \
+    --name "$KIND_CLUSTER_NAME" \
+    --config "$KIND_CONFIG" \
+    --wait 5m
+
+  log_info "Starting ephemeral postgres + redis sidecars on the 'kind' docker network"
+  # postgres:15 reads POSTGRES_USER/POSTGRES_PASSWORD/POSTGRES_DB at container
+  # startup to create the role+db. POSTGRES_USER here is the container's env
+  # contract --- distinct from POSTGRES_USERNAME (the libpq credential name
+  # the deploy script reads at deploy-osmo-minimal.sh:585).
+  docker run -d --name osmo-test-postgres --network kind \
+    -e POSTGRES_PASSWORD=test \
+    -e POSTGRES_USER=postgres \
+    -e POSTGRES_DB=osmo \
+    postgres:15
+  # deploy-osmo-minimal.sh's BYO preflight (line 587) rejects empty
+  # REDIS_PASSWORD with `[[ -z ... ]]`, so the sidecar must require a
+  # password. This differs from the microk8s in-cluster redis path which
+  # tolerates empty passwords explicitly.
+  docker run -d --name osmo-test-redis --network kind \
+    redis:7 redis-server --requirepass test-redis-password
+
+  # Export creds for deploy-osmo-minimal.sh's --non-interactive path.
+  # Variable names match deploy-osmo-minimal.sh:584-595 exactly:
+  #   POSTGRES_HOST, POSTGRES_USERNAME (NOT POSTGRES_USER), POSTGRES_PASSWORD,
+  #   POSTGRES_DB_NAME, REDIS_HOST, REDIS_PORT, REDIS_PASSWORD (non-empty).
+ export POSTGRES_HOST=osmo-test-postgres + export POSTGRES_USERNAME=postgres + export POSTGRES_PASSWORD=test + export POSTGRES_DB_NAME=osmo + export REDIS_HOST=osmo-test-redis + export REDIS_PORT=6379 + export REDIS_PASSWORD=test-redis-password + + log_info "Waiting for control-plane Ready" + kubectl wait --for=condition=Ready node \ + --selector='node-role.kubernetes.io/control-plane' \ + --timeout=5m +} + +stage_bootstrap_microk8s() { + # TODO(plan §D4.2): microk8s requires `privileged: true` on the runner + # (snap install). Ship D4 v1 with byo-kind only; wire microk8s in once a + # privileged runner class is justified by a real regression. + log_error "--provider microk8s is not yet supported in run-deployment-test.sh" + log_error "See plan §D4.2 'Why --provider byo-kind first'" + return 1 +} + +stage_bootstrap_azure() { + # Azure infra (AKS + flexible postgres + redis cache + storage) is + # provisioned out-of-band via terraform — the same flow operators use + # for real deployments. This wrapper only confirms reachability; + # provisioning belongs to the human/automation that ran terraform. 
+ if [[ -z "$AZURE_SUBSCRIPTION_ID" ]]; then + if command -v az >/dev/null 2>&1; then + AZURE_SUBSCRIPTION_ID="$(az account show --query id -o tsv 2>/dev/null || true)" + fi + if [[ -z "$AZURE_SUBSCRIPTION_ID" ]]; then + log_error "AZURE_SUBSCRIPTION_ID is required (env or --subscription-id)" + return 1 + fi + fi + for var in AZURE_RESOURCE_GROUP AZURE_CLUSTER_NAME POSTGRES_PASSWORD; do + if [[ -z "${!var}" ]]; then + log_error "Required for --provider azure: $var (env or matching CLI flag)" + return 1 + fi + done + + log_info "Refreshing kubectl credentials for AKS cluster" + log_info " subscription=$AZURE_SUBSCRIPTION_ID resource-group=$AZURE_RESOURCE_GROUP cluster=$AZURE_CLUSTER_NAME" + az aks get-credentials \ + --subscription "$AZURE_SUBSCRIPTION_ID" \ + --resource-group "$AZURE_RESOURCE_GROUP" \ + --name "$AZURE_CLUSTER_NAME" \ + --admin --overwrite-existing >/dev/null + + log_info "Confirming cluster reachability" + kubectl get nodes -o wide + kubectl version --output=yaml | head -10 || true +} + +stage_bootstrap() { + case "$PROVIDER" in + byo-kind) stage_bootstrap_byo_kind ;; + microk8s) stage_bootstrap_microk8s ;; + azure) stage_bootstrap_azure ;; + *) + log_error "Unknown provider: $PROVIDER" + return 1 ;; + esac +} + +stage_deploy() { + if [[ "$SKIP_DEPLOY" == "1" ]]; then + log_info "SKIP_DEPLOY=1 — skipping stage_deploy (returns pass)" + return 0 + fi + + # Translate the wrapper's `byo-kind` taxonomy to deploy-osmo-minimal.sh's + # accepted provider set (azure|aws|microk8s|byo; see deploy-osmo-minimal.sh:450-457). + local deploy_provider="$PROVIDER" + [[ "$PROVIDER" == "byo-kind" ]] && deploy_provider="byo" + + # OSMO_CHART_VERSION / OSMO_IMAGE_TAG are read as env vars by deploy-k8s.sh + # (lines 59-60, 661, 730-731, 741, 762-763). They are NOT CLI flags --- the + # deploy script silently drops unknown flags via `*) shift ;;` at lines + # 386-388, so passing --chart-version/--image-tag would do nothing. 
+ [[ -n "$CHART_VERSION" ]] && export OSMO_CHART_VERSION="$CHART_VERSION" + [[ -n "$IMAGE_TAG" ]] && export OSMO_IMAGE_TAG="$IMAGE_TAG" + + local args=() + case "$PROVIDER" in + byo-kind) + # KIND has no cloud LoadBalancer controller — pin gateway to + # NodePort 30080 (matching ci/deployment-test/kind-config.yaml). + # STORAGE_BACKEND=none short-circuits configure_storage_phase + # (deploy-osmo-minimal.sh:733-737) since terraform outputs aren't + # available on a BYO KIND box. + args=( + --provider "$deploy_provider" + --non-interactive + --no-gpu + --storage-backend none + --helm-set gateway.envoy.service.type=NodePort + --helm-set gateway.envoy.service.nodePort=30080 + --helm-set gateway.envoy.service.httpsPort=null + ) + ;; + azure) + # Azure expects --skip-terraform (terraform applied externally). + # STORAGE_BACKEND default for Azure path is minio (per user flow); + # caller may override via --storage-backend. Real Azure LB is + # provisioned by the chart's default service.type=LoadBalancer, + # so do NOT pin to NodePort here. + # + # Chart defaults reserve 1 full CPU each for logger / service / + # worker / agent with minReplicas=3 on logger, AND 1 full CPU + # for the osmo-ctrl sidecar of every workflow pod (chart + # path: services.configs.workflow.podTemplates.default_ctrl. + # spec.containers[0].resources.requests.cpu = "1"). On a + # 3-node Standard_D4s_v3 system pool (4 vCPU each, ~3 + # schedulable after Azure daemons) the K8_CPU placeholder + # (= node.allocatable.cpu − default_ctrl.requests.cpu − + # non_workflow_usage; see postgres.py + # construct_updated_allocatables) drops below 1.0, so the + # strict-LE rule `USER_CPU LE K8_CPU` rejects every + # cpu=1 task ("Value 1.0 too high for CPU"). 
+ # + # Two reductions: + # - OSMO-service requests → 100m (was 1 each → 5 × 1 = 5 CPU) + # - osmo-ctrl sidecar request → 100m (was 1 per workflow task) + # The chart's CPU LIMIT on ctrl/user still tracks USER_CPU, + # so the user's task still gets its full requested CPU budget + # at runtime; only the SCHEDULING request shrinks. + args=( + --provider azure + --non-interactive + --no-gpu + --skip-terraform + --storage-backend "${STORAGE_BACKEND:-minio}" + --subscription-id "$AZURE_SUBSCRIPTION_ID" + --resource-group "$AZURE_RESOURCE_GROUP" + --region "$AZURE_REGION" + --cluster-name "$AZURE_CLUSTER_NAME" + --environment "$ENVIRONMENT" + --postgres-password "$POSTGRES_PASSWORD" + --helm-set services.logger.scaling.minReplicas=1 + --helm-set services.logger.resources.requests.cpu=100m + --helm-set services.service.resources.requests.cpu=100m + --helm-set services.worker.resources.requests.cpu=100m + --helm-set services.agent.resources.requests.cpu=100m + --helm-set services.router.resources.requests.cpu=100m + # default_ctrl pod template override (osmo-ctrl sidecar + # requests.cpu → 100m). Has to come via --helm-values not + # --helm-set because helm replaces list elements wholesale — + # `--set …containers[0]...cpu=100m` wipes the container's + # `name` and limits, breaking the configmap loader's schema. + --helm-values "${SCRIPT_DIR}/../../ci/deployment-test/azure-overrides.yaml" + ) + ;; + *) + log_error "stage_deploy: provider $PROVIDER not wired" + return 1 + ;; + esac + + log_info "Invoking $DEPLOY_SCRIPT (provider=$deploy_provider, ${#args[@]} args)" + log_info " (env: OSMO_CHART_VERSION='${OSMO_CHART_VERSION:-}' OSMO_IMAGE_TAG='${OSMO_IMAGE_TAG:-}')" + bash "$DEPLOY_SCRIPT" "${args[@]}" 2>&1 | tee "$DEPLOY_LOG" + # PIPESTATUS[0] = exit code of bash invocation; tee never fails. 
+ local rc="${PIPESTATUS[0]}" + return "$rc" +} + +stage_oetf_smoke() { + if [[ "$SKIP_OETF" == "1" ]]; then + log_info "SKIP_OETF=1 — skipping stage_oetf_smoke (returns pass)" + return 0 + fi + + # Locate the deployed OSMO URL. + # byo-kind: KIND config maps host :80 → NodePort 30080 → gateway-envoy Service. + # azure: chart default service.type=LoadBalancer → external IP. Wait briefly. + local osmo_url + case "$PROVIDER" in + byo-kind) + osmo_url="http://localhost" + ;; + azure) + # Tried hitting the Azure LB external IP directly first + # (osmo-gateway Service is LoadBalancer type). The IP shows + # up in kubectl get svc within ~30s, but actual reachability + # from the GitHub runner takes longer to settle: every OETF + # bazel test got `ConnectTimeoutError(timeout=60)` to the + # LB on port 80. The cluster's verify-hello check (verify.sh) + # had no such issue because it goes via kubectl port-forward. + # Mirror that: start a localhost port-forward to osmo-gateway + # and point OETF at localhost. Robust to any LB-propagation + # delay or NSG quirk. + local pf_port="${OSMO_OETF_PF_PORT:-9100}" + log_info "Starting kubectl port-forward for OETF: localhost:${pf_port} → osmo-gateway:80" + local pf_svc="" + for candidate in osmo-gateway osmo-gateway-envoy; do + if kubectl get svc -n "$OSMO_NAMESPACE" "$candidate" >/dev/null 2>&1; then + pf_svc="$candidate"; break + fi + done + if [[ -z "$pf_svc" ]]; then + log_error "Neither osmo-gateway nor osmo-gateway-envoy found in $OSMO_NAMESPACE" + return 1 + fi + # nohup + & so the PF outlives this function's subshells. + # Also drop output to a per-run log so we can debug PF crashes. + nohup kubectl port-forward -n "$OSMO_NAMESPACE" \ + "svc/${pf_svc}" "${pf_port}:80" \ + > "$RUN_DIR/oetf-pf.log" 2>&1 & + local pf_pid=$! + # Smoke the PF before we hand off to OETF; OETF will retry on + # its own but a hard-fail here surfaces PF problems immediately. 
+ local pf_ready="" + for _ in 1 2 3 4 5 6 7 8 9 10; do + if curl -sS -o /dev/null -m 2 "http://localhost:${pf_port}/api/version" 2>/dev/null; then + pf_ready=1; break + fi + sleep 1 + done + if [[ -z "$pf_ready" ]]; then + log_error "port-forward to ${pf_svc}:80 didn't become reachable on localhost:${pf_port}; check $RUN_DIR/oetf-pf.log" + kill "$pf_pid" 2>/dev/null || true + return 1 + fi + log_info "Port-forward healthy (PID=$pf_pid). OETF will use http://localhost:${pf_port}" + # Ensure PF dies on function return (success OR failure). + # Bash RETURN trap is per-function — re-arm here. + trap "kill $pf_pid 2>/dev/null || true" RETURN + osmo_url="http://localhost:${pf_port}" + + # Set admin's profile-level default pool. Required because: + # - api-checks/test_list_workflows passes `pool=default` as + # query param, but `/api/workflow` reads `pools` (PLURAL) + # from fastapi.Query — singular is silently ignored + # (workflow_service.py:587). #1114's "fix" used the wrong + # param name; the server-side handler falls through to + # UserProfile.pool lookup, which is empty by default for + # dev-auth admin and raises "No pool selected!" + # (workflow_service.py:609-612). + # - Storing the profile-level default via `osmo profile set + # pool default` fills that fallback so the test passes + # without needing to fix the test query param. + if command -v osmo >/dev/null 2>&1; then + log_info "Setting admin profile default pool=default (workaround for #1114's wrong-param api-checks fix)" + osmo login "$osmo_url" --method dev --username admin >/dev/null 2>&1 \ + || log_warning "osmo login failed — api-checks may still fail" + osmo profile set pool default >/dev/null 2>&1 \ + || log_warning "osmo profile set pool failed — api-checks may still fail" + fi + ;; + *) + osmo_url="http://localhost" + ;; + esac + log_info "Running OETF smoke against $osmo_url" + + # OETF lives in the OUTER osmo repo at test/oetf (sibling of external/). 
+ # When this script runs from an external/ worktree, $REPO_ROOT points at + # the worktree's parent (e.g. /tmp/) which does not contain test/. The + # caller supplies OETF_REPO_ROOT to point at the actual outer checkout. + # (Path was test_infra/oetf prior to the 2026-06 rename — keep a fallback + # so older checkouts still work without re-editing.) + local oetf_repo="${OETF_REPO_ROOT:-$REPO_ROOT}" + local oetf_pkg="" + if [[ -d "$oetf_repo/test/oetf" ]]; then + oetf_pkg="//test/oetf:run" + elif [[ -d "$oetf_repo/test_infra/oetf" ]]; then + oetf_pkg="//test_infra/oetf:run" + else + log_error "OETF source not found under $oetf_repo (looked for test/oetf and test_infra/oetf; set OETF_REPO_ROOT)" + return 1 + fi + if ! command -v bazel >/dev/null 2>&1; then + log_error "OETF KIND entrypoint not wired --- bazel not on PATH. See runbook-3." + return 1 + fi + log_info "OETF target: $oetf_pkg (repo=$oetf_repo)" + + # OETF tag selection. `smoke` is the canonical post-deploy gate, but + # during the test_infra → test/oetf migration the public staging/smoke/ + # set is empty after `auth` is auto-excluded (--auth-method dev). The + # caller can override via $OETF_TAGS; default falls back from smoke to + # `cli` (a real scenario test that exercises OSMO workflow submission). + local oetf_tags="${OETF_TAGS:-smoke}" + # --pool: without it, OETF's `osmo` CLI invocations error with + # `No pool selected!` because the dev-auth admin user has no + # default pool stored. The chart's default pool name is `default`. 
+ local oetf_pool="${OETF_POOL:-default}" + ( + cd "$oetf_repo" + bazel run "$oetf_pkg" -- \ + --env kind \ + --url "$osmo_url" \ + --auth-method dev \ + --auth-username admin \ + --pool "$oetf_pool" \ + --tags "$oetf_tags" \ + --output-json "$RUN_DIR/oetf-result.json" + ) 2>&1 | tee "$OETF_LOG" + local rc="${PIPESTATUS[0]}" + return "$rc" +} + +# ── Main ───────────────────────────────────────────────────────────────────── + +log_info "run-deployment-test.sh: provider=$PROVIDER chart_version='$CHART_VERSION' image_tag='$IMAGE_TAG'" +log_info "RUN_DIR=$RUN_DIR" + +run_stage "bootstrap" 1 stage_bootstrap +run_stage "deploy" 2 stage_deploy +run_stage "oetf-smoke" 4 stage_oetf_smoke + +stop_watchdog +log_info "PASS: deployment-test for provider=$PROVIDER" +# trap cleanup EXIT runs teardown, emits JSON/JUnit, and exits 0. diff --git a/deployments/terraform/azure/example/example.tf b/deployments/terraform/azure/example/example.tf index bfd5ceaf1..4ce8a5d90 100644 --- a/deployments/terraform/azure/example/example.tf +++ b/deployments/terraform/azure/example/example.tf @@ -73,8 +73,13 @@ data "azurerm_resource_group" "main" { ################################################################################ module "vnet" { - source = "Azure/avm-res-network-virtualnetwork/azurerm" - version = "~> 0.10" + source = "Azure/avm-res-network-virtualnetwork/azurerm" + # Pin to 0.17.x. 0.18.0 (2026-06-15) added IPAM validation rules that rely + # on `||` short-circuit in `validation { condition = ... }` — Terraform + # 1.9.x evaluates both sides, so `length(null)` throws even when the + # `ipam_pools == null` branch is true. Re-evaluate once we bump Terraform + # to >= 1.10 or once the AVM module guards the validation with `try()`. + version = "~> 0.17.0" name = "${local.name}-vnet" parent_id = data.azurerm_resource_group.main.id