Skip to content
Open
250 changes: 243 additions & 7 deletions deployments/scripts/azure/terraform.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,9 @@ TF_PROJECT_NAME="${TF_PROJECT_NAME:-osmo}"
# create new standard-tier clusters.
TF_K8S_VERSION="${TF_K8S_VERSION:-1.33.11}"

# GPU node-pool inputs (used by azure_generate_tfvars to render
# gpu_node_pool_{min,max}_size + gpu_vm_size). An empty TF_GPU_COUNT together
# with TF_GPU_NODE_POOL_ENABLED=false means a CPU-only cluster. Populated by
# azure_configure_interactively when the user opts in via the GPU prompt.
TF_GPU_NODE_POOL_ENABLED="${TF_GPU_NODE_POOL_ENABLED:-false}"
Expand Down Expand Up @@ -251,6 +252,44 @@ azure_describe_vm_sku() {
| head -1
}

# Print the zones `az vm list-skus` reports for one VM SKU in one region, as
# JSON (e.g. `["1","3"]`). stderr from az is discarded; the function's exit
# status is az's exit status.
# Reads `${sub_args[@]}` from the caller's scope when it is set and non-empty
# (extra az arguments such as `--subscription <id>`).
# Args: region sku
_azure_sku_zones_json() {
    local location="$1" vm_size="$2"
    # First matching SKU row, first locationInfo entry, its zone list.
    local zones_query="[?name=='${vm_size}'].locationInfo[0].zones | [0]"

    local -a az_argv=(vm list-skus -l "$location")
    az_argv+=(${sub_args[@]+"${sub_args[@]}"})
    az_argv+=(--query "$zones_query" -o json)

    az "${az_argv[@]}" 2>/dev/null
}

# Print, as a Terraform list literal (e.g. `["1", "3"]`), the availability
# zones common to all active node pools (both pools share
# var.availability_zones in TF).
#
# Output contract:
#   - `["1", "2"]` when the node SKU's zones cannot be determined (empty,
#     `null` or `[]` from _azure_sku_zones_json) or when jq cannot process
#     the zone lists (jq missing, malformed JSON).
#   - `[]` only when both pools' zones are known but disjoint.
#   - otherwise the sorted intersection of node and GPU zones when the GPU
#     pool is enabled and its zones are known, else the sorted node zones.
# Always returns 0. Requires jq; calls _azure_sku_zones_json, which reads
# `${sub_args[@]}` from the caller's scope.
# Args: region node_sku gpu_enabled gpu_sku
_azure_resolve_pool_zones() {
    local region="$1" node_sku="$2" gpu_enabled="$3" gpu_sku="$4"
    local fallback='["1", "2"]'
    local node_zones gpu_zones="" zones

    node_zones=$(_azure_sku_zones_json "$region" "$node_sku")
    if [[ -z "$node_zones" || "$node_zones" == "null" || "$node_zones" == "[]" ]]; then
        printf '%s\n' "$fallback"
        return 0
    fi

    if [[ "$gpu_enabled" == "true" ]]; then
        gpu_zones=$(_azure_sku_zones_json "$region" "$gpu_sku")
    fi
    # GPU pool disabled or its zones unknown: constrain by the node zones
    # only. Intersecting the node zones with themselves is the identity, so
    # the single jq program below covers every case.
    if [[ -z "$gpu_zones" || "$gpu_zones" == "null" || "$gpu_zones" == "[]" ]]; then
        gpu_zones="$node_zones"
    fi

    # Intersect, sort and render in one jq pass. An empty intersection renders
    # as `[]`. A jq failure must not be mistaken for a disjoint zone set, so
    # it falls back to the default instead of printing `[]` or nothing.
    if ! zones=$(jq -nr --argjson a "$node_zones" --argjson b "$gpu_zones" \
            '[$a[] | select(IN($b[]))] | sort | map("\"" + . + "\"") | "[" + join(", ") + "]"' \
            2>/dev/null) || [[ -z "$zones" ]]; then
        printf '%s\n' "$fallback"
        return 0
    fi
    printf '%s\n' "$zones"
}

# Iterate through TF_REGION_CANDIDATES looking for the first region whose
# remaining quota for the GPU SKU's family >= (count × vCPUs-per-node).
# Echoes the chosen region (empty if none qualifies).
Expand Down Expand Up @@ -431,6 +470,21 @@ azure_generate_tfvars() {
local tfvars_file="$1"
log_info "Generating terraform.tfvars..."

# TF_AVAILABILITY_ZONES (comma-separated) overrides auto-detection.
local resolved_zones
if [[ -n "${TF_AVAILABILITY_ZONES:-}" ]]; then
resolved_zones=$(echo "$TF_AVAILABILITY_ZONES" | tr ',' '\n' | \
jq -Rcn '[inputs | select(length > 0) | gsub("^ +| +$"; "")] | map("\"" + . + "\"") | "[" + join(", ") + "]"' -r)
else
local sub_args=()
[[ -n "${TF_SUBSCRIPTION_ID:-}" ]] && sub_args+=(--subscription "$TF_SUBSCRIPTION_ID")
resolved_zones=$(_azure_resolve_pool_zones \
"${TF_REGION:-eastus2}" \
"${TF_NODE_INSTANCE_TYPE:-Standard_D2s_v3}" \
"${TF_GPU_NODE_POOL_ENABLED:-false}" \
"${TF_GPU_VM_SIZE:-Standard_NC40ads_H100_v5}")
fi

cat > "$tfvars_file" <<EOF
# Auto-generated by deploy-osmo-minimal.sh
# Generated on: $(date)
Expand All @@ -450,7 +504,7 @@ private_subnets = ["10.0.1.0/24", "10.0.2.0/24"]
database_subnets = ["10.0.201.0/24", "10.0.202.0/24"]

# Availability Zones
availability_zones = ["1", "2"]
availability_zones = $resolved_zones

# AKS Configuration
kubernetes_version = "$TF_K8S_VERSION"
Expand Down Expand Up @@ -491,10 +545,10 @@ log_analytics_retention_days = 30
# Optional GPU node pool
# Triggered by --gpu-node-pool on deploy-osmo-minimal.sh, or by answering "yes"
# to the GPU prompt in azure_configure_interactively.
gpu_node_pool_enabled = ${TF_GPU_NODE_POOL_ENABLED:-false}
gpu_vm_size = "${TF_GPU_VM_SIZE:-Standard_NC40ads_H100_v5}"
gpu_min = ${TF_GPU_COUNT:-0}
gpu_max = ${TF_GPU_COUNT:-0}
gpu_node_pool_enabled = ${TF_GPU_NODE_POOL_ENABLED:-false}
gpu_vm_size = "${TF_GPU_VM_SIZE:-Standard_NC40ads_H100_v5}"
gpu_node_pool_min_size = ${TF_GPU_COUNT:-0}
gpu_node_pool_max_size = ${TF_GPU_COUNT:-0}

# Optional Azure Blob Storage Account for workflow data
# Triggered by --storage-backend azure-blob on deploy-osmo-minimal.sh
Expand Down Expand Up @@ -548,6 +602,184 @@ azure_preflight_checks() {
log_success "Azure pre-flight checks passed"
}

# Compare a requested vCPU count against (limit - used) for a Microsoft.Compute
# quota family in a region, using the first `az vm list-usage` row whose
# name.value contains the family string.
#
# Returns 0 when the request fits, or when quota data is unavailable or
# unparseable (a warning is logged and the math is skipped); returns 1 when
# the request exceeds what is available (details are logged via log_error).
# Reads `${sub_args[@]}` from the caller's scope.
# Args: region family need pool_label math_label
#   need        - vCPUs required (integer)
#   pool_label  - human-readable pool name used in log messages
#   math_label  - human-readable breakdown of how `need` was computed
_azure_check_vcpu_quota() {
    local region="$1" family="$2" need="$3" pool_label="$4" math_label="$5"
    local row limit used available
    row=$(az vm list-usage -l "$region" ${sub_args[@]+"${sub_args[@]}"} \
        --query "[?contains(name.value, '$family')] | [0].[limit, currentValue]" -o tsv 2>/dev/null)
    if [[ -z "$row" ]]; then
        log_warning " vCPU quota: no usage row matching family '$family' in $region — skipping math."
        return 0
    fi
    # The two values may arrive tab-separated on one line or one per line,
    # depending on how az renders the list as tsv. `read` only consumes the
    # first line, so fold newlines (and stray CRs) into spaces before
    # splitting; otherwise `used` could come back empty and the check would
    # be silently skipped.
    row=${row//$'\r'/}
    row=${row//$'\n'/ }
    read -r limit used _ <<<"$row"
    if [[ ! "$limit" =~ ^[0-9]+$ || ! "$used" =~ ^[0-9]+$ ]]; then
        log_warning " vCPU quota: malformed row for '$family' (limit=$limit used=$used) — skipping math."
        return 0
    fi
    available=$(( limit - used ))
    if (( need > available )); then
        log_error "Insufficient vCPU quota for $pool_label in $region."
        log_error " Need: $need vCPUs ($math_label)"
        log_error " Available: $available vCPUs ($used used / $limit limit, family '$family')"
        log_error " Request more via: Azure Portal → Subscriptions → Usage + quotas (filter to '$family Family vCPUs')"
        return 1
    fi
    log_info " ✓ vCPU quota OK for $pool_label: need $need, available $available ($used/$limit used)"
    return 0
}

# Fail fast on SKU/region/quota mismatches that would otherwise only surface
# 15-25 min into `terraform apply`.
#
# Checks, in order: AKS patch version, Postgres Flexible Server SKU, node-pool
# VM SKU + vCPU quota, (when enabled) GPU-pool VM SKU + vCPU quota, the
# availability zones shared by the pools, and a Managed Redis SKU advisory.
# Any hard failure logs via log_error and calls `exit 1`; missing quota or
# family data only produces a warning.
#
# Inputs (environment, with the defaults used here):
#   TF_REGION (eastus2), TF_K8S_VERSION (1.33.11),
#   TF_POSTGRES_SKU (GP_Standard_D2s_v3), TF_REDIS_SKU_NAME (ComputeOptimized_X3),
#   TF_NODE_INSTANCE_TYPE (Standard_D2s_v3), TF_NODE_GROUP_MAX_SIZE (5),
#   TF_GPU_NODE_POOL_ENABLED (false), TF_GPU_VM_SIZE (Standard_NC40ads_H100_v5),
#   TF_GPU_COUNT (0), TF_SUBSCRIPTION_ID (optional; passed as --subscription).
azure_preflight_sku_quota() {
    log_info "Pre-flight: SKU availability + vCPU quota..."

    # Lower-cased with spaces removed before being passed to `az -l`.
    local region
    region=$(echo "${TF_REGION:-eastus2}" | tr '[:upper:]' '[:lower:]' | tr -d ' ')

    local k8s_version="${TF_K8S_VERSION:-1.33.11}"
    local postgres_sku="${TF_POSTGRES_SKU:-GP_Standard_D2s_v3}"
    local redis_sku="${TF_REDIS_SKU_NAME:-ComputeOptimized_X3}"
    local node_sku="${TF_NODE_INSTANCE_TYPE:-Standard_D2s_v3}"
    local node_max="${TF_NODE_GROUP_MAX_SIZE:-5}"
    local gpu_enabled="${TF_GPU_NODE_POOL_ENABLED:-false}"
    local gpu_sku="${TF_GPU_VM_SIZE:-Standard_NC40ads_H100_v5}"
    local gpu_max="${TF_GPU_COUNT:-0}"

    # Defense-in-depth before interpolating into JMESPath / az args.
    if [[ ! "$k8s_version" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
        log_error "TF_K8S_VERSION='$k8s_version' is not in expected x.y.z format (e.g. 1.33.11)"
        exit 1
    fi

    # gpu_max is compared arithmetically further down; a non-numeric value
    # would otherwise be evaluated as a variable name and read as 0, letting
    # the preflight pass with a misleading "gpu_max=0" message.
    if [[ "$gpu_enabled" == "true" && ! "$gpu_max" =~ ^[0-9]+$ ]]; then
        log_error "TF_GPU_COUNT='$gpu_max' is not a non-negative integer (e.g. 2)"
        exit 1
    fi

    # Shared with the helpers below, which read sub_args from this scope.
    local sub_args=()
    if [[ -n "${TF_SUBSCRIPTION_ID:-}" ]]; then
        sub_args+=(--subscription "$TF_SUBSCRIPTION_ID")
    fi

    # `values[].version` is principal-only ("1.33") in every region; full
    # patches ("1.33.11") live as keys under `patchVersions`. TF azurerm
    # requires a full x.y.z, so match against the flattened key list.
    if ! az aks get-versions -l "$region" ${sub_args[@]+"${sub_args[@]}"} \
            --query "values[].patchVersions.keys(@) | []" -o tsv 2>/dev/null \
            | grep -Fqx "$k8s_version"; then
        log_error "AKS Kubernetes version $k8s_version is not in $region's supported patch list."
        log_error " See: az aks get-versions -l $region --query 'values[].patchVersions.keys(@) | []' -o tsv"
        exit 1
    fi
    log_info " ✓ AKS $k8s_version is GA in $region"

    # `TF_POSTGRES_SKU` carries the azurerm tier prefix (GP_/MO_/B_) which
    # maps to `supportedServerEditions[].name` upstream; the SKU under
    # `supportedServerSkus[].name` does not carry the prefix.
    local postgres_sku_name="${postgres_sku#GP_}"
    postgres_sku_name="${postgres_sku_name#MO_}"
    postgres_sku_name="${postgres_sku_name#B_}"
    # `grep -F`: SKU names contain `.` which would otherwise be a regex metachar.
    if ! az postgres flexible-server list-skus -l "$region" ${sub_args[@]+"${sub_args[@]}"} \
            --query "[].supportedServerEditions[].supportedServerSkus[].name" -o tsv 2>/dev/null \
            | grep -Fqx "$postgres_sku_name"; then
        log_error "Postgres Flexible Server SKU '$postgres_sku' (resolves to '$postgres_sku_name') is not available in $region."
        log_error " See: az postgres flexible-server list-skus -l $region \\"
        log_error " --query '[].supportedServerEditions[].supportedServerSkus[].name' -o tsv"
        exit 1
    fi
    log_info " ✓ Postgres SKU $postgres_sku available in $region"

    if ! az vm list-skus -l "$region" ${sub_args[@]+"${sub_args[@]}"} \
            --query "[?name=='$node_sku'].name | [0]" -o tsv 2>/dev/null | grep -Fqx "$node_sku"; then
        log_error "AKS node-pool VM SKU '$node_sku' is not available in $region."
        log_error " See: az vm list-skus -l $region --query \"[?name=='$node_sku']\" -o table"
        exit 1
    fi
    log_info " ✓ Node-pool SKU $node_sku available in $region"

    # azure_describe_vm_sku returns Azure's authoritative `family` field so
    # the contains() filter in _azure_check_vcpu_quota matches name.value.
    local node_family node_vcpus
    read -r node_family node_vcpus <<<"$(azure_describe_vm_sku "$node_sku")"
    if [[ -n "$node_family" && "$node_vcpus" =~ ^[0-9]+$ && "$node_max" =~ ^[0-9]+$ ]]; then
        # Worst case: the pool scaled out to its maximum size.
        local node_need=$(( node_max * node_vcpus ))
        if ! _azure_check_vcpu_quota "$region" "$node_family" "$node_need" "AKS system pool" "$node_max × $node_sku ($node_vcpus vCPUs each)"; then
            exit 1
        fi
    else
        log_warning " vCPU quota: couldn't read family/vCPU data for $node_sku — skipping quota math"
    fi

    if [[ "$gpu_enabled" == "true" ]]; then
        if ! az vm list-skus -l "$region" ${sub_args[@]+"${sub_args[@]}"} \
                --query "[?name=='$gpu_sku'].name | [0]" -o tsv 2>/dev/null | grep -Fqx "$gpu_sku"; then
            log_error "AKS GPU-pool VM SKU '$gpu_sku' is not available in $region."
            log_error " See: az vm list-skus -l $region --query \"[?name=='$gpu_sku']\" -o table"
            exit 1
        fi
        log_info " ✓ GPU-pool SKU $gpu_sku available in $region"

        local gpu_family gpu_vcpus
        read -r gpu_family gpu_vcpus <<<"$(azure_describe_vm_sku "$gpu_sku")"
        # gpu_max was validated as an integer above, so -gt/-eq are safe here.
        if [[ -n "$gpu_family" && "$gpu_vcpus" =~ ^[0-9]+$ && "$gpu_max" -gt 0 ]]; then
            local gpu_need=$(( gpu_max * gpu_vcpus ))
            if ! _azure_check_vcpu_quota "$region" "$gpu_family" "$gpu_need" "GPU pool" "$gpu_max × $gpu_sku ($gpu_vcpus vCPUs each)"; then
                exit 1
            fi
        elif [[ "$gpu_max" -eq 0 ]]; then
            log_info " GPU pool: gpu_max=0 — skipping quota math"
        else
            log_warning " vCPU quota: couldn't read family/vCPU data for $gpu_sku — skipping quota math"
        fi
    fi

    # Informational usage table. Skip when no family resolved so we don't
    # emit `contains(name.value, '')` which would match every row.
    local family_filter=""
    if [[ -n "$node_family" ]]; then
        family_filter="contains(name.value, '$node_family')"
    fi
    if [[ "$gpu_enabled" == "true" && -n "${gpu_family:-}" ]]; then
        if [[ -n "$family_filter" ]]; then
            family_filter="$family_filter || contains(name.value, '$gpu_family')"
        else
            family_filter="contains(name.value, '$gpu_family')"
        fi
    fi
    if [[ -n "$family_filter" ]]; then
        log_info " vCPU usage in $region (informational):"
        az vm list-usage -l "$region" ${sub_args[@]+"${sub_args[@]}"} -o table \
            --query "[?$family_filter].{name:name.localizedValue, used:currentValue, limit:limit}" \
            2>/dev/null || log_warning " (vm list-usage failed — check quota manually if apply errors)"
    fi

    # Empty intersection = TF apply will fail with AvailabilityZoneNotSupported.
    local resolved_zones
    resolved_zones=$(_azure_resolve_pool_zones "$region" "$node_sku" "$gpu_enabled" "$gpu_sku")
    if [[ "$resolved_zones" == "[]" ]]; then
        log_error "No availability zone supports both node SKU '$node_sku' and GPU SKU '$gpu_sku' in $region."
        log_error " Node SKU zones: $(_azure_sku_zones_json "$region" "$node_sku")"
        log_error " GPU SKU zones: $(_azure_sku_zones_json "$region" "$gpu_sku")"
        log_error " Pick a region where the two zone sets overlap, or split into two pools manually."
        exit 1
    fi
    log_info " ✓ Availability zones: $resolved_zones"

    # No `az redis-managed list-skus` per region today; warn on the
    # AllocationFailed-prone tiers (see variables.tf for the empirical note).
    case "$redis_sku" in
        Balanced_B0|Balanced_B1|Balanced_B3)
            log_warning " redis_sku_name='$redis_sku' has hit AllocationFailed in capacity-constrained regions."
            log_warning " Consider ComputeOptimized_X3 (variables.tf empirically validated default)."
            ;;
        *)
            log_info " Managed Redis SKU: $redis_sku"
            ;;
    esac

    log_success "Pre-flight SKU + quota checks passed"
}

azure_terraform_init() {
local terraform_dir="$1"
log_info "Initializing Terraform..."
Expand Down Expand Up @@ -710,6 +942,10 @@ azure_configure_kubectl() {
}

# Export functions for use by other scripts
export -f azure_preflight_sku_quota
export -f _azure_check_vcpu_quota
export -f _azure_sku_zones_json
export -f _azure_resolve_pool_zones
export -f azure_run_kubectl
export -f azure_run_kubectl_apply_stdin
export -f azure_run_helm
Expand Down
28 changes: 20 additions & 8 deletions deployments/scripts/deploy-osmo-minimal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -193,12 +193,14 @@ Discovery (provider-less, exit after running):
before picking an OSMO_CHART_VERSION pin so you can
see what's published.
--find-gpu-region SKU COUNT
Print the first Azure region (from
Find the first Azure region (from
TF_REGION_CANDIDATES, default eastus2 swedencentral
westus3 southcentralus westeurope) with sufficient
quota for COUNT x SKU. Exits non-zero if none
qualify. Used by agent-driven setup flows when the
user doesn't know which region to target.
qualify. When combined with --provider, sets
TF_REGION inline and continues the deploy; when
used standalone (no --provider), prints the region
and exits (query-only).

Environment Variables:
OSMO_IMAGE_REGISTRY OSMO image registry (default: nvcr.io/nvidia/osmo)
Expand Down Expand Up @@ -408,16 +410,21 @@ if [[ "$LIST_CHART_VERSIONS" == "true" ]]; then
fi

# --find-gpu-region SKU COUNT: pick the first candidate region with enough
# quota for COUNT x SKU via the azure provider's helper.
#   - No qualifying region: log an error and exit 1.
#   - Standalone (no --provider) = query-only: print the region and exit 0.
#   - Combined with --provider: export TF_REGION and continue the deploy.
if [[ -n "${FIND_GPU_REGION_SKU:-}" ]]; then
    source "$SCRIPT_DIR/azure/terraform.sh"
    region=$(azure_find_region_with_gpu_quota "$FIND_GPU_REGION_SKU" "$FIND_GPU_REGION_COUNT" "$(az account show --query id -o tsv 2>/dev/null)")
    if [[ -z "$region" ]]; then
        log_error "No candidate region had quota for $FIND_GPU_REGION_COUNT x $FIND_GPU_REGION_SKU"
        log_error "Override TF_REGION_CANDIDATES (space-separated) to expand the search."
        exit 1
    fi
    # ${PROVIDER:-} keeps this branch safe if PROVIDER is unset under `set -u`.
    if [[ -z "${PROVIDER:-}" ]]; then
        printf '%s\n' "$region"
        exit 0
    fi
    log_info "Auto-picked region for GPU pool: $region"
    export TF_REGION="$region"
fi

###############################################################################
Expand Down Expand Up @@ -552,6 +559,8 @@ preflight_checks() {
case "$PROVIDER" in
azure)
azure_preflight_checks
# SKU/quota preflight runs later in the TF-apply branch, after
# handle_configuration resolves the user's actual config.
;;
Comment thread
coderabbitai[bot] marked this conversation as resolved.
aws)
aws_preflight_checks
Expand Down Expand Up @@ -924,6 +933,9 @@ main() {
azure|aws)
if [[ "$SKIP_TERRAFORM" == false ]]; then
handle_configuration
if [[ "$PROVIDER" == "azure" ]]; then
azure_preflight_sku_quota
fi
run_terraform_init
run_terraform_apply
fi
Expand Down
Loading
Loading