diff --git a/Makefile b/Makefile index a5b26eb..b7b2101 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ .PHONY: help init plan apply destroy cluster-setup deploy-all deploy-infra deploy-services test clean -.PHONY: deploy-k8s deploy-k8s-prod k8s-status +.PHONY: k8s-status k8s-start k8s-stop k8s-restart .PHONY: build start stop status logs TERRAFORM_DIR := infrastructure/terraform @@ -24,22 +24,20 @@ help: @echo "Cluster Setup:" @echo " cluster-setup Install k3s on all nodes" @echo "" - @echo "K8s Deployment:" - @echo " deploy-k8s Deploy K8s with custom Relay + VultiServer" - @echo " deploy-k8s-prod Deploy K8s using api.vultisig.com endpoints" - @echo " deploy-all Deploy everything (legacy)" - @echo " deploy-infra Deploy infrastructure services only" - @echo " deploy-services Deploy application services only" - @echo " deploy-monitoring Deploy Prometheus and Grafana" + @echo "K8s Deployment (uses production Relay/VultiServer at api.vultisig.com):" + @echo " k8s-start Deploy + verify all services (RECOMMENDED)" + @echo " k8s-stop Graceful shutdown" + @echo " k8s-restart Stop then start" + @echo " deploy-secrets Deploy secrets only" @echo "" @echo "Testing:" @echo " test-smoke Run smoke tests" - @echo " test-partition Show partition test options" @echo "" @echo "Utilities:" @echo " logs-verifier Tail verifier logs" @echo " logs-worker Tail worker logs" - @echo " logs-relay Tail relay logs" + @echo " logs-dca-worker Tail DCA worker logs" + @echo " k8s-status Show cluster status" @echo " port-forward Port forward services for local access" @echo " clean Remove generated files" @@ -87,10 +85,6 @@ deploy-infra: deploy-namespaces deploy-secrets kubectl -n infra wait --for=condition=ready pod -l app=minio --timeout=120s @echo "Infrastructure ready" -deploy-relay: - kubectl apply -f k8s/base/relay/ - kubectl -n relay wait --for=condition=ready pod -l app=relay --timeout=120s - deploy-verifier: kubectl apply -f k8s/base/verifier/ kubectl -n verifier wait 
--for=condition=ready pod -l app=verifier --timeout=300s @@ -99,58 +93,24 @@ deploy-dca: kubectl apply -f k8s/base/dca/ kubectl -n plugin-dca wait --for=condition=ready pod -l app=dca --timeout=300s -deploy-vultiserver: - kubectl apply -f k8s/base/vultiserver/ - kubectl -n vultiserver wait --for=condition=ready pod -l app=vultiserver --timeout=120s - deploy-monitoring: kubectl apply -f k8s/base/monitoring/prometheus/ kubectl apply -f k8s/base/monitoring/grafana/ kubectl -n monitoring wait --for=condition=ready pod -l app=prometheus --timeout=120s kubectl -n monitoring wait --for=condition=ready pod -l app=grafana --timeout=120s -deploy-services: deploy-relay deploy-verifier deploy-dca deploy-vultiserver deploy-monitoring +deploy-services: deploy-verifier deploy-dca deploy-monitoring deploy-all: deploy-infra deploy-services -# Kustomize-based K8s deployment -deploy-k8s: deploy-secrets - @echo "Deploying K8s with custom Relay + VultiServer..." - kubectl apply -k k8s/overlays/local - @echo "" - @echo "Waiting for pods..." - kubectl -n infra wait --for=condition=ready pod -l app=postgres --timeout=300s - kubectl -n infra wait --for=condition=ready pod -l app=redis --timeout=120s - kubectl -n infra wait --for=condition=ready pod -l app=minio --timeout=120s - kubectl -n relay wait --for=condition=ready pod -l app=relay --timeout=120s - kubectl -n vultiserver wait --for=condition=ready pod -l app=vultiserver --timeout=120s - kubectl -n verifier wait --for=condition=ready pod -l app=verifier --timeout=300s - kubectl -n plugin-dca wait --for=condition=ready pod -l app=dca --timeout=300s - @echo "" - @echo "=========================================" - @echo " K8s Deployment Complete!" 
- @echo " Relay: relay.relay.svc.cluster.local" - @echo " VultiServer: vultiserver.vultiserver.svc.cluster.local" - @echo "=========================================" - kubectl get pods --all-namespaces - -deploy-k8s-prod: deploy-secrets - @echo "Deploying K8s with production endpoints (api.vultisig.com)..." - kubectl apply -k k8s/overlays/production - @echo "" - @echo "Waiting for pods..." - kubectl -n infra wait --for=condition=ready pod -l app=postgres --timeout=300s - kubectl -n infra wait --for=condition=ready pod -l app=redis --timeout=120s - kubectl -n infra wait --for=condition=ready pod -l app=minio --timeout=120s - kubectl -n verifier wait --for=condition=ready pod -l app=verifier --timeout=300s - kubectl -n plugin-dca wait --for=condition=ready pod -l app=dca --timeout=300s - @echo "" - @echo "=========================================" - @echo " K8s Production Deployment Complete!" - @echo " Relay: https://api.vultisig.com/router" - @echo " VultiServer: https://api.vultisig.com" - @echo "=========================================" - kubectl get pods --all-namespaces +# K8s deploy/start/stop scripts +k8s-start: deploy-secrets + @./infrastructure/scripts/k8s-start.sh + +k8s-stop: + @./infrastructure/scripts/k8s-stop.sh + +k8s-restart: k8s-stop k8s-start # ============== Testing ============== @@ -177,9 +137,6 @@ logs-verifier: logs-worker: kubectl -n verifier logs -l app=verifier,component=worker -f -logs-relay: - kubectl -n relay logs -l app=relay -f - logs-dca-worker: kubectl -n plugin-dca logs -l app=dca,component=worker -f diff --git a/README.md b/README.md index 62de4bd..19c9502 100644 --- a/README.md +++ b/README.md @@ -592,3 +592,17 @@ rm ~/.vultisig/lib/linux/.downloaded-master # Linux # Then run any vcli command to trigger download ./local/vcli.sh --help ``` + +--- + +## Kubernetes Deployment + +For production Kubernetes deployment on Hetzner Cloud, see **[infrastructure/DEPLOYMENT.md](infrastructure/DEPLOYMENT.md)**. 
+ +The K8s deployment guide covers: +- Terraform-based infrastructure provisioning +- K3s cluster setup +- Service deployment with kustomize overlays +- E2E testing in Kubernetes +- Server type and region selection (AMD64 required for GHCR images) +- Troubleshooting guide with common errors and fixes diff --git a/docker/app-recurring-local.Dockerfile b/docker/app-recurring-local.Dockerfile new file mode 100644 index 0000000..7e5c788 --- /dev/null +++ b/docker/app-recurring-local.Dockerfile @@ -0,0 +1,53 @@ +# Build app-recurring services with local verifier dependency +# Must be built from parent directory containing both app-recurring and verifier: +# docker build -f vcli/docker/app-recurring-local.Dockerfile --build-arg BINARY=tx_indexer -t app-recurring-txindexer:local . + +# Stage 1: Download go-wrappers (cached layer - rarely changes) +FROM golang:1.25-bookworm AS dkls-setup + +RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/* + +RUN wget -q https://github.com/vultisig/go-wrappers/archive/refs/heads/master.tar.gz && \ + tar -xzf master.tar.gz && \ + mkdir -p /usr/local/lib/dkls && \ + cp -r go-wrappers-master/includes /usr/local/lib/dkls/ && \ + rm -rf master.tar.gz go-wrappers-master + +# Stage 2: Build (with local verifier dependency) +FROM golang:1.25-bookworm AS builder + +RUN apt-get update && apt-get install -y clang && rm -rf /var/lib/apt/lists/* + +COPY --from=dkls-setup /usr/local/lib/dkls /usr/local/lib/dkls + +ARG BINARY=server + +WORKDIR /build + +# Copy both repositories +COPY verifier ./verifier +COPY app-recurring ./app-recurring + +WORKDIR /build/app-recurring + +ENV CGO_ENABLED=1 +ENV CC=clang +ENV LD_LIBRARY_PATH=/usr/local/lib/dkls/includes/linux/ + +RUN go build -o /app/${BINARY} ./cmd/${BINARY} + +# Stage 3: Runtime (minimal image) +FROM debian:bookworm-slim + +RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /usr/local/lib/dkls/includes/linux/*.so 
/usr/local/lib/ +ARG BINARY=server +COPY --from=builder /app/${BINARY} /app/${BINARY} + +RUN ldconfig && ln -sf /app/${BINARY} /app/main + +WORKDIR /app +EXPOSE 8080 8088 + +CMD ["/app/main"] diff --git a/docker/app-recurring.Dockerfile b/docker/app-recurring.Dockerfile new file mode 100644 index 0000000..0a7e595 --- /dev/null +++ b/docker/app-recurring.Dockerfile @@ -0,0 +1,51 @@ +# Stage 1: Download go-wrappers (cached layer - rarely changes) +FROM golang:1.25-bookworm AS dkls-setup + +RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/* + +# This layer is cached unless go-wrappers repo changes +RUN wget -q https://github.com/vultisig/go-wrappers/archive/refs/heads/master.tar.gz && \ + tar -xzf master.tar.gz && \ + mkdir -p /usr/local/lib/dkls && \ + cp -r go-wrappers-master/includes /usr/local/lib/dkls/ && \ + rm -rf master.tar.gz go-wrappers-master + +# Stage 2: Download Go dependencies (cached layer - changes when go.mod changes) +FROM golang:1.25-bookworm AS deps + +RUN apt-get update && apt-get install -y clang && rm -rf /var/lib/apt/lists/* + +# Copy pre-downloaded go-wrappers from cache stage +COPY --from=dkls-setup /usr/local/lib/dkls /usr/local/lib/dkls + +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download + +# Stage 3: Build (rebuilds only when source changes) +FROM deps AS builder + +ARG BINARY=server + +COPY . . 
+ +ENV CGO_ENABLED=1 +ENV CC=clang +ENV LD_LIBRARY_PATH=/usr/local/lib/dkls/includes/linux/ + +RUN go build -o /app/main ./cmd/${BINARY} + +# Stage 4: Runtime (minimal image) +FROM debian:bookworm-slim + +RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /usr/local/lib/dkls/includes/linux/*.so /usr/local/lib/ +COPY --from=builder /app/main /app/main + +RUN ldconfig + +WORKDIR /app +EXPOSE 8080 8088 + +CMD ["/app/main"] diff --git a/docker/feeplugin.Dockerfile b/docker/feeplugin.Dockerfile new file mode 100644 index 0000000..0a7e595 --- /dev/null +++ b/docker/feeplugin.Dockerfile @@ -0,0 +1,51 @@ +# Stage 1: Download go-wrappers (cached layer - rarely changes) +FROM golang:1.25-bookworm AS dkls-setup + +RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/* + +# This layer is cached unless go-wrappers repo changes +RUN wget -q https://github.com/vultisig/go-wrappers/archive/refs/heads/master.tar.gz && \ + tar -xzf master.tar.gz && \ + mkdir -p /usr/local/lib/dkls && \ + cp -r go-wrappers-master/includes /usr/local/lib/dkls/ && \ + rm -rf master.tar.gz go-wrappers-master + +# Stage 2: Download Go dependencies (cached layer - changes when go.mod changes) +FROM golang:1.25-bookworm AS deps + +RUN apt-get update && apt-get install -y clang && rm -rf /var/lib/apt/lists/* + +# Copy pre-downloaded go-wrappers from cache stage +COPY --from=dkls-setup /usr/local/lib/dkls /usr/local/lib/dkls + +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download + +# Stage 3: Build (rebuilds only when source changes) +FROM deps AS builder + +ARG BINARY=server + +COPY . . 
+ +ENV CGO_ENABLED=1 +ENV CC=clang +ENV LD_LIBRARY_PATH=/usr/local/lib/dkls/includes/linux/ + +RUN go build -o /app/main ./cmd/${BINARY} + +# Stage 4: Runtime (minimal image) +FROM debian:bookworm-slim + +RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /usr/local/lib/dkls/includes/linux/*.so /usr/local/lib/ +COPY --from=builder /app/main /app/main + +RUN ldconfig + +WORKDIR /app +EXPOSE 8080 8088 + +CMD ["/app/main"] diff --git a/docker/verifier.Dockerfile b/docker/verifier.Dockerfile new file mode 100644 index 0000000..2f43713 --- /dev/null +++ b/docker/verifier.Dockerfile @@ -0,0 +1,51 @@ +# Stage 1: Download go-wrappers (cached layer - rarely changes) +FROM golang:1.25-bookworm AS dkls-setup + +RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/* + +# This layer is cached unless go-wrappers repo changes +RUN wget -q https://github.com/vultisig/go-wrappers/archive/refs/heads/master.tar.gz && \ + tar -xzf master.tar.gz && \ + mkdir -p /usr/local/lib/dkls && \ + cp -r go-wrappers-master/includes /usr/local/lib/dkls/ && \ + rm -rf master.tar.gz go-wrappers-master + +# Stage 2: Download Go dependencies (cached layer - changes when go.mod changes) +FROM golang:1.25-bookworm AS deps + +RUN apt-get update && apt-get install -y clang && rm -rf /var/lib/apt/lists/* + +# Copy pre-downloaded go-wrappers from cache stage +COPY --from=dkls-setup /usr/local/lib/dkls /usr/local/lib/dkls + +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download + +# Stage 3: Build (rebuilds only when source changes) +FROM deps AS builder + +ARG BINARY=verifier + +COPY . . 
+ +ENV CGO_ENABLED=1 +ENV CC=clang +ENV LD_LIBRARY_PATH=/usr/local/lib/dkls/includes/linux/ + +RUN go build -o /app/main ./cmd/${BINARY} + +# Stage 4: Runtime (minimal image) +FROM debian:bookworm-slim + +RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /usr/local/lib/dkls/includes/linux/*.so /usr/local/lib/ +COPY --from=builder /app/main /app/main + +RUN ldconfig + +WORKDIR /app +EXPOSE 8080 8088 + +CMD ["/app/main"] diff --git a/infrastructure/DEPLOYMENT.md b/infrastructure/DEPLOYMENT.md new file mode 100644 index 0000000..13a0783 --- /dev/null +++ b/infrastructure/DEPLOYMENT.md @@ -0,0 +1,833 @@ +# Vultisig Kubernetes Deployment Guide + +Production-ready Kubernetes deployment for Vultisig services on Hetzner Cloud. + +--- + +## Prerequisites + +- `hcloud` CLI installed and configured +- SSH key registered in Hetzner Cloud +- `.env.k8s` file with secrets (see [Secrets Configuration](#secrets-configuration)) +- `kubectl` installed locally + +--- + +## Quick Start (Full Deployment) + +```bash +cd /path/to/vcli  # your local vcli checkout + +# 1. Check server type availability before deploying +./infrastructure/scripts/check-availability.sh + +# 2. Create infrastructure with Terraform +cd infrastructure/terraform +cp terraform.tfvars.example terraform.tfvars +# Edit terraform.tfvars with your Hetzner API token +terraform init && terraform apply + +# 3. Setup K3s on nodes +cd ../.. +source setup-env.sh +./infrastructure/scripts/setup-cluster.sh + +# 4. 
Deploy and start services +export KUBECONFIG=$(pwd)/.kube/config +./infrastructure/scripts/k8s-start.sh +``` + +### E2E Test Commands + +```bash +# Import vault +kubectl exec -n verifier vcli -- vcli vault import --file /vault/vault.vult --password "Password123" + +# Install DCA plugin (4-party TSS reshare) +kubectl exec -n verifier vcli -- vcli plugin install dca --password "Password123" + +# Generate swap policy +kubectl exec -n verifier vcli -- vcli policy generate --from usdc --to btc --amount 10 --output /tmp/policy.json + +# Add policy +kubectl exec -n verifier vcli -- vcli policy add --plugin dca --policy-file /tmp/policy.json --password "Password123" + +# Monitor +kubectl exec -n verifier vcli -- vcli policy list --plugin dca +kubectl -n plugin-dca logs -f deploy/worker +``` + +--- + +## Hetzner Cloud Reference + +### Locations + +| Code | Name | Notes | +|------|------|-------| +| `sin` | Singapore | Asia-Pacific | +| `fsn1` | Falkenstein | Germany (EU) | +| `nbg1` | Nuremberg | Germany (EU) | +| `hel1` | Helsinki | Finland (EU) | +| `ash` | Ashburn | US East | +| `hil` | Hillsboro | US West | + +**Common mistake:** The location code is `sin` not `sin1`. Some locations have datacenter suffixes in the output (e.g., `sin-dc1`) but the location code is just `sin`. + +### Server Types by Location + +**IMPORTANT: GHCR images are AMD64 only.** ARM servers (cax*) will fail with "exec format error". 
+ +| Type Family | Architecture | Notes | +|-------------|--------------|-------| +| `cax*` | ARM64 | **NOT compatible with GHCR images** | +| `cpx*` | AMD64 (shared) | Often out of stock in EU regions | +| `ccx*` | AMD64 (dedicated) | **Recommended** - available everywhere | +| `cx*` | Intel (shared) | Check availability | + +**Current working configuration:** +``` +Master: ccx13 (2 dedicated vCPU, 8GB RAM) - ~€13/mo +Worker: ccx23 (4 dedicated vCPU, 16GB RAM) - ~€25/mo +Region: hel1 (Helsinki) +``` + +**Check availability before deployment:** +```bash +# Use the availability script +./infrastructure/scripts/check-availability.sh + +# Or manually +hcloud server-type describe ccx23 -o json | jq '.prices[] | "\(.location): \(.price_monthly.gross)"' +``` + +**If cpx* is out of stock:** +Switch to dedicated ccx* servers (slightly more expensive but always available). + +### SSH Key Requirement + +Servers MUST be created with an SSH key for the setup scripts to work. + +```bash +# List available SSH keys +hcloud ssh-key list + +# Create servers with SSH key (REQUIRED) +hcloud server create \ + --type cpx32 \ + --image ubuntu-22.04 \ + --name vultisig-master-1 \ + --location sin \ + --ssh-key dev-key # <-- REQUIRED +``` + +**If you forget the SSH key:** Delete and recreate the server. There's no way to add SSH keys after creation. + +--- + +## Phase 1: Infrastructure Setup (Terraform) + +The recommended approach uses Terraform for reproducible infrastructure. + +### Prerequisites + +1. **Hetzner API Token**: Get from Hetzner Cloud Console → Security → API Tokens +2. **Vault keyshare file**: Export from Vultisig mobile app to `local/keyshares/` +3. 
**hcloud CLI**: `brew install hcloud` (macOS) or download from Hetzner + +### Check Availability + +Before deploying, check server type availability in your target region: + +```bash +./infrastructure/scripts/check-availability.sh +``` + +This shows: +- Specs for master (ccx13) and worker (ccx23) server types +- Pricing and availability per region +- Which regions have capacity + +### Deploy with Terraform + +```bash +cd infrastructure/terraform + +# Create config file +cp terraform.tfvars.example terraform.tfvars + +# Edit with your Hetzner API token +echo 'hcloud_token = "your-token-here"' > terraform.tfvars + +# Optional: customize server types or region +# Edit variables.tf to change defaults, or override: +# terraform apply -var="worker_server_type=ccx33" -var="regions=[\"nbg1\"]" + +# Initialize and apply +terraform init +terraform apply +``` + +**What Terraform creates:** +- Master node (ccx13) with K3s control plane +- Worker node (ccx23) for workloads +- Persistent volumes: PostgreSQL (50GB), Redis (10GB), MinIO (50GB) +- Private network for cluster communication +- Firewall rules for SSH, K8s API, and NodePorts +- SSH keys (generated if not provided) +- `setup-env.sh` script with all connection details + +### Generated Files + +After `terraform apply`, these files are created in the vcli root: +- `setup-env.sh` - Environment variables for cluster setup +- `.ssh/id_ed25519` - SSH private key (if generated) +- `.ssh/id_ed25519.pub` - SSH public key + +### Manual Server Creation (Alternative) + +If you prefer manual control: + +```bash +export HCLOUD_TOKEN="your-token" + +# Create servers (use ccx* for AMD64 compatibility) +hcloud server create --type ccx13 --image ubuntu-24.04 --name vultisig-master --location hel1 --ssh-key your-key +hcloud server create --type ccx23 --image ubuntu-24.04 --name vultisig-worker --location hel1 --ssh-key your-key + +# Create volumes +hcloud volume create --name vultisig-postgres --size 50 --location hel1 --format ext4 
+hcloud volume create --name vultisig-redis --size 10 --location hel1 --format ext4 +hcloud volume create --name vultisig-minio --size 50 --location hel1 --format ext4 + +# Attach volumes to worker +hcloud volume attach vultisig-postgres --server vultisig-worker --automount +hcloud volume attach vultisig-redis --server vultisig-worker --automount +hcloud volume attach vultisig-minio --server vultisig-worker --automount + +# Create setup-env.sh manually +cat > setup-env.sh << EOF +export MASTER_IP="$(hcloud server ip vultisig-master)" +export MASTER_PRIVATE_IP="10.1.0.10" +export K3S_TOKEN="$(openssl rand -hex 32)" +export WORKER_HEL1_IP="$(hcloud server ip vultisig-worker)" +export SSH_KEY_PATH="./.ssh/id_ed25519" +EOF +``` + +--- + +## Phase 2: K3s Cluster Setup + +```bash +# Run the cluster setup script +./infrastructure/scripts/setup-cluster.sh + +# Set kubeconfig +export KUBECONFIG=$(pwd)/.kube/config + +# Verify cluster +kubectl get nodes -o wide +``` + +**Expected output:** +``` +NAME STATUS ROLES AGE VERSION +vultisig-master-1 Ready control-plane,master 2m v1.28.x +vultisig-worker-1 Ready 1m v1.28.x +vultisig-worker-2 Ready 1m v1.28.x +vultisig-worker-3 Ready 1m v1.28.x +``` + +--- + +## Phase 3: Deploy Services + +### Deploy Secrets + +```bash +make deploy-secrets +``` + +This creates secrets for: +- PostgreSQL credentials +- Redis credentials +- MinIO credentials +- Encryption keys +- RPC endpoints +- Test vault file + +### Deploy Services (GHCR Images) + +```bash +# Deploy using production overlay (pulls from GHCR) +make k8s-start + +# Watch pods start +kubectl get pods -A -w +``` + +--- + +## GHCR Images + +Pre-published images on GitHub Container Registry: + +| Service | Image | Version | +|---------|-------|---------| +| Verifier | `ghcr.io/vultisig/verifier/verifier` | v0.1.16 | +| Verifier Worker | `ghcr.io/vultisig/verifier/worker` | v0.1.16 | +| Verifier TX Indexer | `ghcr.io/vultisig/verifier/tx_indexer` | v0.1.16 | +| DCA Server | 
`ghcr.io/vultisig/app-recurring/server` | v1.0.84 | +| DCA Scheduler | `ghcr.io/vultisig/app-recurring/scheduler` | v1.0.82 | +| DCA Worker | `ghcr.io/vultisig/app-recurring/worker` | v1.0.82 | +| DCA TX Indexer | `ghcr.io/vultisig/app-recurring/tx_indexer` | v1.0.82 | +| VCLI | `ghcr.io/vultisig/vcli` | v1.0.3 | + +**Important version notes:** +- DCA Server v1.0.84 includes TaskQueueName fix (routes tasks to `dca_plugin_queue`) +- VCLI v1.0.3 includes billing fetch fix for policy generation + +Images are configured in `k8s/overlays/production/kustomization.yaml`. + +--- + +## Phase 4: E2E Testing + +### Import Vault + +```bash +kubectl exec -n verifier vcli -- vcli vault import \ + --file /vault/vault.vult \ + --password "Password123" + +# Verify import +kubectl exec -n verifier vcli -- vcli vault details +``` + +### Install DCA Plugin + +```bash +kubectl exec -n verifier vcli -- vcli plugin install dca \ + --password "Password123" +``` + +This performs a 4-party TSS reshare: +1. CLI (vcli in cluster) +2. Fast Vault Server (production) +3. Verifier Worker (local cluster) +4. 
DCA Plugin Worker (local cluster) + +### Create Policy (10 USDC → BTC) + +```bash +# Generate policy (fetches pricing from verifier automatically) +kubectl exec -n verifier vcli -- vcli policy generate \ + --from usdc \ + --to btc \ + --amount 10 \ + --output /tmp/policy.json + +# Add policy (signs with TSS keysign) +kubectl exec -n verifier vcli -- vcli policy add \ + --plugin dca \ + --policy-file /tmp/policy.json \ + --password "Password123" + +# List policies (should show active policy) +kubectl exec -n verifier vcli -- vcli policy list --plugin dca +``` + +### E2E Validation Checklist + +#### Plugin Install +- [ ] Output shows "4 parties joined" +- [ ] Party names are distinct (e.g., `vcli-xxx`, `Server-xxx`, `verifier-xxx`, `dca-worker-xxx`) +- [ ] `Verifier (MinIO): ✓ 458.0KB` +- [ ] `DCA Plugin (MinIO): ✓ 458.0KB` + +#### Policy Add +- [ ] Output shows "POLICY ADDED SUCCESSFULLY" +- [ ] Policy ID returned +- [ ] No billing count mismatch errors + +#### Swap Execution (Optional) +- [ ] `vcli policy status ` shows "Active: true" +- [ ] Worker logs show "swap route found" +- [ ] Worker logs show "tx signed & broadcasted" with txHash + +--- + +## Success Criteria + +- [ ] All 4 nodes in Ready state +- [ ] All pods in Running state +- [ ] Vault imported successfully +- [ ] DCA plugin installed (4-party TSS reshare completed) +- [ ] Policy created with `tx_hash` (0x...) 
+ +--- + +## Secrets Configuration + +Create `.env.k8s` with: + +```bash +# Hetzner Cloud +export HCLOUD_TOKEN="your-hcloud-api-token" + +# PostgreSQL +export POSTGRES_DSN="postgres://user:pass@host:5432/db" + +# Redis +export REDIS_URI="redis://:password@host:6379" + +# MinIO +export MINIO_HOST="http://host:9000" +export MINIO_ACCESS_KEY="access-key" +export MINIO_SECRET_KEY="secret-key" + +# Encryption +export ENCRYPTION_SECRET="32-byte-hex-secret" + +# Relay +export RELAY_URL="https://relay.vultisig.com" + +# RPC Endpoints +export RPC_ETHEREUM_URL="https://eth-mainnet.g.alchemy.com/v2/key" +export RPC_ARBITRUM_URL="https://arb-mainnet.g.alchemy.com/v2/key" +# ... other chains + +# Vault (base64 encoded) +export TEST_VAULT_BASE64="base64-encoded-vault-file" +``` + +--- + +## Troubleshooting + +### SSH Connection Failed + +**Symptom:** `setup-cluster.sh` fails with "Permission denied" or "Connection refused" + +**Cause:** Servers created without SSH key + +**Fix:** +```bash +# Check if servers have SSH key +hcloud server describe vultisig-master-1 | grep ssh_key + +# If empty, delete and recreate with --ssh-key flag +hcloud server delete vultisig-master-1 +hcloud server create --type cpx32 --image ubuntu-22.04 --name vultisig-master-1 --location sin --ssh-key dev-key +``` + +### Server Type Not Available + +**Symptom:** `resource "cpx31" is not available in location "sin"` + +**Fix:** Use `cpx32` for Singapore: +```bash +hcloud server create --type cpx32 --image ubuntu-22.04 --name vultisig-master-1 --location sin --ssh-key dev-key +``` + +### Pods Stuck in ImagePullBackOff + +**Symptom:** Pods can't pull images from GHCR + +**Fix:** GHCR images are public, but verify the image path: +```bash +kubectl describe pod -n +# Check the image URL in the error message +``` + +### TSS Timeout + +**Symptom:** Plugin install or policy add times out + +**Cause:** TSS operations should complete within 30 seconds + +**Fix:** +1. 
Check all pods are running: `kubectl get pods -A` +2. Check worker logs: `kubectl logs -n verifier deploy/worker` +3. Retry the operation (do NOT extend timeout) + +### Nodes NotReady + +**Symptom:** `kubectl get nodes` shows NotReady + +**Fix:** +```bash +# Check node status +kubectl describe node + +# Check k3s on the node +ssh root@ "systemctl status k3s" +ssh root@ "journalctl -u k3s -f" +``` + +--- + +## Architecture + +**Default deployment (single worker):** +``` +┌───────────────────────────────────────────────────────────────┐ +│ Hetzner Cloud (Helsinki - hel1) │ +├───────────────────────────────────────────────────────────────┤ +│ vultisig-master (ccx13 - 2 vCPU, 8GB) │ +│ └── K3s control plane │ +│ │ +│ vultisig-worker-hel1 (ccx23 - 4 vCPU, 16GB) │ +│ ├── infra: postgres, redis, minio (with Hetzner volumes) │ +│ ├── verifier: API + worker + tx-indexer + vcli │ +│ └── plugin-dca: server + scheduler + worker + tx-indexer │ +│ │ +│ Persistent Volumes (attached to worker): │ +│ ├── vultisig-postgres (50GB) │ +│ ├── vultisig-redis (10GB) │ +│ └── vultisig-minio (50GB) │ +└───────────────────────────────────────────────────────────────┘ +``` + +**Multi-worker deployment (optional):** +``` +# To deploy multiple workers, edit variables.tf: +variable "regions" { + default = ["hel1", "fsn1", "nbg1"] # Multiple regions +} +``` + +--- + +## Namespace Layout + +| Namespace | Services | +|-----------|----------| +| `infra` | PostgreSQL, Redis, MinIO | +| `verifier` | Verifier API, Worker, TX Indexer, VCLI | +| `plugin-dca` | DCA Server, Scheduler, Worker, TX Indexer | +| `relay` | Relay Server | + +--- + +## K8s vs Local Development Differences + +| Component | Local (Docker) | K8s | +|-----------|----------------|-----| +| Services | `go run` processes via `run-services.sh` | GHCR container images | +| Config | Environment variables in `run-services.sh` | ConfigMaps + Secrets | +| Queue Name | `TASK_QUEUE_NAME` in env | Same, set in deployment manifests | +| Images | 
N/A (native Go binary) | `ghcr.io/vultisig/*` | +| Relay | `api.vultisig.com` | Same (patched in production overlay) | +| MinIO | Local Docker container | K8s StatefulSet in `infra` namespace | +| vcli | Native binary | `ghcr.io/vultisig/vcli:v1.0.3` | + +--- + +## Startup Scripts + +### k8s-start.sh (Recommended) + +The `k8s-start.sh` script handles full deployment with verification: + +```bash +./infrastructure/scripts/k8s-start.sh # Deploy and verify +./infrastructure/scripts/k8s-start.sh --skip-seed # Skip database seeding +``` + +**External services (production endpoints):** +- Relay: `https://api.vultisig.com/router` +- VultiServer/FastVault: `https://api.vultisig.com` + +**What it deploys:** +1. Applies kustomize overlay (creates namespaces, deploys pods) +2. Applies secrets +3. Recreates jobs (minio-init, seed-plugins) +4. Waits for infrastructure (PostgreSQL, Redis, MinIO) +5. Waits for application services (Verifier, DCA) +6. Flushes Redis for clean state +7. Runs comprehensive verification: + - MinIO buckets exist + - Database seeded with DCA plugin + - Redis responding + - All pods healthy (Running, 0 restarts) + - Service HTTP endpoints responding + - Worker queue configuration correct + +### k8s-stop.sh + +Graceful shutdown with cleanup: + +```bash +./infrastructure/scripts/k8s-stop.sh +``` + +--- + +## Makefile Commands + +```bash +make deploy-secrets # Deploy secrets to cluster +make k8s-start # Deploy + verify (recommended) +make k8s-stop # Graceful shutdown +make delete-k8s # Delete all Kubernetes resources +``` + +--- + +## Teardown + +### Stop Services (Keep Infrastructure) + +```bash +./infrastructure/scripts/k8s-stop.sh +``` + +This stops K8s services but keeps the servers and volumes for redeployment. 
+ +### Full Teardown (Destroy Infrastructure) + +```bash +cd infrastructure/terraform +terraform destroy +``` + +This removes: +- All Hetzner servers (master + workers) +- All Hetzner volumes (postgres, redis, minio data is DELETED) +- Network resources (VPC, subnets) +- Firewall rules +- SSH keys (if generated by Terraform) + +**Manual teardown (if Terraform state is lost):** + +```bash +export HCLOUD_TOKEN="your-token" + +# Delete servers +hcloud server delete vultisig-master +hcloud server delete vultisig-worker-hel1 + +# Delete volumes (DATA WILL BE LOST) +hcloud volume delete vultisig-postgres +hcloud volume delete vultisig-redis +hcloud volume delete vultisig-minio + +# Delete network +hcloud network delete vultisig-network + +# Delete firewall +hcloud firewall delete vultisig-firewall + +# Delete SSH key (if generated) +hcloud ssh-key delete vultisig-cluster-key +``` + +--- + +## Version History + +| Date | Version | Changes | +|------|---------|---------| +| 2026-01-22 | 1.1 | Switched to Terraform, AMD64 servers (ccx13/ccx23), added availability check script | +| 2026-01-21 | 1.0 | Initial deployment to Singapore | + +--- + +## Lessons Learned + +### Infrastructure & Hetzner + +1. **Always use `--ssh-key`** when creating Hetzner servers +2. **Check server type availability** per location (`cpx32` for Singapore) +3. **Location codes don't have suffixes** (`sin` not `sin1`) + +### Kubernetes & K3s + +4. **K8s 1.34+ rejects `node-role.kubernetes.io/*` labels** - use `node.kubernetes.io/role=*` instead +5. **Use kustomize overlays** for environment-specific image tags +6. **GHCR images are public** - no authentication needed + +### Image Configuration + +7. **GHCR images use `/app/main` binary path** - local images use component-specific paths (`/app/scheduler`, `/usr/local/bin/verifier`). Production kustomization patches commands. + +### Service Configuration + +8. 
**RPC ConfigMaps need all chains** - DCA worker requires: zksync, cronos, cosmos, tron, dash, zcash (see `k8s/base/dca/configmaps.yaml`) +9. **VCLI DCA plugin URL** must point to `server-swap.plugin-dca.svc.cluster.local:8082` (not `dca-server`) +10. **VCLI verifier URL must be LOCAL** - vcli.yaml must use `http://verifier.verifier.svc.cluster.local:8080`, NOT production verifier. This is configured via `vcli-config` ConfigMap in verifier namespace. +11. **All workers must use the SAME relay** - verifier worker and DCA worker must both use production relay (`https://api.vultisig.com/router`) for TSS coordination. Check with: `kubectl exec -n verifier deploy/worker -- env | grep RELAY` + +### TSS Operations + +12. **TSS timeouts are 30 seconds max** - don't extend, retry instead +13. **4-party TSS reshare requires all parties on same relay** - vcli, Fast Vault Server, verifier worker, and DCA plugin worker must all communicate through the same relay +14. **Restart workers after configmap changes** - ConfigMap changes require pod restart: `kubectl rollout restart deployment/worker -n verifier` + +### Database & Plugin Configuration + +15. **Plugin pricing required for policy creation** - Each plugin needs pricing entries in the `pricings` table. Without them, policy creation fails with "billing policies count does not match plugin pricing count". See `k8s/base/verifier/seed-plugins.yaml` for seeding. +16. **billing.amount must be uint64** - The verifier expects `billing.amount` as a number, not a string. vcli `policy generate` now fetches pricing from verifier and uses correct types. + +### MinIO Bucket Configuration + +17. **Keyshares stored in correct buckets** - Verifier stores in `vultisig-verifier`, DCA plugin stores in `vultisig-dca`. If keyshare is in wrong bucket, policy verification fails with "Invalid policy signature" + +### Queue & Task Routing + +18. **DCA Queue Routing** - DCA services use `TASK_QUEUE_NAME=dca_plugin_queue`. 
This ensures DCA worker receives reshare tasks and saves keyshares to `vultisig-dca` bucket (not `vultisig-verifier`). K8s manifests have this correctly configured in `k8s/base/dca/server.yaml`, `worker.yaml`, and `scheduler.yaml`. + +19. **Policy Billing Fetch** - When generating policies, vcli fetches pricing from verifier. If fetch fails, manually add billing entries: + ```json + "billing": [ + {"type": "once", "amount": 0, "asset": "usdc"}, + {"type": "per-tx", "amount": 0, "asset": "usdc"} + ] + ``` + vcli v1.0.3+ handles this automatically. + +20. **4-Party Reshare Validation** - After plugin install, verify: + - 4 distinct parties: CLI + Fast Vault + Verifier Worker + DCA Worker + - Both MinIO buckets have keyshares: `Verifier (MinIO): ✓` AND `DCA Plugin (MinIO): ✓` + - Party names should be distinct (e.g., `vcli-xxx`, `Server-xxx`, `verifier-xxx`, `dca-worker-xxx`) + +### Relay Configuration (Critical) + +21. **Relay URL Must Match Across All Parties** - All TSS parties (vcli, verifier worker, DCA worker) must connect to the **same relay server**. The production overlay configures all workers to use `https://api.vultisig.com/router`. + - **Deploy command**: `./infrastructure/scripts/k8s-start.sh` (uses production overlay) + - If workers use different relays, reshare will hang at 2 parties forever + +### Database Seeding + +22. **Pricing Table Has No Unique Constraint** - The `pricings` table lacks a unique constraint on `(type, plugin_id, frequency)`. The `ON CONFLICT DO NOTHING` clause is ineffective without a constraint. Running the seed script multiple times creates duplicates. + - **Fix**: Seed SQL now uses `DELETE + INSERT` pattern to prevent duplicates + - **If duplicates exist**: Clean up with `DELETE FROM pricings WHERE plugin_id = 'vultisig-dca-0000' AND id NOT IN (SELECT id FROM pricings WHERE plugin_id = 'vultisig-dca-0000' ORDER BY created_at ASC LIMIT 2);` + +### Node Sizing & Deployment + +23. 
**Node Sizing** - Worker nodes should be ccx23 (4 dedicated vCPU, 16GB) minimum for all services to run without CPU throttling. + +24. **Test Vault Secret** - `k8s-start.sh` auto-creates `test-vault` secret from `local/keyshares/FastPlugin1-a06a-share2of2.vult`. Ensure this file exists before running the script. + +25. **Rolling Restart After Infra** - After infrastructure restart, application pods need rolling restart to pick up fresh service IPs (e.g., postgres moving nodes). Automated in `k8s-start.sh` as STEP 4.5. + +### Architecture Compatibility + +26. **GHCR Images are AMD64 Only** - All GHCR images (`ghcr.io/vultisig/*`) are built for AMD64. ARM64 servers (cax*) will fail with "exec format error" or "no match for platform in manifest". + - **DO NOT use:** cax11, cax21, cax31, cax41 (ARM64) + - **Use instead:** ccx13, ccx23, ccx33 (AMD64 dedicated) or cpx11, cpx21, cpx31 (AMD64 shared) + +27. **Server Availability Varies by Region** - cpx* (shared AMD64) servers are often out of stock in EU regions (fsn1, nbg1, hel1). Use `./infrastructure/scripts/check-availability.sh` to verify before deployment. + - **Fallback:** ccx* dedicated servers are always available (slightly higher cost) + +28. **Terraform Manages Infrastructure** - Use `infrastructure/terraform/` for reproducible deployments: + ```bash + terraform apply # Create infrastructure + terraform destroy # Tear down all resources + terraform apply -var="regions=[\"nbg1\"]" # Override region + ``` + +--- + +## Common Errors & Fixes + +### TSS: Only 2 parties joining (expected 4) + +**Symptom:** Plugin install shows "Waiting for more parties... parties=2" forever + +**Root Cause:** All TSS parties must use the SAME relay server to coordinate sessions. 
+ +**Diagnosis:** +```bash +# Check relay URLs for all workers +kubectl exec -n verifier deploy/worker -- env | grep RELAY +kubectl exec -n plugin-dca deploy/worker -- env | grep RELAY + +# Both should show: RELAY_URL=https://api.vultisig.com/router +``` + +**Fix:** +```bash +# Redeploy with k8s-start.sh +./infrastructure/scripts/k8s-start.sh + +# Verify relay configmaps are patched +kubectl get cm relay-config -n verifier -o yaml | grep url +kubectl get cm relay-config -n plugin-dca -o yaml | grep url +# Both should show: url: "https://api.vultisig.com/router" + +# Restart workers to pick up new config +kubectl rollout restart deployment/worker -n verifier +kubectl rollout restart deployment/worker -n plugin-dca +``` + +### Policy creation fails with 400: "billing.amount expected uint64" + +**Symptom:** `vcli policy add` fails with type error + +**Cause:** Policy JSON has `"amount": "0"` (string) instead of `"amount": 0` (number) + +**Fix:** Use latest vcli which generates correct types, or manually edit policy JSON + +### Policy creation fails with 500: "billing count mismatch" + +**Symptom:** `vcli policy add` fails with "billing policies count (N) does not match plugin pricing count (M)" + +**Causes:** +1. Plugin has no pricing entries (M=0) +2. 
Plugin has duplicate pricing entries (M>2, e.g., 6 instead of 2) + +**Diagnosis:** +```bash +# Check how many pricing entries exist +kubectl exec -n infra deploy/postgres -- psql -U postgres -d vultisig-verifier -c " +SELECT type, COUNT(*) FROM pricings WHERE plugin_id = 'vultisig-dca-0000' GROUP BY type; +" +# Expected: once=1, per-tx=1 (total 2) +``` + +**Fix (if duplicates exist):** +```bash +# Clean up duplicates, keeping only oldest entries +kubectl exec -n infra deploy/postgres -- psql -U postgres -d vultisig-verifier -c " +DELETE FROM pricings +WHERE plugin_id = 'vultisig-dca-0000' + AND id NOT IN ( + SELECT id FROM pricings + WHERE plugin_id = 'vultisig-dca-0000' + ORDER BY created_at ASC + LIMIT 2 + ); +" +``` + +**Fix (if no entries exist):** +```bash +# Add pricing entries to database +kubectl exec -n infra deploy/postgres -- psql -U postgres -d vultisig-verifier -c " +INSERT INTO pricings (type, frequency, amount, asset, metric, plugin_id, created_at, updated_at) +VALUES + ('once', NULL, 0, 'usdc', 'fixed', 'vultisig-dca-0000', NOW(), NOW()), + ('per-tx', NULL, 0, 'usdc', 'fixed', 'vultisig-dca-0000', NOW(), NOW()); +" +``` + +### Policy creation fails with 403: "Invalid policy signature" + +**Symptom:** `vcli policy add` fails after TSS keysign succeeds + +**Cause:** Plugin keyshare not found in correct MinIO bucket + +**Fix:** +```bash +# Check where keyshare is stored +kubectl exec -n infra minio-0 -- mc ls local/vultisig-verifier/ +kubectl exec -n infra minio-0 -- mc ls local/vultisig-dca/ + +# If keyshare is in wrong bucket, copy it: +kubectl exec -n infra minio-0 -- mc cp \ + local/vultisig-verifier/vultisig-dca-0000-.vult \ + local/vultisig-dca/vultisig-dca-0000-.vult +``` diff --git a/infrastructure/deploy-notes.md b/infrastructure/deploy-notes.md new file mode 100644 index 0000000..df9a8b9 --- /dev/null +++ b/infrastructure/deploy-notes.md @@ -0,0 +1,65 @@ +# Deployment Notes - 2026-01-22 + +Tracking issues encountered during one-shot E2E 
deployment. + +## Prerequisites Provided +- [x] Hetzner API key (terraform.tfvars) +- [ ] Vault keyshare (local/keyshares/FastPlugin1-a06a-share2of2.vult) +- [ ] Vault password + +## Deployment Steps + +### Step 1: Terraform Apply +Status: COMPLETE +- Master: 49.13.58.177 (cax11, fsn1) +- Worker: 167.235.246.209 (cax31, fsn1) +- Volumes: postgres, redis, minio attached + +### Step 2: K3s Cluster Setup +Status: COMPLETE (manual worker install needed due to script issue) +- Master: vultisig-master (49.13.58.177) Ready +- Worker: vultisig-worker-fsn1 (167.235.246.209) Ready +- CSI Driver: Installed + +### Step 3: K8s Services Deployment +Status: COMPLETE +- All pods running successfully +- Infrastructure: postgres, redis, minio +- Verifier: verifier, worker, tx-indexer, vcli +- Plugin-DCA: server-swap, server-send, worker, scheduler, tx-indexer + +### Step 4: E2E Test +Status: PASSED ✅ +- vault import: Success (7.4s) +- plugin install: Success (31s) - 4-party TSS reshare +- policy generate: Success +- policy add: Success (9.1s) - Policy ID: e0db3699-e574-4e36-8395-4a523e4d307f + +--- + +## Issues Encountered + +### Issue 1: SSH key path wrong in setup-env.sh +- **Problem**: Terraform generated setup-env.sh with relative path `./../../.ssh/id_ed25519` which doesn't work from vcli root +- **Fix**: Changed main.tf to use `./.ssh/id_ed25519` (relative to vcli root where setup-env.sh lives) + +### Issue 2: setup-cluster.sh hardcodes 3 workers +- **Problem**: Script loops over fsn1/nbg1/hel1 but terraform only creates 1 worker in fsn1 +- **Fix TODO**: Script should dynamically get worker list from setup-env.sh instead of hardcoding regions + +### Issue 3: GHCR images are AMD64 only - ARM nodes won't work +- **Problem**: cax* servers are ARM64, but GHCR images only have AMD64 builds +- **Error**: "exec format error" and "no match for platform in manifest: not found" +- **Fix**: Must use cpx* (AMD64) servers instead of cax* (ARM64) +- **Cost Impact**: cpx31 (€15.59/mo) vs 
cax31 (€14.99/mo) - similar cost + +### Issue 4: cpx* servers unavailable in all EU regions +- **Problem**: cpx* servers are out of capacity in fsn1, nbg1, and hel1 +- **Fix**: Switched to dedicated AMD ccx* servers (ccx13 for master, ccx23 for worker) in hel1 (Helsinki) +- **Cost**: ccx13 ~€13/mo, ccx23 ~€25/mo (higher than cpx but available) + +--- + +## TODO for Later + +(to be filled as issues arise) diff --git a/infrastructure/scripts/check-availability.sh b/infrastructure/scripts/check-availability.sh new file mode 100755 index 0000000..d03947f --- /dev/null +++ b/infrastructure/scripts/check-availability.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# Check Hetzner server type availability by region before deployment +set -euo pipefail + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +MASTER_TYPE="${MASTER_TYPE:-ccx13}" +WORKER_TYPE="${WORKER_TYPE:-ccx23}" + +echo "Checking Hetzner server type availability..." +echo "" + +check_type() { + local type=$1 + local role=$2 + + echo "=== $role: $type ===" + + # Get JSON output for reliable parsing + local json + json=$(hcloud server-type describe "$type" -o json 2>/dev/null) || { + echo -e " ${RED}ERROR: Server type '$type' not found${NC}" + return 1 + } + + local arch cores memory + arch=$(echo "$json" | jq -r '.architecture') + cores=$(echo "$json" | jq -r '.cores') + memory=$(echo "$json" | jq -r '.memory') + + echo " Specs: ${cores} vCPU, ${memory}GB RAM, ${arch}" + echo "" + + # Check each location from prices array + echo "$json" | jq -r '.prices[] | "\(.location) \(.price_monthly.gross) \(.deprecation.unavailable_after // "available")"' | while read -r location price status; do + if [[ "$status" == "available" ]]; then + printf " %-6s ${GREEN}Available${NC} (€%s/mo)\n" "$location:" "$price" + elif [[ "$status" == "null" ]]; then + printf " %-6s ${GREEN}Available${NC} (€%s/mo)\n" "$location:" "$price" + else + # Check if deprecation date has passed + local unavail_date="${status%%T*}" + local 
now_date + now_date=$(date +%Y-%m-%d) + if [[ "$unavail_date" < "$now_date" ]]; then + printf " %-6s ${RED}Unavailable${NC} (deprecated)\n" "$location:" + else + printf " %-6s ${YELLOW}Deprecating${NC} (until %s)\n" "$location:" "$unavail_date" + fi + fi + done + echo "" +} + +check_type "$MASTER_TYPE" "Master" +check_type "$WORKER_TYPE" "Worker" + +# Optional: Test actual stock by attempting server creation (will fail fast if out of stock) +if [[ "${TEST_STOCK:-false}" == "true" ]]; then + echo "=== Stock Test (attempts create, cancels immediately) ===" + echo "" + echo " Note: This creates real API requests. Failures are expected for out-of-stock." + echo "" + for region in fsn1 nbg1 hel1 ash hil; do + printf " %-6s $WORKER_TYPE... " "$region:" + # Try to create - will fail immediately if out of stock + result=$(hcloud server create --name "stock-check-$$" --type "$WORKER_TYPE" --image ubuntu-24.04 --location "$region" 2>&1) || true + if echo "$result" | grep -qi "unavailable"; then + echo -e "${RED}OUT OF STOCK${NC}" + elif echo "$result" | grep -q "Server [0-9]"; then + # Server was created - delete it immediately + server_id=$(echo "$result" | grep -o "Server [0-9]*" | awk '{print $2}') + hcloud server delete "$server_id" --poll-interval 1s >/dev/null 2>&1 || true + echo -e "${GREEN}IN STOCK${NC} (test server deleted)" + else + echo -e "${YELLOW}ERROR: $result${NC}" + fi + done + echo "" +fi + +echo "=== Quick Reference ===" +echo "" +echo "AMD64 server types (for GHCR images):" +echo " cpx* - Shared AMD (deprecated in EU, available in US)" +echo " ccx* - Dedicated AMD (available everywhere)" +echo " cx* - Shared Intel (check availability)" +echo "" +echo "ARM64 server types (need custom images):" +echo " cax* - ARM (available everywhere)" +echo "" diff --git a/infrastructure/scripts/k3s-install-master.sh b/infrastructure/scripts/k3s-install-master.sh index 3312bc1..27dc321 100755 --- a/infrastructure/scripts/k3s-install-master.sh +++ 
b/infrastructure/scripts/k3s-install-master.sh @@ -20,7 +20,7 @@ apt-get install -y curl wget open-iscsi nfs-common curl -sfL https://get.k3s.io | sh -s - server \ --token "$K3S_TOKEN" \ --node-label "topology.kubernetes.io/region=fsn1" \ - --node-label "node-role.kubernetes.io/master=true" \ + --node-label "node.kubernetes.io/role=master" \ --flannel-backend=wireguard-native \ --disable traefik \ --disable servicelb \ diff --git a/infrastructure/scripts/k3s-install-worker.sh b/infrastructure/scripts/k3s-install-worker.sh index 668965f..55a47cb 100755 --- a/infrastructure/scripts/k3s-install-worker.sh +++ b/infrastructure/scripts/k3s-install-worker.sh @@ -26,7 +26,7 @@ apt-get install -y curl wget open-iscsi nfs-common # Install k3s agent curl -sfL https://get.k3s.io | K3S_URL="https://${MASTER_URL}:6443" K3S_TOKEN="$K3S_TOKEN" sh -s - agent \ --node-label "topology.kubernetes.io/region=$REGION" \ - --node-label "node-role.kubernetes.io/worker=true" + --node-label "node.kubernetes.io/role=worker" echo "" echo "=== Worker node joined cluster ===" diff --git a/infrastructure/scripts/k8s-start.sh b/infrastructure/scripts/k8s-start.sh new file mode 100755 index 0000000..a60f6ce --- /dev/null +++ b/infrastructure/scripts/k8s-start.sh @@ -0,0 +1,438 @@ +#!/bin/bash +# +# K8s Start Script - Deploy and start Vultisig services on K8s +# +# Deploys: +# - Infrastructure: PostgreSQL, Redis, MinIO +# - Verifier: API + Worker (from GHCR images) +# - DCA Plugin: Server + Worker + Scheduler (from GHCR images) +# - VCLI pod for testing +# +# Uses production endpoints for: +# - Relay: https://api.vultisig.com/router +# - VultiServer/FastVault: https://api.vultisig.com +# +# Usage: +# ./k8s-start.sh # Full deploy with verification +# ./k8s-start.sh --skip-seed # Skip database seeding +# +# Prerequisites: +# - kubectl configured with cluster access +# - k8s/secrets.yaml must exist with valid secrets + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 
+VCLI_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# Parse flags +SKIP_SEED=false +for arg in "$@"; do + case $arg in + --skip-seed) + SKIP_SEED=true + ;; + esac +done + +# Find kubeconfig +if [[ -z "$KUBECONFIG" ]]; then + if [[ -f "$VCLI_DIR/.kube/config" ]]; then + export KUBECONFIG="$VCLI_DIR/.kube/config" + elif [[ -f "$HOME/.kube/config" ]]; then + export KUBECONFIG="$HOME/.kube/config" + fi +fi + +echo -e "${CYAN}==========================================${NC}" +echo -e "${CYAN} Vultisig K8s Deploy${NC}" +echo -e "${CYAN}==========================================${NC}" +echo "" + +# Verify cluster connection +if ! kubectl cluster-info &>/dev/null; then + echo -e "${RED}ERROR: Cannot connect to cluster${NC}" + echo "Set KUBECONFIG or ensure kubectl is configured" + exit 1 +fi + +echo -e "${YELLOW}Connected to cluster:${NC} $(kubectl config current-context)" +echo "" + +# Check secrets exist +if ! kubectl get -f "$VCLI_DIR/k8s/secrets.yaml" &>/dev/null 2>&1; then + if [[ ! 
-f "$VCLI_DIR/k8s/secrets.yaml" ]]; then + echo -e "${RED}ERROR: k8s/secrets.yaml not found${NC}" + echo "Copy secrets-template.yaml and fill in values:" + echo " cp k8s/secrets-template.yaml k8s/secrets.yaml" + exit 1 + fi +fi + +# ============================================ +# STEP 1: Apply kustomize overlay (creates namespaces) +# ============================================ + +echo -e "${CYAN}Applying production overlay...${NC}" +kubectl apply -k "$VCLI_DIR/k8s/overlays/production" 2>&1 | grep -v "^#" || true +echo -e " ${GREEN}✓${NC} Manifests applied" + +# ============================================ +# STEP 2: Apply secrets (namespaces now exist) +# ============================================ + +echo -e "${CYAN}Applying secrets...${NC}" +kubectl apply -f "$VCLI_DIR/k8s/secrets.yaml" +echo -e " ${GREEN}✓${NC} Secrets applied" + +# ============================================ +# STEP 2.5: Create test-vault secret (for E2E testing) +# ============================================ + +echo -e "${CYAN}Creating test-vault secret...${NC}" + +KEYSHARE_FILE="$VCLI_DIR/local/keyshares/FastPlugin1-a06a-share2of2.vult" +if [[ -f "$KEYSHARE_FILE" ]]; then + kubectl -n verifier delete secret test-vault --ignore-not-found 2>/dev/null || true + kubectl -n verifier create secret generic test-vault --from-file=vault.vult="$KEYSHARE_FILE" + echo -e " ${GREEN}✓${NC} test-vault secret created" +else + echo -e " ${YELLOW}⚠${NC} Keyshare not found: $KEYSHARE_FILE" + echo -e " ${YELLOW}⚠${NC} Create manually: kubectl -n verifier create secret generic test-vault --from-file=vault.vult=" +fi + +# ============================================ +# STEP 3: Delete existing jobs (they're immutable) and recreate +# ============================================ + +echo -e "${CYAN}Recreating jobs...${NC}" +kubectl -n infra delete job minio-init --ignore-not-found 2>/dev/null || true +kubectl -n verifier delete job seed-plugins --ignore-not-found 2>/dev/null || true +sleep 2 +# Reapply to 
recreate jobs +kubectl apply -k "$VCLI_DIR/k8s/overlays/production" 2>&1 | grep -v "^#" | grep -v "unchanged" || true +echo -e " ${GREEN}✓${NC} Jobs recreated" + +# ============================================ +# STEP 4: Wait for infrastructure +# ============================================ + +echo -e "${CYAN}Waiting for infrastructure...${NC}" + +echo -e " ${YELLOW}⏳${NC} PostgreSQL..." +kubectl -n infra wait --for=condition=ready pod -l app=postgres --timeout=300s +echo -e " ${GREEN}✓${NC} PostgreSQL ready" + +echo -e " ${YELLOW}⏳${NC} Redis..." +kubectl -n infra wait --for=condition=ready pod -l app=redis --timeout=120s +echo -e " ${GREEN}✓${NC} Redis ready" + +echo -e " ${YELLOW}⏳${NC} MinIO..." +kubectl -n infra wait --for=condition=ready pod -l app=minio --timeout=120s +echo -e " ${GREEN}✓${NC} MinIO ready" + +# ============================================ +# STEP 4.5: Rolling restart to pick up fresh infra IPs +# ============================================ + +echo -e "${CYAN}Rolling restart of application deployments...${NC}" + +kubectl -n verifier rollout restart deployment/verifier deployment/worker 2>/dev/null || true +kubectl -n plugin-dca rollout restart deployment/server-swap deployment/server-send deployment/worker deployment/scheduler 2>/dev/null || true + +echo -e " ${GREEN}✓${NC} Rolling restart initiated" + +# ============================================ +# STEP 5: Wait for application services +# ============================================ + +echo -e "${CYAN}Waiting for application services...${NC}" + +echo -e " ${YELLOW}⏳${NC} Verifier..." +kubectl -n verifier wait --for=condition=ready pod -l app=verifier,component=api --timeout=300s +echo -e " ${GREEN}✓${NC} Verifier API ready" + +kubectl -n verifier wait --for=condition=ready pod -l app=verifier,component=worker --timeout=120s +echo -e " ${GREEN}✓${NC} Verifier Worker ready" + +echo -e " ${YELLOW}⏳${NC} DCA Plugin..." 
+kubectl -n plugin-dca wait --for=condition=ready pod -l app=dca,component=server-swap --timeout=120s 2>/dev/null || \ + kubectl -n plugin-dca wait --for=condition=ready pod -l app=dca --timeout=120s 2>/dev/null || true +echo -e " ${GREEN}✓${NC} DCA services ready" + +# ============================================ +# STEP 6: Flush Redis (clean start) +# ============================================ + +echo -e "${CYAN}Flushing Redis for clean start...${NC}" + +REDIS_PASSWORD=$(kubectl -n infra get secret redis -o jsonpath='{.data.password}' 2>/dev/null | base64 -d) || REDIS_PASSWORD="" + +if [[ -n "$REDIS_PASSWORD" ]]; then + # Redis is a StatefulSet, not Deployment + kubectl -n infra exec redis-0 -- redis-cli -a "$REDIS_PASSWORD" FLUSHALL 2>/dev/null && \ + echo -e " ${GREEN}✓${NC} Redis flushed" || \ + echo -e " ${YELLOW}⚠${NC} Redis flush failed" +fi + +# ============================================ +# STEP 6: Run seed job +# ============================================ + +if ! $SKIP_SEED; then + echo -e "${CYAN}Seeding database...${NC}" + + # Delete old seed job if exists + kubectl -n verifier delete job seed-plugins --ignore-not-found 2>/dev/null || true + sleep 2 + + # Apply seed-plugins manifest + if [[ -f "$VCLI_DIR/k8s/base/verifier/seed-plugins.yaml" ]]; then + kubectl apply -f "$VCLI_DIR/k8s/base/verifier/seed-plugins.yaml" + + # Wait for seed job to complete + echo -e " ${YELLOW}⏳${NC} Waiting for seed job..." 
+ for i in {1..60}; do + STATUS=$(kubectl -n verifier get job seed-plugins -o jsonpath='{.status.succeeded}' 2>/dev/null || echo "") + if [[ "$STATUS" == "1" ]]; then + echo -e " ${GREEN}✓${NC} Database seeded" + break + fi + FAILED=$(kubectl -n verifier get job seed-plugins -o jsonpath='{.status.failed}' 2>/dev/null || echo "") + if [[ "$FAILED" -ge "3" ]]; then + echo -e " ${RED}✗${NC} Seed job failed" + kubectl -n verifier logs job/seed-plugins --tail=20 + break + fi + sleep 2 + done + else + echo -e " ${YELLOW}⚠${NC} seed-plugins.yaml not found, skipping" + fi +else + echo -e "${YELLOW}Skipping database seeding (--skip-seed)${NC}" +fi + +# ============================================ +# STEP 7: Comprehensive Verification +# ============================================ + +echo -e "${CYAN}==========================================${NC}" +echo -e "${CYAN} COMPREHENSIVE VERIFICATION${NC}" +echo -e "${CYAN}==========================================${NC}" +echo "" + +VERIFICATION_FAILED=false + +# --- 7.1 MinIO Buckets --- +echo -e "${CYAN}[1/6] Verifying MinIO buckets...${NC}" + +# Wait for minio-init job to complete +for i in {1..30}; do + MINIO_INIT_STATUS=$(kubectl -n infra get job minio-init -o jsonpath='{.status.succeeded}' 2>/dev/null || echo "") + if [[ "$MINIO_INIT_STATUS" == "1" ]]; then + break + fi + sleep 2 +done + +# Check buckets exist using mc or direct API +MINIO_POD=$(kubectl -n infra get pods -l app=minio -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) +if [[ -n "$MINIO_POD" ]]; then + BUCKETS=$(kubectl -n infra exec "$MINIO_POD" -- ls /data/ 2>/dev/null || echo "") + + if echo "$BUCKETS" | grep -q "vultisig-verifier"; then + echo -e " ${GREEN}✓${NC} vultisig-verifier bucket exists" + else + echo -e " ${RED}✗${NC} vultisig-verifier bucket MISSING" + VERIFICATION_FAILED=true + fi + + if echo "$BUCKETS" | grep -q "vultisig-dca"; then + echo -e " ${GREEN}✓${NC} vultisig-dca bucket exists" + else + echo -e " ${RED}✗${NC} vultisig-dca bucket 
MISSING" + VERIFICATION_FAILED=true + fi +else + echo -e " ${RED}✗${NC} MinIO pod not found" + VERIFICATION_FAILED=true +fi + +# --- 7.2 Database Connectivity & Seeding --- +echo -e "${CYAN}[2/6] Verifying database...${NC}" + +# Check if DCA plugin is seeded +PLUGIN_COUNT=$(kubectl -n infra exec postgres-0 -- psql -U postgres -d vultisig_verifier -t -c "SELECT COUNT(*) FROM plugins WHERE id = 'vultisig-dca-0000';" 2>/dev/null | tr -d ' ' || echo "0") +if [[ "$PLUGIN_COUNT" == "1" ]]; then + echo -e " ${GREEN}✓${NC} DCA plugin seeded (vultisig-dca-0000)" +else + echo -e " ${RED}✗${NC} DCA plugin NOT seeded (count: $PLUGIN_COUNT)" + VERIFICATION_FAILED=true +fi + +# Check verifier tables exist +TABLES=$(kubectl -n infra exec postgres-0 -- psql -U postgres -d vultisig_verifier -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" 2>/dev/null | tr -d ' ' || echo "0") +if [[ "$TABLES" -gt "5" ]]; then + echo -e " ${GREEN}✓${NC} Verifier database tables exist ($TABLES tables)" +else + echo -e " ${RED}✗${NC} Verifier database tables missing ($TABLES tables)" + VERIFICATION_FAILED=true +fi + +# --- 7.3 Redis Connectivity --- +echo -e "${CYAN}[3/6] Verifying Redis...${NC}" + +REDIS_PASSWORD=$(kubectl -n infra get secret redis -o jsonpath='{.data.password}' 2>/dev/null | base64 -d) || REDIS_PASSWORD="" +REDIS_PING=$(kubectl -n infra exec redis-0 -- redis-cli -a "$REDIS_PASSWORD" PING 2>/dev/null || echo "") +if [[ "$REDIS_PING" == "PONG" ]]; then + echo -e " ${GREEN}✓${NC} Redis responding" +else + echo -e " ${RED}✗${NC} Redis not responding" + VERIFICATION_FAILED=true +fi + +# Check Redis is empty (no stale sessions) +REDIS_KEYS=$(kubectl -n infra exec redis-0 -- redis-cli -a "$REDIS_PASSWORD" DBSIZE 2>/dev/null | grep -o '[0-9]*' || echo "0") +if [[ "$REDIS_KEYS" == "0" ]]; then + echo -e " ${GREEN}✓${NC} Redis is clean (0 keys)" +else + echo -e " ${YELLOW}⚠${NC} Redis has $REDIS_KEYS keys (may have stale sessions)" +fi + +# --- 7.4 Pod 
Health (Running + No Restarts) --- +echo -e "${CYAN}[4/6] Verifying pod health...${NC}" + +check_pod_health() { + local namespace=$1 + local label=$2 + local name=$3 + + POD_INFO=$(kubectl -n "$namespace" get pods -l "$label" -o jsonpath='{.items[0].status.phase}:{.items[0].status.containerStatuses[0].restartCount}' 2>/dev/null || echo ":") + POD_PHASE=$(echo "$POD_INFO" | cut -d: -f1) + RESTART_COUNT=$(echo "$POD_INFO" | cut -d: -f2) + + if [[ "$POD_PHASE" == "Running" ]] && [[ "$RESTART_COUNT" == "0" ]]; then + echo -e " ${GREEN}✓${NC} $name: Running (0 restarts)" + return 0 + elif [[ "$POD_PHASE" == "Running" ]]; then + echo -e " ${YELLOW}⚠${NC} $name: Running ($RESTART_COUNT restarts)" + return 0 + else + echo -e " ${RED}✗${NC} $name: $POD_PHASE" + return 1 + fi +} + +check_pod_health "verifier" "app=verifier,component=api" "Verifier API" || VERIFICATION_FAILED=true +check_pod_health "verifier" "app=verifier,component=worker" "Verifier Worker" || VERIFICATION_FAILED=true +check_pod_health "plugin-dca" "app=dca,component=server-swap" "DCA Server (swap)" || VERIFICATION_FAILED=true +check_pod_health "plugin-dca" "app=dca,component=worker" "DCA Worker" || VERIFICATION_FAILED=true +check_pod_health "plugin-dca" "app=dca,component=scheduler" "DCA Scheduler" || VERIFICATION_FAILED=true + +# --- 7.5 Service Health Endpoints --- +echo -e "${CYAN}[5/6] Verifying service endpoints...${NC}" + +# Verifier API +if kubectl -n verifier exec deploy/verifier -- wget -qO- --timeout=5 http://localhost:8080/ &>/dev/null; then + echo -e " ${GREEN}✓${NC} Verifier API HTTP responding" +else + echo -e " ${RED}✗${NC} Verifier API HTTP not responding" + VERIFICATION_FAILED=true +fi + +# DCA Server (swap) +if kubectl -n plugin-dca exec deploy/server-swap -- wget -qO- --timeout=5 http://localhost:8082/ &>/dev/null; then + echo -e " ${GREEN}✓${NC} DCA Server HTTP responding" +else + echo -e " ${RED}✗${NC} DCA Server HTTP not responding" + VERIFICATION_FAILED=true +fi + +# --- 7.6 Worker 
Queue Configuration --- +echo -e "${CYAN}[6/6] Verifying worker queue configuration...${NC}" + +# Check DCA worker logs for queue connection +DCA_WORKER_LOGS=$(kubectl -n plugin-dca logs deploy/worker --tail=50 2>/dev/null || echo "") +if echo "$DCA_WORKER_LOGS" | grep -q "dca_plugin_queue\|starting worker"; then + echo -e " ${GREEN}✓${NC} DCA Worker started (should use dca_plugin_queue)" +else + echo -e " ${YELLOW}⚠${NC} DCA Worker queue config not confirmed in logs" +fi + +# Check Verifier worker logs +VERIFIER_WORKER_LOGS=$(kubectl -n verifier logs deploy/worker --tail=50 2>/dev/null || echo "") +if echo "$VERIFIER_WORKER_LOGS" | grep -q "starting worker\|asynq"; then + echo -e " ${GREEN}✓${NC} Verifier Worker started (uses default queue)" +else + echo -e " ${YELLOW}⚠${NC} Verifier Worker queue config not confirmed in logs" +fi + +# ============================================ +# VERIFICATION RESULT +# ============================================ + +echo "" +if $VERIFICATION_FAILED; then + echo -e "${RED}==========================================${NC}" + echo -e "${RED} VERIFICATION FAILED${NC}" + echo -e "${RED}==========================================${NC}" + echo "" + echo -e " ${RED}Some checks failed. 
Review above and fix before proceeding.${NC}" + echo "" + echo -e " ${CYAN}Debug commands:${NC}" + echo -e " kubectl -n infra logs job/minio-init" + echo -e " kubectl -n verifier logs job/seed-plugins" + echo -e " kubectl -n plugin-dca logs deploy/worker" + echo "" + exit 1 +fi + +# ============================================ +# SUMMARY +# ============================================ + +echo -e "${GREEN}==========================================${NC}" +echo -e "${GREEN} ALL VERIFICATIONS PASSED${NC}" +echo -e "${GREEN}==========================================${NC}" +echo "" +echo -e "${CYAN}==========================================${NC}" +echo -e "${GREEN} STARTUP COMPLETE${NC}" +echo -e "${CYAN}==========================================${NC}" +echo "" +echo -e " ${CYAN}Relay:${NC} https://api.vultisig.com/router (production)" +echo -e " ${CYAN}VultiServer:${NC} https://api.vultisig.com (production)" +echo "" +echo -e " ${CYAN}Services:${NC}" +echo -e " verifier/verifier (API)" +echo -e " verifier/worker (TSS worker)" +echo -e " plugin-dca/server (DCA API)" +echo -e " plugin-dca/worker (DCA TSS worker)" +echo -e " plugin-dca/scheduler (Job scheduler)" +echo "" +echo -e " ${CYAN}Infrastructure:${NC}" +echo -e " infra/postgres" +echo -e " infra/redis" +echo -e " infra/minio" +echo "" + +# Show pod status +echo -e " ${CYAN}Pod Status:${NC}" +kubectl get pods -n verifier --no-headers 2>/dev/null | awk '{print " verifier/" $1 ": " $3}' +kubectl get pods -n plugin-dca --no-headers 2>/dev/null | awk '{print " plugin-dca/" $1 ": " $3}' +echo "" + +echo -e " ${CYAN}Logs:${NC}" +echo -e " kubectl -n verifier logs -f deploy/verifier" +echo -e " kubectl -n verifier logs -f deploy/worker" +echo -e " kubectl -n plugin-dca logs -f deploy/worker" +echo "" +echo -e " ${CYAN}Stop:${NC} ./infrastructure/scripts/k8s-stop.sh" +echo "" diff --git a/infrastructure/scripts/k8s-stop.sh b/infrastructure/scripts/k8s-stop.sh new file mode 100755 index 0000000..f877df1 --- /dev/null +++ 
b/infrastructure/scripts/k8s-stop.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# +# K8s Stop Script - Full shutdown and reset of Vultisig services +# Mirrors local `make stop` behavior for K8s environment +# +# Usage: ./k8s-stop.sh +# +# This script: +# 1. Scales down all deployments +# 2. Deletes all jobs +# 3. Flushes Redis +# 4. Deletes PVCs (full data reset) +# 5. Recreates infrastructure + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +VCLI_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# Find kubeconfig +if [[ -z "$KUBECONFIG" ]]; then + if [[ -f "$VCLI_DIR/.kube/config" ]]; then + export KUBECONFIG="$VCLI_DIR/.kube/config" + elif [[ -f "$HOME/.kube/config" ]]; then + export KUBECONFIG="$HOME/.kube/config" + fi +fi + +echo -e "${CYAN}==========================================${NC}" +echo -e "${CYAN} Vultisig K8s Full Stop${NC}" +echo -e "${CYAN}==========================================${NC}" +echo "" + +# Verify cluster connection +if ! 
kubectl cluster-info &>/dev/null; then + echo -e "${RED}ERROR: Cannot connect to cluster${NC}" + echo "Set KUBECONFIG or ensure kubectl is configured" + exit 1 +fi + +echo -e "${YELLOW}Connected to cluster:${NC} $(kubectl config current-context)" +echo "" + +# ============================================ +# STEP 1: Scale down all deployments +# ============================================ + +echo -e "${CYAN}Scaling down deployments...${NC}" + +# Verifier namespace +for deploy in verifier worker tx-indexer; do + if kubectl -n verifier get deployment $deploy &>/dev/null; then + kubectl -n verifier scale deployment $deploy --replicas=0 + echo -e " ${GREEN}✓${NC} verifier/$deploy → 0" + fi +done + +# Plugin-DCA namespace +for deploy in server-swap server-send worker scheduler tx-indexer; do + if kubectl -n plugin-dca get deployment $deploy &>/dev/null; then + kubectl -n plugin-dca scale deployment $deploy --replicas=0 + echo -e " ${GREEN}✓${NC} plugin-dca/$deploy → 0" + fi +done + +# ============================================ +# STEP 2: Delete jobs +# ============================================ + +echo -e "${CYAN}Deleting jobs...${NC}" +kubectl -n infra delete job --all --ignore-not-found 2>/dev/null || true +kubectl -n verifier delete job --all --ignore-not-found 2>/dev/null || true +kubectl -n plugin-dca delete job --all --ignore-not-found 2>/dev/null || true +echo -e " ${GREEN}✓${NC} Jobs deleted" + +# Wait for pods to terminate +echo -e "${YELLOW}Waiting for pods to terminate...${NC}" +sleep 5 + +# ============================================ +# STEP 3: Flush Redis +# ============================================ + +echo -e "${CYAN}Flushing Redis...${NC}" +REDIS_PASSWORD=$(kubectl -n infra get secret redis -o jsonpath='{.data.password}' 2>/dev/null | base64 -d) || REDIS_PASSWORD="" + +if [[ -n "$REDIS_PASSWORD" ]]; then + kubectl -n infra exec redis-0 -- redis-cli -a "$REDIS_PASSWORD" FLUSHALL 2>/dev/null && \ + echo -e " ${GREEN}✓${NC} Redis flushed" || \ + echo 
-e " ${YELLOW}⚠${NC} Redis flush skipped" +fi + +# ============================================ +# STEP 4: Delete PVCs (full data reset) +# ============================================ + +echo -e "${CYAN}Deleting PVCs...${NC}" + +# Scale down statefulsets first +for ns in infra verifier plugin-dca; do + kubectl -n $ns scale statefulset --all --replicas=0 2>/dev/null || true +done +sleep 3 + +# Delete PVCs +for ns in infra verifier plugin-dca; do + kubectl -n $ns delete pvc --all --ignore-not-found 2>/dev/null || true + echo -e " ${GREEN}✓${NC} PVCs deleted in $ns" +done + +# ============================================ +# STEP 5: Recreate infrastructure +# ============================================ + +echo -e "${CYAN}Recreating infrastructure...${NC}" + +# Re-apply base infrastructure +kubectl apply -f "$VCLI_DIR/k8s/base/infra/" 2>/dev/null || true + +echo -e "${YELLOW}Waiting for infrastructure...${NC}" +kubectl -n infra wait --for=condition=ready pod -l app=postgres --timeout=180s 2>/dev/null || true +kubectl -n infra wait --for=condition=ready pod -l app=redis --timeout=60s 2>/dev/null || true +kubectl -n infra wait --for=condition=ready pod -l app=minio --timeout=60s 2>/dev/null || true +echo -e " ${GREEN}✓${NC} Infrastructure ready" + +# ============================================ +# SUMMARY +# ============================================ + +echo "" +echo -e "${CYAN}==========================================${NC}" +echo -e "${GREEN} STOP COMPLETE${NC}" +echo -e "${CYAN}==========================================${NC}" +echo "" +echo -e " ${GREEN}✓${NC} All deployments scaled to 0" +echo -e " ${GREEN}✓${NC} All jobs deleted" +echo -e " ${GREEN}✓${NC} Redis flushed" +echo -e " ${GREEN}✓${NC} PVCs deleted and recreated" +echo -e " ${GREEN}✓${NC} Databases empty (will be seeded on start)" +echo "" +echo -e " ${CYAN}To start:${NC} ./infrastructure/scripts/k8s-start.sh" +echo "" diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf 
index fa444fa..d70f440 100644 --- a/infrastructure/terraform/main.tf +++ b/infrastructure/terraform/main.tf @@ -57,17 +57,17 @@ resource "hcloud_network_subnet" "helsinki" { ip_range = "10.2.0.0/16" } -# Master node in Falkenstein +# Master node in Helsinki (fsn1 out of cpx capacity) resource "hcloud_server" "master" { name = "${var.cluster_name}-master" server_type = var.master_server_type image = "ubuntu-24.04" - location = "fsn1" + location = "hel1" ssh_keys = [hcloud_ssh_key.cluster.id] labels = { role = "master" - region = "fsn1" + region = "hel1" cluster = var.cluster_name } @@ -117,39 +117,39 @@ resource "hcloud_server" "workers" { resource "hcloud_volume" "postgres" { name = "${var.cluster_name}-postgres" size = 50 - location = "fsn1" + location = "hel1" format = "ext4" } resource "hcloud_volume" "redis" { name = "${var.cluster_name}-redis" size = 10 - location = "fsn1" + location = "hel1" format = "ext4" } resource "hcloud_volume" "minio" { name = "${var.cluster_name}-minio" size = 50 - location = "fsn1" + location = "hel1" format = "ext4" } resource "hcloud_volume_attachment" "postgres" { volume_id = hcloud_volume.postgres.id - server_id = hcloud_server.workers["fsn1"].id + server_id = hcloud_server.workers["hel1"].id automount = true } resource "hcloud_volume_attachment" "redis" { volume_id = hcloud_volume.redis.id - server_id = hcloud_server.workers["fsn1"].id + server_id = hcloud_server.workers["hel1"].id automount = true } resource "hcloud_volume_attachment" "minio" { volume_id = hcloud_volume.minio.id - server_id = hcloud_server.workers["fsn1"].id + server_id = hcloud_server.workers["hel1"].id automount = true } @@ -237,7 +237,7 @@ resource "local_file" "setup_env" { master_private = "10.1.0.10" k3s_token = local.k3s_token workers = { for k, v in hcloud_server.workers : k => v.ipv4_address } - ssh_key_path = var.ssh_public_key == "" ? "${path.module}/../../.ssh/id_ed25519" : "" + ssh_key_path = var.ssh_public_key == "" ? 
"./.ssh/id_ed25519" : "" }) filename = "${path.module}/../../setup-env.sh" file_permission = "0755" diff --git a/infrastructure/terraform/terraform.tfvars.example b/infrastructure/terraform/terraform.tfvars.example index ee4fe86..9ad9e44 100644 --- a/infrastructure/terraform/terraform.tfvars.example +++ b/infrastructure/terraform/terraform.tfvars.example @@ -12,9 +12,17 @@ hcloud_token = "your-hetzner-api-token" # Optional: customize cluster name # cluster_name = "vultisig" -# Optional: customize server types -# master_server_type = "cx31" # 2 vCPU, 8GB RAM -# worker_server_type = "cx41" # 4 vCPU, 16GB RAM +# Optional: customize server types (MUST be AMD64 for GHCR images) +# Default: ccx13/ccx23 (dedicated AMD64, always available) +# master_server_type = "ccx13" # 2 dedicated vCPU, 8GB RAM (~€13/mo) +# worker_server_type = "ccx23" # 4 dedicated vCPU, 16GB RAM (~€25/mo) +# +# Alternative (cheaper but often out of stock in EU): +# master_server_type = "cpx11" +# worker_server_type = "cpx31" +# +# DO NOT USE ARM servers (cax*) - GHCR images are AMD64 only -# Optional: customize regions (default: all 3) -# regions = ["fsn1", "nbg1", "hel1"] +# Optional: customize region (default: hel1) +# Check availability first: ./infrastructure/scripts/check-availability.sh +# regions = ["hel1"] diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/variables.tf index 5bb6310..286e231 100644 --- a/infrastructure/terraform/variables.tf +++ b/infrastructure/terraform/variables.tf @@ -26,17 +26,17 @@ variable "cluster_name" { variable "master_server_type" { description = "Server type for master node" type = string - default = "cax11" # 2 vCPU, 4GB RAM (ARM) + default = "ccx13" # 2 dedicated vCPU, 8GB RAM (AMD64) - GHCR images are AMD64 only } variable "worker_server_type" { description = "Server type for worker nodes" type = string - default = "cax21" # 4 vCPU, 8GB RAM (ARM) + default = "ccx23" # 4 dedicated vCPU, 16GB RAM (AMD64) - GHCR images are AMD64 only } 
variable "regions" { description = "Hetzner regions to deploy workers" type = list(string) - default = ["fsn1"] # Reduced for account with low IP quota + default = ["hel1"] # fsn1 out of cpx capacity, using hel1 } diff --git a/k8s/base/dca/configmaps.yaml b/k8s/base/dca/configmaps.yaml index 58d463d..8b09f09 100644 --- a/k8s/base/dca/configmaps.yaml +++ b/k8s/base/dca/configmaps.yaml @@ -57,13 +57,19 @@ data: blast: "https://rpc.blast.io" optimism: "https://optimism-rpc.publicnode.com" polygon: "https://polygon-bor-rpc.publicnode.com" + zksync: "https://mainnet.era.zksync.io" + cronos: "https://evm.cronos.org" solana: "https://solana-rpc.publicnode.com" - litecoin: "https://api.blockchair.com/litecoin" - dogecoin: "https://api.blockchair.com/dogecoin" - bitcoin: "https://api.blockchair.com/bitcoin" - bitcoincash: "https://api.blockchair.com/bitcoin-cash" + litecoin: "https://litecoin-rpc.publicnode.com" + dogecoin: "https://dogecoin-rpc.publicnode.com" + bitcoin: "https://bitcoin-rpc.publicnode.com" + bitcoincash: "https://bitcoincash-rpc.publicnode.com" thorchain: "https://thornode.ninerealms.com" mayachain: "https://mayanode.mayachain.info" + cosmos: "https://cosmos-rest.publicnode.com" + tron: "https://api.trongrid.io" + dash: "https://dash-rpc.publicnode.com" + zcash: "https://zcash-rpc.publicnode.com" --- apiVersion: v1 kind: ConfigMap diff --git a/k8s/base/dca/tx-indexer.yaml b/k8s/base/dca/tx-indexer.yaml index 688bd08..d84f113 100644 --- a/k8s/base/dca/tx-indexer.yaml +++ b/k8s/base/dca/tx-indexer.yaml @@ -25,6 +25,7 @@ spec: containers: - name: tx-indexer image: docker.io/library/app-recurring-txindexer:local-amd64 + imagePullPolicy: Never command: ["/app/tx_indexer"] ports: - containerPort: 8183 diff --git a/k8s/base/relay/kustomization.yaml b/k8s/base/relay/kustomization.yaml deleted file mode 100644 index 74d1cbb..0000000 --- a/k8s/base/relay/kustomization.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: 
Kustomization - -namespace: relay - -resources: - - relay.yaml diff --git a/k8s/base/relay/relay.yaml b/k8s/base/relay/relay.yaml deleted file mode 100644 index 3053599..0000000 --- a/k8s/base/relay/relay.yaml +++ /dev/null @@ -1,87 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: relay - namespace: relay - labels: - app: relay -spec: - replicas: 1 - selector: - matchLabels: - app: relay - template: - metadata: - labels: - app: relay - spec: - initContainers: - - name: config-generator - image: busybox:1.36 - command: ['sh', '-c'] - args: - - | - cat > /config/config.json << EOF - { - "port": 8080, - "redis_server": { - "addr": "redis.infra.svc.cluster.local:6379", - "password": "${REDIS_PASSWORD}", - "db": 1 - } - } - EOF - env: - - name: REDIS_PASSWORD - valueFrom: - secretKeyRef: - name: redis - key: password - volumeMounts: - - name: config - mountPath: /config - containers: - - name: relay - image: ghcr.io/vultisig/vultisig-relay:latest - ports: - - containerPort: 8080 - name: http - volumeMounts: - - name: config - mountPath: /app/config - resources: - requests: - memory: "64Mi" - cpu: "50m" - limits: - memory: "256Mi" - cpu: "200m" - readinessProbe: - httpGet: - path: /ping - port: 8080 - initialDelaySeconds: 5 - periodSeconds: 5 - livenessProbe: - httpGet: - path: /ping - port: 8080 - initialDelaySeconds: 10 - periodSeconds: 10 - volumes: - - name: config - emptyDir: {} ---- -apiVersion: v1 -kind: Service -metadata: - name: relay - namespace: relay -spec: - type: ClusterIP - ports: - - port: 8080 - targetPort: 8080 - name: http - selector: - app: relay diff --git a/k8s/base/vcli/vcli.yaml b/k8s/base/vcli/vcli.yaml index b7c0b18..1115b8e 100644 --- a/k8s/base/vcli/vcli.yaml +++ b/k8s/base/vcli/vcli.yaml @@ -14,14 +14,22 @@ spec: env: # Service URLs (in-cluster) - name: VCLI_VERIFIER_URL - value: "http://verifier.verifier.svc.cluster.local:8080" + valueFrom: + configMapKeyRef: + name: vcli-config + key: verifier-url - name: 
VCLI_DCA_PLUGIN_URL - value: "http://dca-server.plugin-dca.svc.cluster.local:8080" + value: "http://server-swap.plugin-dca.svc.cluster.local:8082" - name: VCLI_RELAY_URL valueFrom: configMapKeyRef: name: relay-config key: url + - name: VCLI_FAST_VAULT_URL + valueFrom: + configMapKeyRef: + name: vcli-config + key: fast-vault-url # Database (read-only access for queries) - name: VCLI_DATABASE_DSN valueFrom: diff --git a/k8s/base/verifier/configmaps.yaml b/k8s/base/verifier/configmaps.yaml index e167c7a..cc06496 100644 --- a/k8s/base/verifier/configmaps.yaml +++ b/k8s/base/verifier/configmaps.yaml @@ -54,8 +54,29 @@ data: blast: "https://rpc.blast.io" optimism: "https://optimism-rpc.publicnode.com" polygon: "https://polygon-bor-rpc.publicnode.com" + zksync: "https://mainnet.era.zksync.io" + cronos: "https://evm.cronos.org" solana: "https://solana-rpc.publicnode.com" litecoin: "https://api.blockchair.com/litecoin" dogecoin: "https://api.blockchair.com/dogecoin" bitcoin: "https://api.blockchair.com/bitcoin" bitcoincash: "https://api.blockchair.com/bitcoin-cash" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vcli-config + namespace: verifier +data: + verifier-url: "http://verifier.verifier.svc.cluster.local:8080" + fast-vault-url: "https://api.vultisig.com" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: plugin-endpoints + namespace: verifier +data: + dca-swap-url: "http://server-swap.plugin-dca.svc.cluster.local:8082" + dca-send-url: "http://server-send.plugin-dca.svc.cluster.local:8083" + fee-url: "http://server.plugin-fee.svc.cluster.local:8085" diff --git a/k8s/base/vultiserver/kustomization.yaml b/k8s/base/vultiserver/kustomization.yaml deleted file mode 100644 index 2947310..0000000 --- a/k8s/base/vultiserver/kustomization.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: vultiserver - -resources: - - vultiserver.yaml diff --git a/k8s/base/vultiserver/vultiserver.yaml 
b/k8s/base/vultiserver/vultiserver.yaml deleted file mode 100644 index 7e2b4b1..0000000 --- a/k8s/base/vultiserver/vultiserver.yaml +++ /dev/null @@ -1,165 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: vultiserver-config - namespace: vultiserver -data: - config.yaml: | - server: - port: "8080" - host: "0.0.0.0" - vaultsFilePath: "vaults" - redis: - host: "redis.infra.svc.cluster.local" - port: "6379" - relay: - server: "http://relay.relay.svc.cluster.local:8080" - block_storage: - host: "http://minio.infra.svc.cluster.local:9000" - region: "us-east-1" - bucket: "vultiserver" ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vultiserver-api - namespace: vultiserver - labels: - app: vultiserver - component: api -spec: - replicas: 1 - selector: - matchLabels: - app: vultiserver - component: api - template: - metadata: - labels: - app: vultiserver - component: api - spec: - containers: - - name: vultiserver - image: ghcr.io/vultisig/vultiserver:latest - ports: - - containerPort: 8080 - name: http - env: - - name: REDIS_PASSWORD - valueFrom: - secretKeyRef: - name: redis - key: password - - name: BLOCK_STORAGE_ACCESS_KEY - valueFrom: - secretKeyRef: - name: minio - key: access-key - - name: BLOCK_STORAGE_SECRET - valueFrom: - secretKeyRef: - name: minio - key: secret-key - volumeMounts: - - name: config - mountPath: /root/config.yaml - subPath: config.yaml - - name: vaults - mountPath: /root/vaults - resources: - requests: - memory: "128Mi" - cpu: "100m" - limits: - memory: "512Mi" - cpu: "500m" - readinessProbe: - httpGet: - path: /ping - port: 8080 - initialDelaySeconds: 5 - periodSeconds: 5 - livenessProbe: - httpGet: - path: /ping - port: 8080 - initialDelaySeconds: 10 - periodSeconds: 10 - volumes: - - name: config - configMap: - name: vultiserver-config - - name: vaults - emptyDir: {} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vultiserver-worker - namespace: vultiserver - labels: - app: vultiserver - component: worker 
-spec: - replicas: 1 - selector: - matchLabels: - app: vultiserver - component: worker - template: - metadata: - labels: - app: vultiserver - component: worker - spec: - containers: - - name: worker - image: ghcr.io/vultisig/vultiserver:latest - command: ["./main", "worker"] - env: - - name: REDIS_PASSWORD - valueFrom: - secretKeyRef: - name: redis - key: password - - name: BLOCK_STORAGE_ACCESS_KEY - valueFrom: - secretKeyRef: - name: minio - key: access-key - - name: BLOCK_STORAGE_SECRET - valueFrom: - secretKeyRef: - name: minio - key: secret-key - volumeMounts: - - name: config - mountPath: /root/config.yaml - subPath: config.yaml - resources: - requests: - memory: "256Mi" - cpu: "200m" - limits: - memory: "1Gi" - cpu: "1000m" - volumes: - - name: config - configMap: - name: vultiserver-config ---- -apiVersion: v1 -kind: Service -metadata: - name: vultiserver - namespace: vultiserver -spec: - type: ClusterIP - ports: - - port: 8080 - targetPort: 8080 - name: http - selector: - app: vultiserver - component: api diff --git a/k8s/overlays/local/ingress-local.yaml b/k8s/overlays/local/ingress-local.yaml deleted file mode 100644 index ca57aaf..0000000 --- a/k8s/overlays/local/ingress-local.yaml +++ /dev/null @@ -1,39 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: relay-ingress - namespace: relay - annotations: - kubernetes.io/ingress.class: traefik -spec: - rules: - - host: relay.vultisig.local - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: relay - port: - number: 8080 ---- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: vultiserver-ingress - namespace: vultiserver - annotations: - kubernetes.io/ingress.class: traefik -spec: - rules: - - host: vultiserver.vultisig.local - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: vultiserver - port: - number: 8080 diff --git a/k8s/overlays/local/kustomization.yaml b/k8s/overlays/local/kustomization.yaml deleted file mode 
100644 index b5443c6..0000000 --- a/k8s/overlays/local/kustomization.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -# Local overlay: Deploy all services locally including Relay and VultiServer -# Usage: kubectl apply -k k8s/overlays/local -# -# Prerequisites: -# 1. Build and load images into local cluster (kind/minikube): -# docker build -t app-recurring-server:local-amd64 . -# kind load docker-image app-recurring-server:local-amd64 -# 2. Base uses imagePullPolicy: Never (requires pre-loaded images) - -resources: - - ../../base - - ../../base/relay/ - - ../../base/vultiserver/ - - ../../base/vcli/ - - ingress-local.yaml - -# Local uses cluster DNS endpoints from base: -# - relay.relay.svc.cluster.local -# - vultiserver.vultiserver.svc.cluster.local - -# Images use local tags (local-amd64) with Never pull policy -# For multi-arch local dev, override with: -# images: -# - name: docker.io/library/app-recurring-server -# newTag: local-arm64 diff --git a/local/cmd/vcli/cmd/devtoken.go b/local/cmd/vcli/cmd/devtoken.go index 510e263..d59a25e 100644 --- a/local/cmd/vcli/cmd/devtoken.go +++ b/local/cmd/vcli/cmd/devtoken.go @@ -27,7 +27,7 @@ func runDevToken() error { } vault := vaults[0] - jwtSecret := []byte("dev-jwt-secret-change-in-production") + jwtSecret := []byte("devsecret") tokenID := uuid.New().String() expirationTime := time.Now().Add(7 * 24 * time.Hour) diff --git a/local/seed-plugins.sql b/local/seed-plugins.sql index c38bccc..4cdc735 100644 --- a/local/seed-plugins.sql +++ b/local/seed-plugins.sql @@ -60,3 +60,19 @@ VALUES ('vultisig-dca-0000', 'local-dev-dca-apikey', 1), ('vultisig-recurring-sends-0000', 'local-dev-send-apikey', 1) ON CONFLICT (apikey) DO NOTHING; + +-- Seed plugin pricing (required for policy creation) +-- Each plugin needs pricing entries that match the billing types used in policies +-- Types: 'once' (one-time fee), 'per-tx' (per transaction), 'recurring' (subscription) +-- For 
'once' and 'per-tx', frequency must be NULL +-- For 'recurring', frequency must be: daily, weekly, biweekly, or monthly +-- Note: Delete existing rows first to prevent duplicates (pricings table has no unique constraint on type+plugin_id) +DELETE FROM pricings WHERE plugin_id IN ('vultisig-dca-0000', 'vultisig-recurring-sends-0000', 'vultisig-fees-feee'); +INSERT INTO pricings (type, frequency, amount, asset, metric, plugin_id, created_at, updated_at) +VALUES + ('once', NULL, 0, 'usdc', 'fixed', 'vultisig-dca-0000', NOW(), NOW()), + ('per-tx', NULL, 0, 'usdc', 'fixed', 'vultisig-dca-0000', NOW(), NOW()), + ('once', NULL, 0, 'usdc', 'fixed', 'vultisig-recurring-sends-0000', NOW(), NOW()), + ('per-tx', NULL, 0, 'usdc', 'fixed', 'vultisig-recurring-sends-0000', NOW(), NOW()), + ('once', NULL, 0, 'usdc', 'fixed', 'vultisig-fees-feee', NOW(), NOW()), + ('per-tx', NULL, 0, 'usdc', 'fixed', 'vultisig-fees-feee', NOW(), NOW());