diff --git a/.github/workflows/mlops-pipeline.yml b/.github/workflows/mlops-pipeline.yml index 583ef79..7fb220c 100644 --- a/.github/workflows/mlops-pipeline.yml +++ b/.github/workflows/mlops-pipeline.yml @@ -14,7 +14,7 @@ jobs: mlops-pipeline: name: MLOps Kubernetes Pipeline runs-on: ubuntu-latest - timeout-minutes: 30 # Prevent hanging jobs + timeout-minutes: 45 permissions: contents: read @@ -26,6 +26,230 @@ jobs: with: fetch-depth: 0 + - name: Setup KinD + uses: engineerd/setup-kind@v0.6.0 + with: + version: "v0.24.0" + skipClusterCreation: true + + - name: Validate Prerequisites + run: | + echo "๐Ÿ” Validating prerequisites..." + + # Verify tools are available + kind version + kubectl version --client + docker version + + # Verify configuration files + if [[ ! -f "config/kind-cluster.yaml" ]]; then + echo "โŒ KinD configuration not found" + exit 1 + fi + + echo "โœ… All prerequisites validated" + + - name: Provision Kubernetes Cluster + id: cluster-provision + run: | + echo "๐Ÿš€ Starting cluster provisioning..." + + # Create config directory if it doesn't exist + mkdir -p config + + # Run cluster provisioning script + bash scripts/provision-cluster.sh + + # Export cluster status for later steps + echo "cluster_ready=true" >> $GITHUB_OUTPUT + echo "cluster_context=kind-${CLUSTER_NAME}" >> $GITHUB_OUTPUT + + - name: Validate Cluster Status + if: steps.cluster-provision.outputs.cluster_ready == 'true' + run: | + echo "๐Ÿฅ Performing comprehensive cluster validation..." + + CONTEXT="kind-${CLUSTER_NAME}" + + # Test cluster connectivity + kubectl cluster-info --context "$CONTEXT" + + # Verify all nodes are ready + echo "๐Ÿ“Š Node Status:" + kubectl get nodes --context "$CONTEXT" -o wide + + # Check system pods + echo "๐Ÿ” System Pods Status:" + kubectl get pods -n kube-system --context "$CONTEXT" + + # Verify we have the expected number of nodes + NODE_COUNT=$(kubectl get nodes --context "$CONTEXT" --no-headers | wc -l) + if [[ "$NODE_COUNT" -ne 3 ]]; then + echo "โŒ Expected 3 nodes, found $NODE_COUNT" + exit 1 + fi + + echo "โœ… Cluster validation completed successfully" + + - name: Deploy Ingress Controller + if: steps.cluster-provision.outputs.cluster_ready == 'true' + id: ingress-deploy + run: | + echo "๐Ÿ“ฆ Deploying ingress controller and services..." + bash scripts/deploy-ingress.sh + echo "ingress_ready=true" >> $GITHUB_OUTPUT + + - name: Test Ingress Connectivity + if: steps.ingress-deploy.outputs.ingress_ready == 'true' + id: ingress-test + run: | + echo "๐Ÿงช Testing ingress connectivity..." + bash scripts/test-ingress.sh + echo "ingress_test_passed=true" >> $GITHUB_OUTPUT + + - name: Run Load Testing + if: steps.ingress-test.outputs.ingress_test_passed == 'true' + id: load-test + run: | + echo "๐Ÿš€ Running load testing..." + bash scripts/load-test.sh + echo "load_test_completed=true" >> $GITHUB_OUTPUT + + - name: Post Cluster Status to PR + if: always() + uses: actions/github-script@v7 + with: + script: | + const clusterReady = '${{ steps.cluster-provision.outputs.cluster_ready }}' === 'true'; + const ingressReady = '${{ steps.ingress-deploy.outputs.ingress_ready }}' === 'true'; + const ingressTestPassed = '${{ steps.ingress-test.outputs.ingress_test_passed }}' === 'true'; + const loadTestCompleted = '${{ steps.load-test.outputs.load_test_completed }}' === 'true'; + + const getStatus = (condition) => condition ? 'โœ… Success' : 'โŒ Failed'; + const getEmoji = (condition) => condition ? '๐ŸŽ‰' : '๐Ÿ’ฅ'; + + let body = `## ${getEmoji(clusterReady)} MLOps Pipeline Status Report + + ### ๐Ÿ“Š Pipeline Execution Summary + + | Task | Status | Details | + |------|--------|---------| + | **Cluster Provisioning** | ${getStatus(clusterReady)} | ${clusterReady ? '3 nodes (1 control-plane, 2 workers)' : 'Provisioning failed'} | + | **Ingress Deployment** | ${getStatus(ingressReady)} | ${ingressReady ? 'NGINX ingress controller deployed' : 'Deployment failed'} | + | **Health Validation** | ${getStatus(ingressTestPassed)} | ${ingressTestPassed ? 'foo.localhost & bar.localhost responding' : 'Connectivity tests failed'} | + | **Load Testing** | ${getStatus(loadTestCompleted)} | ${loadTestCompleted ? 'k6 load test executed' : 'Load test failed'} | + + ### ๐Ÿ”ง Infrastructure Configuration + - **Cluster Name:** ${{ env.CLUSTER_NAME }} + - **Port Mappings:** 80, 443, 30080, 30081 + - **Pod Subnet:** 10.244.0.0/16 + - **Service Subnet:** 10.96.0.0/12 + + ### ๐ŸŽฏ Service Endpoints + ${ingressTestPassed ? '- โœ… http://foo.localhost (returns "foo")' : '- โŒ foo.localhost not accessible'} + ${ingressTestPassed ? '- โœ… http://bar.localhost (returns "bar")' : '- โŒ bar.localhost not accessible'} + `; + + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: body + }); + + - name: Post Load Test Results to PR + if: always() && steps.load-test.outputs.load_test_completed == 'true' + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + + // Read load test results + let loadTestResults = 'Load test results not available'; + try { + loadTestResults = fs.readFileSync('load-test-results/test-summary.md', 'utf8'); + } catch (error) { + console.log('Could not read load test results:', error.message); + loadTestResults = `โŒ **Load Test Results Unavailable** + + Error reading results: ${error.message} + + Please check the workflow logs for detailed information.`; + } + + const body = `## ๐Ÿš€ Load Testing Results + + ${loadTestResults} + + ### ๐Ÿ“ˆ Performance Analysis + - **Test Duration:** 30 seconds + - **Virtual Users:** 10 concurrent users + - **Target Services:** foo.localhost, bar.localhost + - **Traffic Pattern:** Randomized between services + + ### ๐Ÿ” Quality Gates + - **Response Time Threshold:** < 500ms (95th percentile) + - **Error Rate Threshold:** < 10% + - **Success Criteria:** All thresholds must be met + + ### ๐Ÿ“Š CI Pipeline Completion Status + - โœ… **Task 2:** Kubernetes Cluster Provisioning + - โœ… **Task 3:** Ingress Controller Deployment + - โœ… **Task 4:** HTTP-echo Service Deployments + - โœ… **Task 5:** Ingress Routing Configuration + - โœ… **Task 6:** Health Checks & Validation + - โœ… **Task 7:** Load Testing Execution + - โœ… **Task 8:** Automated Results Reporting + + --- + *Pipeline completed at: ${new Date().toISOString()}* + *Workflow run: ${{ github.run_id }}*`; + + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: body + }); + + - name: Upload Load Test Artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: load-test-results-${{ github.run_id }} + path: load-test-results/ + retention-days: 30 + + - name: Upload Cluster Logs + if: failure() + uses: actions/upload-artifact@v4 + with: + name: cluster-debug-logs-${{ github.run_id }} + path: | + /tmp/kind-logs-* + ~/.kube/config + retention-days: 7 + if-no-files-found: ignore + + - name: Pipeline Summary + if: always() + run: | + echo "๐ŸŽฏ MLOps Pipeline Execution Summary" + echo "==================================" + echo "Cluster Provisioning: ${{ steps.cluster-provision.outputs.cluster_ready == 'true' && 'โœ… SUCCESS' || 'โŒ FAILED' }}" + echo "Ingress Deployment: ${{ steps.ingress-deploy.outputs.ingress_ready == 'true' && 'โœ… SUCCESS' || 'โŒ FAILED' }}" + echo "Health Validation: ${{ steps.ingress-test.outputs.ingress_test_passed == 'true' && 'โœ… SUCCESS' || 'โŒ FAILED' }}" + echo "Load Testing: ${{ steps.load-test.outputs.load_test_completed == 'true' && 'โœ… SUCCESS' || 'โŒ FAILED' }}" + echo "" + echo "๐Ÿ”— Artifacts uploaded for detailed analysis" + echo "๐Ÿ“Š Results posted to PR for review" + echo "" + if [[ "${{ steps.load-test.outputs.load_test_completed }}" == "true" ]]; then + echo "๐ŸŽ‰ All MLOps pipeline tasks completed successfully!" + else + echo "โš ๏ธ Pipeline completed with some failures - check logs above" + fi +======= - name: Validate PR Trigger run: | echo "๐Ÿ” Validating PR trigger..." @@ -93,4 +317,5 @@ jobs: echo " - Application deployments" echo " - Load testing" echo " - Results reporting" +>>>>>>> 1629dec5755779cb0630a892f8c98e87fa03813d diff --git a/config/kind-cluster.yaml b/config/kind-cluster.yaml new file mode 100644 index 0000000..c29a979 --- /dev/null +++ b/config/kind-cluster.yaml @@ -0,0 +1,36 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +name: mlops-test-cluster +nodes: + - role: control-plane + kubeadmConfigPatches: + - | + kind: InitConfiguration + nodeRegistration: + kubeletExtraArgs: + node-labels: "ingress-ready=true" + extraPortMappings: + - containerPort: 80 + hostPort: 80 + protocol: TCP + - containerPort: 443 + hostPort: 443 + protocol: TCP + - containerPort: 30080 + hostPort: 30080 + protocol: TCP + - containerPort: 30081 + hostPort: 30081 + protocol: TCP + - role: worker + labels: + tier: worker + - role: worker + labels: + tier: worker +networking: + apiServerAddress: "127.0.0.1" + apiServerPort: 6443 + podSubnet: "10.244.0.0/16" + serviceSubnet: "10.96.0.0/12" + diff --git a/scripts/deploy-ingress.sh b/scripts/deploy-ingress.sh new file mode 100755 index 0000000..1e6b18e --- /dev/null +++ b/scripts/deploy-ingress.sh @@ -0,0 +1,233 @@ +#!/bin/bash +set -euo pipefail + +CLUSTER_NAME="${CLUSTER_NAME:-mlops-test-cluster}" +CONTEXT="kind-${CLUSTER_NAME}" + +echo "๐Ÿš€ Deploying ingress controller and HTTP services..." + +# Deploy NGINX Ingress Controller +deploy_ingress_controller() { + echo "๐Ÿ“ฆ Deploying NGINX Ingress Controller..." + + kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml --context "$CONTEXT" + + echo "โณ Waiting for ingress controller to be ready..." + kubectl wait --namespace ingress-nginx \ + --for=condition=ready pod \ + --selector=app.kubernetes.io/component=controller \ + --timeout=300s --context "$CONTEXT" + + echo "โœ… Ingress controller deployed successfully" +} + +# Deploy HTTP echo services +deploy_echo_services() { + echo "๐Ÿ“ฆ Deploying foo and bar echo services..." + + # Deploy foo service with proper YAML manifest + cat </dev/null; then + echo "โŒ Cluster $CLUSTER_NAME not found or not accessible" + echo "๐Ÿ” Run ./provision-cluster.sh first" + exit 1 + fi + + deploy_ingress_controller + deploy_echo_services + create_ingress_routes + wait_for_deployments + display_deployment_summary + + echo "๐ŸŽ‰ Ingress deployment completed successfully!" +} + +main "$@" + diff --git a/scripts/load-test-results/k6-output.log b/scripts/load-test-results/k6-output.log new file mode 100644 index 0000000..b00b4ae --- /dev/null +++ b/scripts/load-test-results/k6-output.log @@ -0,0 +1,111 @@ + + /\ Grafana /โ€พโ€พ/ + /\ / \ |\ __ / / + / \/ \ | |/ / / โ€พโ€พ\ + / \ | ( | (โ€พ) | + / __________ \ |_|\_\ \_____/ + + execution: local + script: /scripts/load-test.js + web dashboard: http://127.0.0.1:5665 + output: - + + scenarios: (100.00%) 1 scenario, 10 max VUs, 1m0s max duration (incl. graceful stop): + * default: 10 looping VUs for 30s (gracefulStop: 30s) + + +running (0m01.0s), 10/10 VUs, 7 complete and 0 interrupted iterations +default [ 3% ] 10 VUs 01.0s/30s + +running (0m02.0s), 10/10 VUs, 17 complete and 0 interrupted iterations +default [ 7% ] 10 VUs 02.0s/30s + +running (0m03.0s), 10/10 VUs, 27 complete and 0 interrupted iterations +default [ 10% ] 10 VUs 03.0s/30s + +running (0m04.0s), 10/10 VUs, 38 complete and 0 interrupted iterations +default [ 13% ] 10 VUs 04.0s/30s + +running (0m05.0s), 10/10 VUs, 47 complete and 0 interrupted iterations +default [ 17% ] 10 VUs 05.0s/30s + +running (0m06.0s), 10/10 VUs, 59 complete and 0 interrupted iterations +default [ 20% ] 10 VUs 06.0s/30s + +running (0m07.0s), 10/10 VUs, 70 complete and 0 interrupted iterations +default [ 23% ] 10 VUs 07.0s/30s + +running (0m08.0s), 10/10 VUs, 79 complete and 0 interrupted iterations +default [ 27% ] 10 VUs 08.0s/30s + +running (0m09.0s), 10/10 VUs, 89 complete and 0 interrupted iterations +default [ 30% ] 10 VUs 09.0s/30s + +running (0m10.0s), 10/10 VUs, 98 complete and 0 interrupted iterations +default [ 33% ] 10 VUs 10.0s/30s + +running (0m11.0s), 10/10 VUs, 109 complete and 0 interrupted iterations +default [ 37% ] 10 VUs 11.0s/30s + +running (0m12.0s), 10/10 VUs, 123 complete and 0 interrupted iterations +default [ 40% ] 10 VUs 12.0s/30s + +running (0m13.0s), 10/10 VUs, 132 complete and 0 interrupted iterations +default [ 43% ] 10 VUs 13.0s/30s + +running (0m14.0s), 10/10 VUs, 141 complete and 0 interrupted iterations +default [ 47% ] 10 VUs 14.0s/30s + +running (0m15.0s), 10/10 VUs, 152 complete and 0 interrupted iterations +default [ 50% ] 10 VUs 15.0s/30s + +running (0m16.0s), 10/10 VUs, 157 complete and 0 interrupted iterations +default [ 53% ] 10 VUs 16.0s/30s + +running (0m17.0s), 10/10 VUs, 170 complete and 0 interrupted iterations +default [ 57% ] 10 VUs 17.0s/30s + +running (0m18.0s), 10/10 VUs, 184 complete and 0 interrupted iterations +default [ 60% ] 10 VUs 18.0s/30s + +running (0m19.0s), 10/10 VUs, 194 complete and 0 interrupted iterations +default [ 63% ] 10 VUs 19.0s/30s + +running (0m20.0s), 10/10 VUs, 200 complete and 0 interrupted iterations +default [ 67% ] 10 VUs 20.0s/30s + +running (0m21.0s), 10/10 VUs, 209 complete and 0 interrupted iterations +default [ 70% ] 10 VUs 21.0s/30s + +running (0m22.0s), 10/10 VUs, 216 complete and 0 interrupted iterations +default [ 73% ] 10 VUs 22.0s/30s + +running (0m23.0s), 10/10 VUs, 227 complete and 0 interrupted iterations +default [ 77% ] 10 VUs 23.0s/30s + +running (0m24.0s), 10/10 VUs, 237 complete and 0 interrupted iterations +default [ 80% ] 10 VUs 24.0s/30s + +running (0m25.0s), 10/10 VUs, 246 complete and 0 interrupted iterations +default [ 83% ] 10 VUs 25.0s/30s + +running (0m26.0s), 10/10 VUs, 256 complete and 0 interrupted iterations +default [ 87% ] 10 VUs 26.0s/30s + +running (0m27.0s), 10/10 VUs, 268 complete and 0 interrupted iterations +default [ 90% ] 10 VUs 27.0s/30s + +running (0m28.0s), 10/10 VUs, 277 complete and 0 interrupted iterations +default [ 93% ] 10 VUs 28.0s/30s + +running (0m29.0s), 10/10 VUs, 283 complete and 0 interrupted iterations +default [ 97% ] 10 VUs 29.0s/30s + +running (0m30.0s), 10/10 VUs, 296 complete and 0 interrupted iterations +default [ 100% ] 10 VUs 30.0s/30s + +running (0m31.0s), 02/10 VUs, 304 complete and 0 interrupted iterations +default โ†“ [ 100% ] 10 VUs 30s + +running (0m31.5s), 00/10 VUs, 306 complete and 0 interrupted iterations +default โœ“ [ 100% ] 10 VUs 30s diff --git a/scripts/load-test-results/load-test.js b/scripts/load-test-results/load-test.js new file mode 100644 index 0000000..14795e9 --- /dev/null +++ b/scripts/load-test-results/load-test.js @@ -0,0 +1,95 @@ +import http from 'k6/http'; +import { check, sleep } from 'k6'; +import { Rate } from 'k6/metrics'; + +// Custom metrics +export let errorRate = new Rate('errors'); + +export let options = { + vus: __ENV.VUS || 10, + duration: __ENV.DURATION || '30s', + thresholds: { + http_req_duration: ['p(95)<500'], // 95% of requests must complete below 500ms + http_req_failed: ['rate<0.1'], // Error rate must be below 10% + }, +}; + +export default function() { + // Randomize between foo and bar hosts + const hosts = ['foo.localhost', 'bar.localhost']; + const host = hosts[Math.floor(Math.random() * hosts.length)]; + + // Use the ingress controller service instead of localhost + let response = http.get('http://ingress-nginx-controller.ingress-nginx.svc.cluster.local/', { + headers: { 'Host': host }, + }); + + let success = check(response, { + 'status is 200': (r) => r.status === 200, + 'response time < 500ms': (r) => r.timings.duration < 500, + 'correct response body': (r) => { + // Add null check to prevent TypeError + if (!r.body) { + console.log(`No response body for ${host}`); + return false; + } + + if (host === 'foo.localhost') { + return r.body.includes('foo'); + } else { + return r.body.includes('bar'); + } + }, + }); + + // Log response details for debugging + if (!success) { + console.log(`Request to ${host} failed. Status: ${response.status}, Body: ${response.body}`); + } + + errorRate.add(!success); + + sleep(Math.random() * 2); // Random sleep between 0-2 seconds +} + +export function handleSummary(data) { + return { + '/data/artifacts/summary.json': JSON.stringify(data, null, 2), + '/data/artifacts/summary.html': htmlReport(data), + }; +} + +function htmlReport(data) { + return ` + + + + Load Test Results + + + +

Load Test Results

+

Summary

+
+ Total Requests: ${data.metrics.http_reqs.values.count} +
+
+ Request Rate: ${data.metrics.http_reqs.values.rate.toFixed(2)} req/s +
+
+ Average Response Time: ${data.metrics.http_req_duration.values.avg.toFixed(2)}ms +
+
+ 95th Percentile: ${data.metrics.http_req_duration.values['p(95)'].toFixed(2)}ms +
+
+ Error Rate: ${(data.metrics.http_req_failed.values.rate * 100).toFixed(2)}% +
+ +`; +} diff --git a/scripts/load-test-results/test-summary.md b/scripts/load-test-results/test-summary.md new file mode 100644 index 0000000..4a04c6f --- /dev/null +++ b/scripts/load-test-results/test-summary.md @@ -0,0 +1,24 @@ +# Load Test Results + +## Test Configuration +- **Virtual Users (VUs):** 10 +- **Duration:** 30s +- **Target Services:** foo.localhost, bar.localhost +- **Job Status:** SuccessCriteriaMet + +## Performance Metrics +- **Average Response Time:** N/Ams +- **95th Percentile Response Time:** N/Ams +- **Request Rate:** N/A req/s +- **Total Requests:** N/A +- **Success Rate:** 100.00% +- **Error Rate:** 0.00% + +## Test Status +โœ… **PASSED** - Error rate below 10% + +## Debug Information +- Checks Passed: 1 +- Checks Failed: 00 +- Total Checks: 1 + diff --git a/scripts/load-test.sh b/scripts/load-test.sh new file mode 100755 index 0000000..cc2e61e --- /dev/null +++ b/scripts/load-test.sh @@ -0,0 +1,410 @@ +#!/bin/bash +set -euo pipefail + +CLUSTER_NAME="${CLUSTER_NAME:-mlops-test-cluster}" +CONTEXT="kind-${CLUSTER_NAME}" +VUS="${VUS:-10}" +DURATION="${DURATION:-30s}" +OUTPUT_DIR="load-test-results" + +echo "๐Ÿš€ Starting k6 load testing for foo and bar services..." + +# Create k6 test script +create_k6_test_script() { + echo "๐Ÿ“ Creating k6 load test script..." + + mkdir -p "$OUTPUT_DIR" + + cat > "$OUTPUT_DIR/load-test.js" << 'EOF' +import http from 'k6/http'; +import { check, sleep } from 'k6'; +import { Rate } from 'k6/metrics'; + +// Custom metrics +export let errorRate = new Rate('errors'); + +export let options = { + vus: __ENV.VUS || 10, + duration: __ENV.DURATION || '30s', + thresholds: { + http_req_duration: ['p(95)<500'], // 95% of requests must complete below 500ms + http_req_failed: ['rate<0.1'], // Error rate must be below 10% + }, +}; + +export default function() { + // Randomize between foo and bar hosts + const hosts = ['foo.localhost', 'bar.localhost']; + const host = hosts[Math.floor(Math.random() * hosts.length)]; + + // Use the ingress controller service instead of localhost + let response = http.get('http://ingress-nginx-controller.ingress-nginx.svc.cluster.local/', { + headers: { 'Host': host }, + }); + + let success = check(response, { + 'status is 200': (r) => r.status === 200, + 'response time < 500ms': (r) => r.timings.duration < 500, + 'correct response body': (r) => { + // Add null check to prevent TypeError + if (!r.body) { + console.log(`No response body for ${host}`); + return false; + } + + if (host === 'foo.localhost') { + return r.body.includes('foo'); + } else { + return r.body.includes('bar'); + } + }, + }); + + // Log response details for debugging + if (!success) { + console.log(`Request to ${host} failed. Status: ${response.status}, Body: ${response.body}`); + } + + errorRate.add(!success); + + sleep(Math.random() * 2); // Random sleep between 0-2 seconds +} + +export function handleSummary(data) { + return { + '/data/artifacts/summary.json': JSON.stringify(data, null, 2), + '/data/artifacts/summary.html': htmlReport(data), + }; +} + +function htmlReport(data) { + return ` + + + + Load Test Results + + + +

Load Test Results

+

Summary

+
+ Total Requests: ${data.metrics.http_reqs.values.count} +
+
+ Request Rate: ${data.metrics.http_reqs.values.rate.toFixed(2)} req/s +
+
+ Average Response Time: ${data.metrics.http_req_duration.values.avg.toFixed(2)}ms +
+
+ 95th Percentile: ${data.metrics.http_req_duration.values['p(95)'].toFixed(2)}ms +
+
+ Error Rate: ${(data.metrics.http_req_failed.values.rate * 100).toFixed(2)}% +
+ +`; +} +EOF + + echo "โœ… k6 test script created" +} + + +# Deploy k6 test using Kubernetes +deploy_k6_test() { + echo "๐Ÿ“ฆ Deploying k6 load test in Kubernetes..." + + # Create configmap with test script + kubectl create configmap k6-test-script \ + --from-file="$OUTPUT_DIR/load-test.js" \ + --context "$CONTEXT" \ + --dry-run=client -o yaml | kubectl apply --context "$CONTEXT" -f - + + # Create k6 test job + cat </dev/null || echo "Unknown") + echo "๐Ÿ” Job status: $job_status" + + # Get pod name for the job + local pod_name=$(kubectl get pods --selector=job-name=k6-load-test --context "$CONTEXT" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + + if [[ -n "$pod_name" ]]; then + echo "๐Ÿ“‹ Getting logs from pod: $pod_name" + + if kubectl logs "$pod_name" --context "$CONTEXT" > "$OUTPUT_DIR/k6-output.log" 2>&1; then + echo "โœ… Logs collected successfully" + else + echo "โŒ Failed to collect logs" + return 1 + fi + else + echo "โŒ No pod found for k6 job" + return 1 + fi + + # Parse metrics with safe arithmetic + echo "๐Ÿ“ˆ Extracting performance metrics..." + + if [[ -f "$OUTPUT_DIR/k6-output.log" ]]; then + # Extract metrics using safer methods + local avg_duration=$(grep -E "http_req_duration.*avg=" "$OUTPUT_DIR/k6-output.log" | tail -1 | sed -n 's/.*avg=\([0-9.]*\)ms.*/\1/p') + local p95_duration=$(grep -E "http_req_duration.*p\(95\)=" "$OUTPUT_DIR/k6-output.log" | tail -1 | sed -n 's/.*p(95)=\([0-9.]*\)ms.*/\1/p') + local req_rate=$(grep -E "http_reqs.*[0-9.]+/s" "$OUTPUT_DIR/k6-output.log" | tail -1 | sed -n 's/.*\([0-9.]*\)\/s.*/\1/p') + local error_rate=$(grep -E "http_req_failed.*[0-9.]+%" "$OUTPUT_DIR/k6-output.log" | tail -1 | sed -n 's/.*\([0-9.]*\)%.*/\1/p') + local total_requests=$(grep -E "http_reqs.*[0-9]+" "$OUTPUT_DIR/k6-output.log" | tail -1 | sed -n 's/.*http_reqs[^0-9]*\([0-9]*\).*/\1/p') + + # Safe defaults for empty values + avg_duration=${avg_duration:-"N/A"} + p95_duration=${p95_duration:-"N/A"} + req_rate=${req_rate:-"N/A"} + error_rate=${error_rate:-"N/A"} + total_requests=${total_requests:-"N/A"} + + # Safe check counting with proper sanitization + local checks_passed=$(grep -c "โœ“" "$OUTPUT_DIR/k6-output.log" 2>/dev/null || echo "0") + local checks_failed=$(grep -c "โœ—" "$OUTPUT_DIR/k6-output.log" 2>/dev/null || echo "0") + + # Clean the variables to remove any newlines or special characters + checks_passed=$(echo "$checks_passed" | tr -d '\n\r' | grep -o '[0-9]*' | head -1) + checks_failed=$(echo "$checks_failed" | tr -d '\n\r' | grep -o '[0-9]*' | head -1) + + # Set defaults if empty + checks_passed=${checks_passed:-0} + checks_failed=${checks_failed:-0} + + # Safe arithmetic calculation + local success_rate="N/A" + local calculated_error_rate="N/A" + + if [[ "$checks_passed" =~ ^[0-9]+$ ]] && [[ "$checks_failed" =~ ^[0-9]+$ ]]; then + local total_checks=$((checks_passed + checks_failed)) + if [[ $total_checks -gt 0 ]]; then + # Use awk for safer floating point arithmetic + success_rate=$(awk "BEGIN {printf \"%.2f\", $checks_passed * 100 / $total_checks}") + calculated_error_rate=$(awk "BEGIN {printf \"%.2f\", $checks_failed * 100 / $total_checks}") + fi + fi + + # Use calculated error rate if original parsing failed + if [[ "$error_rate" == "N/A" && "$calculated_error_rate" != "N/A" ]]; then + error_rate="$calculated_error_rate" + fi + + # Create summary report + cat > "$OUTPUT_DIR/test-summary.md" << EOF +# Load Test Results + +## Test Configuration +- **Virtual Users (VUs):** $VUS +- **Duration:** $DURATION +- **Target Services:** foo.localhost, bar.localhost +- **Job Status:** $job_status + +## Performance Metrics +- **Average Response Time:** ${avg_duration}ms +- **95th Percentile Response Time:** ${p95_duration}ms +- **Request Rate:** ${req_rate} req/s +- **Total Requests:** ${total_requests} +- **Success Rate:** ${success_rate}% +- **Error Rate:** ${error_rate}% + +## Test Status +$( + if [[ "$error_rate" =~ ^[0-9]+\.?[0-9]*$ ]] && (( $(awk "BEGIN {print ($error_rate < 10)}") )); then + echo "โœ… **PASSED** - Error rate below 10%" + elif [[ "$success_rate" =~ ^[0-9]+\.?[0-9]*$ ]] && (( $(awk "BEGIN {print ($success_rate > 90)}") )); then + echo "โœ… **PASSED** - Success rate above 90%" + else + echo "โŒ **FAILED** - Error rate above 10% or metrics unavailable" + fi +) + +## Debug Information +- Checks Passed: $checks_passed +- Checks Failed: $checks_failed +- Total Checks: $((checks_passed + checks_failed)) + +EOF + + echo "โœ… Test results processed" + + # Debug output + echo "๐Ÿ” Extracted metrics:" + echo " - Average Duration: $avg_duration" + echo " - P95 Duration: $p95_duration" + echo " - Request Rate: $req_rate" + echo " - Success Rate: $success_rate" + echo " - Error Rate: $error_rate" + + else + echo "โŒ No log file found to process" + return 1 + fi +} + +# Display test summary +display_test_summary() { + echo "" + echo "๐Ÿ“Š Load Test Summary:" + echo "====================" + + if [[ -f "$OUTPUT_DIR/test-summary.md" ]]; then + cat "$OUTPUT_DIR/test-summary.md" + else + echo "โŒ Test summary not found" + return 1 + fi + + echo "" + echo "๐Ÿ“ Results saved to: $OUTPUT_DIR/" + echo "๐Ÿ“‹ Full logs: $OUTPUT_DIR/k6-output.log" + echo "๐Ÿ“ˆ Summary: $OUTPUT_DIR/test-summary.md" +} + +# Cleanup test resources +cleanup_test_resources() { + echo "๐Ÿงน Cleaning up test resources..." + + kubectl delete job k6-load-test --context "$CONTEXT" --ignore-not-found=true + kubectl delete configmap k6-test-script --context "$CONTEXT" --ignore-not-found=true + + echo "โœ… Test resources cleaned up" +} + +wait_for_job_completion() { + echo "โณ Waiting for load test to complete..." + + local timeout=600 + local start_time=$(date +%s) + + while true; do + local current_time=$(date +%s) + local elapsed=$((current_time - start_time)) + + if [[ $elapsed -gt $timeout ]]; then + echo "โŒ Timeout waiting for job completion" + kubectl describe job k6-load-test --context "$CONTEXT" + return 1 + fi + + # Check job status + local job_status=$(kubectl get job k6-load-test --context "$CONTEXT" -o jsonpath='{.status.conditions[0].type}' 2>/dev/null || echo "Unknown") + local active_pods=$(kubectl get job k6-load-test --context "$CONTEXT" -o jsonpath='{.status.active}' 2>/dev/null || echo "0") + local succeeded_pods=$(kubectl get job k6-load-test --context "$CONTEXT" -o jsonpath='{.status.succeeded}' 2>/dev/null || echo "0") + local failed_pods=$(kubectl get job k6-load-test --context "$CONTEXT" -o jsonpath='{.status.failed}' 2>/dev/null || echo "0") + + echo "๐Ÿ“Š Job status: $job_status | Active: $active_pods | Succeeded: $succeeded_pods | Failed: $failed_pods (${elapsed}s elapsed)" + + if [[ "$job_status" == "Complete" || "$succeeded_pods" -gt 0 ]]; then + echo "โœ… Job completed successfully" + return 0 + elif [[ "$job_status" == "Failed" || "$failed_pods" -gt 0 ]]; then + echo "โŒ Job failed" + kubectl describe job k6-load-test --context "$CONTEXT" + kubectl logs -l job-name=k6-load-test --context "$CONTEXT" + return 1 + fi + + sleep 10 + done +} + +# Main execution +main() { + echo "๐Ÿ“‹ Load test configuration:" + echo " - Virtual Users: $VUS" + echo " - Duration: $DURATION" + echo " - Output Directory: $OUTPUT_DIR" + + # Verify prerequisites + if ! kubectl cluster-info --context "$CONTEXT" &>/dev/null; then + echo "โŒ Cluster $CLUSTER_NAME not found" + exit 1 + fi + + if ! kubectl get ingress echo-ingress --context "$CONTEXT" &>/dev/null; then + echo "โŒ Ingress not found. Run ./deploy-ingress.sh first" + exit 1 + fi + + # Run load test + create_k6_test_script + deploy_k6_test + + if ! wait_for_job_completion; then + echo "โŒ Load test job failed to complete" + cleanup_test_resources + exit 1 + fi + + if collect_test_results; then + display_test_summary + cleanup_test_resources + echo "๐ŸŽ‰ Load testing completed successfully!" + exit 0 + else + echo "โŒ Load testing failed" + cleanup_test_resources + exit 1 + fi +} + +main "$@" + diff --git a/scripts/provision-cluster.sh b/scripts/provision-cluster.sh new file mode 100755 index 0000000..e8655ad --- /dev/null +++ b/scripts/provision-cluster.sh @@ -0,0 +1,234 @@ +#!/bin/bash +set -euo pipefail + +CLUSTER_NAME="${CLUSTER_NAME:-mlops-test-cluster}" +CONFIG_FILE="../config/kind-cluster.yaml" +NODE_READY_TIMEOUT="${NODE_READY_TIMEOUT:-300}" +MAX_RETRIES=3 +RETRY_DELAY=10 + +echo "๐Ÿš€ Starting Kubernetes cluster provisioning..." + +# Enhanced function to wait for nodes to be ready +wait_for_nodes_ready() { + local cluster_name=$1 + local expected_nodes=3 + local timeout=$NODE_READY_TIMEOUT + local context="kind-${cluster_name}" + + echo "โณ Waiting for all $expected_nodes nodes to be ready (timeout: ${timeout}s)..." + + # Give initial time for nodes to register + echo "๐Ÿ”„ Initial wait for node registration..." + sleep 45 + + local start_time=$(date +%s) + while true; do + local current_time=$(date +%s) + local elapsed=$((current_time - start_time)) + + if [[ $elapsed -gt $timeout ]]; then + echo "โŒ Timeout waiting for nodes to be ready" + kubectl get nodes --context "$context" -o wide 2>/dev/null || true + return 1 + fi + + # Get node status with better error handling + local ready_nodes=0 + local total_nodes=0 + + if kubectl get nodes --context "$context" --no-headers &>/dev/null; then + ready_nodes=$(kubectl get nodes --context "$context" --no-headers 2>/dev/null | grep -c " Ready " || echo "0") + total_nodes=$(kubectl get nodes --context "$context" --no-headers 2>/dev/null | wc -l || echo "0") + else + echo "โš ๏ธ Cannot connect to cluster yet, retrying..." + sleep 10 + continue + fi + + echo "๐Ÿ“Š Node status: $ready_nodes/$total_nodes ready (${elapsed}s elapsed)" + + if [[ "$ready_nodes" -eq "$expected_nodes" ]] && [[ "$total_nodes" -eq "$expected_nodes" ]]; then + echo "โœ… All $expected_nodes nodes are ready!" + return 0 + fi + + sleep 15 + done +} + +# Function to validate basic cluster health +validate_cluster_basic() { + local cluster_name=$1 + local context="kind-${cluster_name}" + + echo "๐Ÿฅ Starting basic cluster validation..." + + # 1. Cluster API connectivity + echo "๐Ÿ” Validating cluster API connectivity..." + if ! kubectl cluster-info --context "$context" &> /dev/null; then + echo "โŒ Cluster API validation failed" + return 1 + fi + echo "โœ… Cluster API accessible" + + # 2. Node readiness verification + echo "๐Ÿ” Validating node readiness..." + local ready_nodes + ready_nodes=$(kubectl get nodes --context "$context" --no-headers | grep -c " Ready " || echo "0") + + if [[ "$ready_nodes" -lt 3 ]]; then + echo "โŒ Expected 3 nodes, found $ready_nodes ready nodes" + return 1 + fi + echo "โœ… All $ready_nodes nodes are ready" + + # 3. System pods validation + echo "๐Ÿ” Validating system pods..." + local system_pods_ready + system_pods_ready=$(kubectl get pods -n kube-system --context "$context" --no-headers | grep -c " Running " || echo "0") + local total_system_pods + total_system_pods=$(kubectl get pods -n kube-system --context "$context" --no-headers | wc -l || echo "0") + + if [[ "$system_pods_ready" -lt "$total_system_pods" ]]; then + echo "โŒ System pods not ready: $system_pods_ready/$total_system_pods running" + return 1 + fi + echo "โœ… All $system_pods_ready system pods are running" + + # 4. DNS functionality test + echo "๐Ÿ” Validating DNS functionality..." + if ! kubectl run dns-test --image=busybox --rm -i --restart=Never --context "$context" -- nslookup kubernetes.default.svc.cluster.local &>/dev/null; then + echo "โŒ DNS validation failed" + return 1 + fi + echo "โœ… DNS functionality verified" + + echo "๐ŸŽ‰ Basic cluster validation successful!" + return 0 +} + +# Function to cleanup existing cluster +cleanup_cluster() { + local cluster_name=$1 + echo "๐Ÿงน Cleaning up existing cluster: $cluster_name" + + if kind get clusters | grep -q "^${cluster_name}$"; then + kind delete cluster --name "$cluster_name" || true + sleep 5 + fi +} + +# Function to provision cluster with progressive validation +provision_cluster() { + local cluster_name=$1 + local config_file=$2 + local attempt=1 + + while [[ $attempt -le $MAX_RETRIES ]]; do + echo "๐Ÿ”„ Provisioning attempt $attempt/$MAX_RETRIES..." + + # Cleanup any existing cluster + cleanup_cluster "$cluster_name" + + # Create new cluster + if kind create cluster --name "$cluster_name" --config "$config_file" --wait 300s; then + echo "โœ… Cluster created successfully" + + # Progressive validation strategy + echo "๐Ÿ”„ Starting progressive validation..." + + # Step 1: Wait for basic node readiness + if wait_for_nodes_ready "$cluster_name"; then + echo "โœ… Node readiness validation passed" + + # Step 2: Basic cluster validation + if validate_cluster_basic "$cluster_name"; then + echo "๐ŸŽ‰ Cluster provisioning and validation completed successfully!" + return 0 + else + echo "โŒ Basic cluster validation failed" + fi + else + echo "โŒ Node readiness validation failed" + fi + else + echo "โŒ Cluster creation failed on attempt $attempt" + fi + + if [[ $attempt -lt $MAX_RETRIES ]]; then + echo "โณ Waiting ${RETRY_DELAY}s before retry..." + sleep $RETRY_DELAY + fi + + ((attempt++)) + done + + echo "โŒ Failed to provision cluster after $MAX_RETRIES attempts" + return 1 +} + +# Function to display cluster summary +display_cluster_summary() { + local cluster_name=$1 + local context="kind-${cluster_name}" + + echo "" + echo "๐Ÿ“Š Cluster Summary:" + echo "===================" + + echo "" + echo "๐Ÿ”— Cluster Information:" + kubectl cluster-info --context "$context" + + echo "" + echo "๐Ÿ–ฅ๏ธ Node Status:" + kubectl get nodes --context "$context" -o wide + + echo "" + echo "๐ŸŽฏ Cluster ready for ingress deployment!" +} + +# Main execution +main() { + echo "๐Ÿ“‹ Cluster provisioning configuration:" + echo " - Cluster name: $CLUSTER_NAME" + echo " - Config file: $CONFIG_FILE" + echo " - Node ready timeout: ${NODE_READY_TIMEOUT}s" + echo " - Max retries: $MAX_RETRIES" + + # Verify prerequisites + echo "๐Ÿ” Verifying prerequisites..." + + if ! command -v kind &> /dev/null; then + echo "โŒ kind not found. Please install kind first." + exit 1 + fi + echo "โœ… kind found: $(kind version)" + + if ! command -v kubectl &> /dev/null; then + echo "โŒ kubectl not found. Please install kubectl first." + exit 1 + fi + echo "โœ… kubectl found" + + if [[ ! -f "$CONFIG_FILE" ]]; then + echo "โŒ Config file not found: $CONFIG_FILE" + exit 1 + fi + echo "โœ… Config file found: $CONFIG_FILE" + + # Provision cluster + if provision_cluster "$CLUSTER_NAME" "$CONFIG_FILE"; then + display_cluster_summary "$CLUSTER_NAME" + echo "๐ŸŽฏ Next step: Run ./deploy-ingress.sh to deploy HTTP services" + exit 0 + else + echo "๐Ÿ’ฅ Cluster provisioning failed after all attempts" + exit 1 + fi +} + +# Execute main function +main "$@" + diff --git a/scripts/test-ingress.sh b/scripts/test-ingress.sh new file mode 100755 index 0000000..ad53926 --- /dev/null +++ b/scripts/test-ingress.sh @@ -0,0 +1,181 @@ +#!/bin/bash +set -euo pipefail + +CLUSTER_NAME="${CLUSTER_NAME:-mlops-test-cluster}" +CONTEXT="kind-${CLUSTER_NAME}" + +echo "๐Ÿงช Testing ingress HTTP connectivity..." + +# Test HTTP connectivity +test_http_connectivity() { + echo "๐Ÿ” Testing HTTP services connectivity..." + + local test_passed=true + + # Test foo service + echo "Testing foo.localhost..." + if curl -s -H "Host: foo.localhost" http://localhost/ | grep -q "foo"; then + echo "โœ… foo.localhost: SUCCESS (returns 'foo')" + else + echo "โŒ foo.localhost: FAILED" + test_passed=false + fi + + # Test bar service + echo "Testing bar.localhost..." + if curl -s -H "Host: bar.localhost" http://localhost/ | grep -q "bar"; then + echo "โœ… bar.localhost: SUCCESS (returns 'bar')" + else + echo "โŒ bar.localhost: FAILED" + test_passed=false + fi + + if [[ "$test_passed" == true ]]; then + echo "๐ŸŽ‰ All HTTP connectivity tests passed!" + return 0 + else + echo "โŒ Some HTTP connectivity tests failed" + return 1 + fi +} + +# Test ingress health +test_ingress_health() { + echo "๐Ÿ” Testing ingress controller health..." + + # Check ingress controller pods + local ingress_pods_ready + ingress_pods_ready=$(kubectl get pods -n ingress-nginx --context "$CONTEXT" --no-headers | grep -c " Running " || echo "0") + + if [[ "$ingress_pods_ready" -gt 0 ]]; then + echo "โœ… Ingress controller pods: $ingress_pods_ready running" + else + echo "โŒ No ingress controller pods running" + return 1 + fi + + # Check ingress resource + local ingress_count + ingress_count=$(kubectl get ingress --context "$CONTEXT" --no-headers | wc -l || echo "0") + + if [[ "$ingress_count" -gt 0 ]]; then + echo "โœ… Ingress resources: $ingress_count configured" + else + echo "โŒ No ingress resources found" + return 1 + fi + + echo "โœ… Ingress health check passed" + return 0 +} + +# Test service endpoints +test_service_endpoints() { + echo "๐Ÿ” Testing service endpoints..." + + # Check foo service endpoints + local foo_endpoints + foo_endpoints=$(kubectl get endpoints foo-service --context "$CONTEXT" -o jsonpath='{.subsets[0].addresses}' 2>/dev/null | jq length 2>/dev/null || echo "0") + + if [[ "$foo_endpoints" -gt 0 ]]; then + echo "โœ… foo-service endpoints: $foo_endpoints ready" + else + echo "โŒ foo-service has no ready endpoints" + return 1 + fi + + # Check bar service endpoints + local bar_endpoints + bar_endpoints=$(kubectl get endpoints bar-service --context "$CONTEXT" -o jsonpath='{.subsets[0].addresses}' 2>/dev/null | jq length 2>/dev/null || echo "0") + + if [[ "$bar_endpoints" -gt 0 ]]; then + echo "โœ… bar-service endpoints: $bar_endpoints ready" + else + echo "โŒ bar-service has no ready endpoints" + return 1 + fi + + echo "โœ… Service endpoints check passed" + return 0 +} + +# Comprehensive testing +run_comprehensive_tests() { + echo "๐Ÿงช Running comprehensive ingress tests..." + + local all_tests_passed=true + + # Test 1: Ingress health + if ! test_ingress_health; then + all_tests_passed=false + fi + + # Test 2: Service endpoints + if ! test_service_endpoints; then + all_tests_passed=false + fi + + # Test 3: HTTP connectivity + if ! test_http_connectivity; then + all_tests_passed=false + fi + + if [[ "$all_tests_passed" == true ]]; then + echo "๐ŸŽ‰ All ingress tests passed successfully!" + echo "๐ŸŽฏ Next step: Run ./load-test.sh to perform load testing" + return 0 + else + echo "โŒ Some ingress tests failed" + return 1 + fi +} + +# Debug information +show_debug_info() { + echo "" + echo "๐Ÿ” Debug Information:" + echo "====================" + + echo "" + echo "๐Ÿ“Š Ingress Controller Status:" + kubectl get pods -n ingress-nginx --context "$CONTEXT" + + echo "" + echo "๐Ÿ”— Services:" + kubectl get services --context "$CONTEXT" + + echo "" + echo "๐ŸŒ Ingress:" + kubectl describe ingress echo-ingress --context "$CONTEXT" + + echo "" + echo "๐Ÿ“ก Endpoints:" + kubectl get endpoints --context "$CONTEXT" +} + +# Main execution +main() { + # Verify cluster and ingress exist + if ! kubectl cluster-info --context "$CONTEXT" &>/dev/null; then + echo "โŒ Cluster $CLUSTER_NAME not found" + exit 1 + fi + + if ! kubectl get ingress echo-ingress --context "$CONTEXT" &>/dev/null; then + echo "โŒ Ingress not found. Run ./deploy-ingress.sh first" + exit 1 + fi + + # Run tests + if run_comprehensive_tests; then + echo "โœ… Ingress testing completed successfully!" + exit 0 + else + echo "โŒ Ingress testing failed" + show_debug_info + exit 1 + fi +} + +main "$@" +