@@ -184,3 +184,112 @@ periodics:
limits:
cpu: 2
memory: "2Gi"
# Experimental tests for larger resource size as proposed in https://github.com/kubernetes/kubernetes/issues/134375
- cron: '1 17 2-31/2 * *' # Run on even days at 9:01 PST (17:01 UTC)
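# Cron fields: minute=1, hour=17 (UTC), day-of-month=2-31/2, i.e. every even
# day of the month.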
name: ci-kubernetes-e2e-gce-scale-resource-size
tags:
- "perfDashPrefix: gce-5000Nodes-ResourceSize"
- "perfDashBuildsCount: 270"
- "perfDashJobType: performance"
cluster: k8s-infra-prow-build
labels:
preset-service-account: "true"
preset-k8s-ssh: "true"
preset-e2e-scalability-common: "true"
preset-e2e-scalability-periodics: "true"
preset-e2e-scalability-periodics-master: "true"
decorate: true
decoration_config:
timeout: 450m
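# Note: this pod timeout (450m) deliberately exceeds the kubetest
# --timeout=420m below, leaving ~30m of headroom for cluster teardown and
# log export.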
extra_refs:
- org: kubernetes
repo: kubernetes
base_ref: master
path_alias: k8s.io/kubernetes
- org: kubernetes
repo: perf-tests
base_ref: master
path_alias: k8s.io/perf-tests
annotations:
testgrid-dashboards: sig-scalability-experiments
testgrid-tab-name: gce-master-scale-resource-size
description: "Experimental tests for larger resource size as proposed in https://github.com/kubernetes/kubernetes/issues/134375"
spec:
volumes:
- name: cache-secret
secret:
secretName: scale-pull-cache-token
containers:
- image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20251016-39cf27682d-master
volumeMounts:
- name: cache-secret
readOnly: true
mountPath: /etc/registry-auth
env:
- name: KUBERNETES_REGISTRY_PULL_THROUGH_HOST
value: https://us-central1-docker.pkg.dev/v2/k8s-infra-e2e-scale-5k-project/k8s-5k-scale-cache/
- name: KUBERNETES_REGISTRY_PULL_THROUGH_BASIC_AUTH_TOKEN_PATH
value: /etc/registry-auth/token
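# Assumption inferred from the variable names (not verified against the
# bring-up code): the cluster is expected to pull test images through the
# Artifact Registry remote cache above, authenticating with the basic-auth
# token mounted from the scale-pull-cache-token secret.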
command:
- runner.sh
- /workspace/scenarios/kubernetes_e2e.py
args:
- --cluster=gce-scale-cluster
- --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
# TODO(mborsz): Adjust or remove this change once we understand the coredns
# memory usage regression.
- --env=KUBE_DNS_MEMORY_LIMIT=300Mi
- --extract=ci/fast/latest-fast
- --gcp-nodes=5000
- --gcp-project-type=scalability-scale-project
- --gcp-zone=us-east1-b
- --provider=gce
- --metadata-sources=cl2-metadata.json
- --env=CL2_LOAD_TEST_THROUGHPUT=50
- --env=CL2_DELETE_TEST_THROUGHPUT=50
- --env=CL2_RATE_LIMIT_POD_CREATION=false
- --env=KUBE_CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --endpointslice-updates-batch-period=500ms --endpoint-updates-batch-period=500ms
# Overrides CONTROLLER_MANAGER_TEST_ARGS from preset-e2e-scalability-periodics.
- --env=CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=100 --kube-api-burst=100
# Overrides SCHEDULER_TEST_ARGS from preset-e2e-scalability-periodics.
# TODO(#1311): Clean this up after the experiment - it should allow us
# to hugely decrease pod-startup-latency across the whole test.
# Given that individual controllers have separate QPS limits, we allow the
# scheduler to keep up with the load from deployments, daemonsets and jobs
# performing pod creations at once.
- --env=SCHEDULER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=500 --kube-api-burst=500
# With APF, only the sum of --max-requests-inflight and --max-mutating-requests-inflight matters, so we set --max-mutating-requests-inflight to 0.
- --env=APISERVER_TEST_ARGS=--max-requests-inflight=640 --max-mutating-requests-inflight=0
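# For reference: with APF enabled, the two flags above are pooled, so the
# effective concurrency budget is 640 + 0 = 640 in-flight requests, divided
# among APF priority levels rather than by the mutating/read-only split.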
- --env=CL2_ENABLE_API_AVAILABILITY_MEASUREMENT=true
- --env=CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD=99.5
- --env=CL2_DAEMONSET_POD_PAYLOAD_SIZE=1024
- --env=CL2_DEPLOYMENT_POD_PAYLOAD_SIZE=1024
- --env=CL2_STATEFULSET_POD_PAYLOAD_SIZE=1024
- --env=CL2_JOB_POD_PAYLOAD_SIZE=1024
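# The CL2_*_POD_PAYLOAD_SIZE values above pad each workload's pods with
# roughly 1KiB of extra payload (assumption: the unit is bytes), emulating
# the larger resource sizes discussed in kubernetes/kubernetes#134375.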
- --test=false
- --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
- --test-cmd-args=cluster-loader2
- --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
- --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
- --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
- --test-cmd-args=--nodes=5000
- --test-cmd-args=--prometheus-scrape-node-exporter
- --test-cmd-args=--provider=gce
- --test-cmd-args=--report-dir=$(ARTIFACTS)
- --test-cmd-args=--testconfig=testing/load/config.yaml
- --test-cmd-args=--testconfig=testing/huge-service/config.yaml
- --test-cmd-args=--testconfig=testing/access-tokens/config.yaml
- --test-cmd-args=--testoverrides=./testing/experiments/enable_restart_count_check.yaml
- --test-cmd-args=--testoverrides=./testing/experiments/ignore_known_gce_container_restarts.yaml
- --test-cmd-args=--testoverrides=./testing/overrides/5000_nodes.yaml
- --test-cmd-name=ClusterLoaderV2
Inline review comment from @alaypatel07 (Contributor), Oct 22, 2025:

    Can we also use this job to test the 5k DRA test?

    I have been trying for weeks to get this job to run at larger scale, but
    have been failing to get attention. If the job can be invoked here, it
    would be very helpful. The 100-node config here has been stable.

    cc @pohly @johnbelamaric @klueska

- --timeout=420m
- --use-logexporter
- --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
resources:
requests:
cpu: 6
memory: "16Gi"
limits:
cpu: 6
memory: "16Gi"
@@ -1166,113 +1166,3 @@ periodics:
limits:
cpu: 3
memory: "8Gi"

# Exploratory tests for resource size limit as proposed in https://github.com/kubernetes/kubernetes/issues/134375
- cron: '1 17 2-31/2 * *' # Run on even days at 9:01 PST (17:01 UTC)
name: ci-kubernetes-e2e-gce-scale-resource-size
tags:
- "perfDashPrefix: gce-5000Nodes-ResourceSize"
- "perfDashBuildsCount: 270"
- "perfDashJobType: performance"
cluster: k8s-infra-prow-build
labels:
preset-service-account: "true"
preset-k8s-ssh: "true"
preset-e2e-scalability-common: "true"
preset-e2e-scalability-periodics: "true"
preset-e2e-scalability-periodics-master: "true"
decorate: true
decoration_config:
timeout: 450m
extra_refs:
- org: kubernetes
repo: kubernetes
base_ref: master
path_alias: k8s.io/kubernetes
- org: kubernetes
repo: perf-tests
base_ref: master
path_alias: k8s.io/perf-tests
annotations:
testgrid-dashboards: sig-scalability-gce, google-gce
testgrid-tab-name: gce-master-scale-resource-size
description: "Exploratory tests for resource size limit as proposed in https://github.com/kubernetes/kubernetes/issues/134375"
spec:
volumes:
- name: cache-secret
secret:
secretName: scale-pull-cache-token
containers:
- image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20251016-39cf27682d-master
volumeMounts:
- name: cache-secret
readOnly: true
mountPath: /etc/registry-auth
env:
- name: KUBERNETES_REGISTRY_PULL_THROUGH_HOST
value: https://us-central1-docker.pkg.dev/v2/k8s-infra-e2e-scale-5k-project/k8s-5k-scale-cache/
- name: KUBERNETES_REGISTRY_PULL_THROUGH_BASIC_AUTH_TOKEN_PATH
value: /etc/registry-auth/token
command:
- runner.sh
- /workspace/scenarios/kubernetes_e2e.py
args:
- --cluster=gce-scale-cluster
- --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
# TODO(mborsz): Adjust or remove this change once we understand the coredns
# memory usage regression.
- --env=KUBE_DNS_MEMORY_LIMIT=300Mi
- --extract=ci/fast/latest-fast
- --gcp-nodes=5000
- --gcp-project-type=scalability-scale-project
- --gcp-zone=us-east1-b
- --provider=gce
- --metadata-sources=cl2-metadata.json
- --env=CL2_LOAD_TEST_THROUGHPUT=50
- --env=CL2_DELETE_TEST_THROUGHPUT=50
- --env=CL2_RATE_LIMIT_POD_CREATION=false
- --env=KUBE_CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --endpointslice-updates-batch-period=500ms --endpoint-updates-batch-period=500ms
# Overrides CONTROLLER_MANAGER_TEST_ARGS from preset-e2e-scalability-periodics.
- --env=CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=100 --kube-api-burst=100
# Overrides SCHEDULER_TEST_ARGS from preset-e2e-scalability-periodics.
# TODO(#1311): Clean this up after the experiment - it should allow us
# to hugely decrease pod-startup-latency across the whole test.
# Given that individual controllers have separate QPS limits, we allow the
# scheduler to keep up with the load from deployments, daemonsets and jobs
# performing pod creations at once.
- --env=SCHEDULER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=500 --kube-api-burst=500
# With APF, only the sum of --max-requests-inflight and --max-mutating-requests-inflight matters, so we set --max-mutating-requests-inflight to 0.
- --env=APISERVER_TEST_ARGS=--max-requests-inflight=640 --max-mutating-requests-inflight=0
- --env=CL2_ENABLE_API_AVAILABILITY_MEASUREMENT=true
- --env=CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD=99.5
- --env=CL2_DAEMONSET_POD_PAYLOAD_SIZE=1024
- --env=CL2_DEPLOYMENT_POD_PAYLOAD_SIZE=1024
- --env=CL2_STATEFULSET_POD_PAYLOAD_SIZE=1024
- --env=CL2_JOB_POD_PAYLOAD_SIZE=1024
- --test=false
- --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
- --test-cmd-args=cluster-loader2
- --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
- --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
- --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
- --test-cmd-args=--nodes=5000
- --test-cmd-args=--prometheus-scrape-node-exporter
- --test-cmd-args=--provider=gce
- --test-cmd-args=--report-dir=$(ARTIFACTS)
- --test-cmd-args=--testconfig=testing/load/config.yaml
- --test-cmd-args=--testconfig=testing/huge-service/config.yaml
- --test-cmd-args=--testconfig=testing/access-tokens/config.yaml
- --test-cmd-args=--testoverrides=./testing/experiments/enable_restart_count_check.yaml
- --test-cmd-args=--testoverrides=./testing/experiments/ignore_known_gce_container_restarts.yaml
- --test-cmd-args=--testoverrides=./testing/overrides/5000_nodes.yaml
- --test-cmd-name=ClusterLoaderV2
- --timeout=420m
- --use-logexporter
- --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
resources:
requests:
cpu: 6
memory: "16Gi"
limits:
cpu: 6
memory: "16Gi"