
Commit

update benchmarks (#89)
Signed-off-by: Dmitry Shmulevich <[email protected]>
dmitsh authored Aug 2, 2024

1 parent 52c3059 commit c718051
Showing 12 changed files with 342 additions and 54 deletions.
10 changes: 2 additions & 8 deletions docs/examples/runai/runai.md
@@ -23,18 +23,12 @@ Navigate to the Run:ai portal and create a new project. Upon creating the projec

This script deploys a `kind` cluster if necessary, followed by `KWOK` and `Prometheus`, and then prompts you to select a workload manager; choose the `run:ai` option.

4. **Update KWOK stage**:

Update the pod-complete stage by running the following command
```bash
kubectl apply -f ./charts/overrides/kwok/pod-complete.yaml
```

5. **Replace cluster UID and project name in the sample workflow files**:
4. **Replace cluster UID and project name in the sample workflow files**:

Update the sample workflow files [test-trainingworkload.yml](../../../resources/workflows/runai/test-trainingworkload.yml#L40-L41) and [test-distributedworkload.yml](../../../resources/workflows/runai/test-distributedworkload.yml#L40-L41) by replacing `<RUNAI_CLUSTER_ID>` with the cluster UID and `<RUNAI_PROJECT>` with the project name.
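
If you prefer to script the substitution, something along these lines should work (a sketch; the UID and project values below are placeholders, not real ones):
```bash
# Placeholders: substitute the cluster UID and project name from your Run:ai portal.
RUNAI_CLUSTER_ID="00000000-0000-0000-0000-000000000000"
RUNAI_PROJECT="demo-project"

# GNU sed; on macOS use `sed -i ''` instead of `sed -i`.
sed -i \
  -e "s/<RUNAI_CLUSTER_ID>/${RUNAI_CLUSTER_ID}/g" \
  -e "s/<RUNAI_PROJECT>/${RUNAI_PROJECT}/g" \
  resources/workflows/runai/test-trainingworkload.yml \
  resources/workflows/runai/test-distributedworkload.yml
```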

6. **Run the workflows**
5. **Run the workflows**

Run a Run:ai training workload:
```bash
137 changes: 137 additions & 0 deletions resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml
@@ -0,0 +1,137 @@
name: test-gang-scheduling
tasks:
- id: register-trainingworkload
type: RegisterObj
params:
template: "resources/benchmarks/templates/runai/trainingworkload.yml"
nameFormat: "twl{{._ENUM_}}"
podNameFormat: "{{._NAME_}}-0-0"
podCount: 1
- id: register-distributedworkload
type: RegisterObj
params:
template: "resources/benchmarks/templates/runai/distributedworkload.yml"
nameFormat: "dwl{{._ENUM_}}"
podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
podCount: "{{.workers}} + 1"
#
### Benchmark test
#
- id: job1
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 1
params:
workers: 31
ttl: 30s
- id: job2
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 2
params:
workers: 15
ttl: 30s
- id: job3
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 3
params:
workers: 9
ttl: 30s
- id: job3.1
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 1
params:
workers: 1
ttl: 30s
- id: job4
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 4
params:
workers: 7
ttl: 30s
- id: job5
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 5
params:
workers: 5
ttl: 30s
- id: job5.1
type: SubmitObj
params:
refTaskId: register-trainingworkload
count: 2
params:
ttl: 30s
- id: job6
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 6
params:
workers: 4
ttl: 30s
- id: job6.1
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 1
params:
workers: 1
ttl: 30s
- id: job7
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 7
params:
workers: 3
ttl: 30s
- id: job7.1
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 1
params:
workers: 1
ttl: 30s
- id: job7.2
type: SubmitObj
params:
refTaskId: register-trainingworkload
count: 2
params:
ttl: 30s
- id: job8
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 8
params:
workers: 3
ttl: 30s
- id: job9
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 9
params:
workers: 2
ttl: 30s
- id: job9.1
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 1
params:
workers: 4
ttl: 30s
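
Note that each wave of submissions above totals 32 pods: for example, job3 creates 3 × (9 workers + 1 launcher) = 30 pods and job3.1 adds 2 more, matching the single 32-pod job1. A hypothetical extra wave of the same size could follow the same pattern (illustrative only; this task is not part of the commit):
```yaml
- id: job10
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 16          # 16 jobs x (1 worker + 1 launcher) = 32 pods
    params:
      workers: 1
      ttl: 30s
```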
103 changes: 70 additions & 33 deletions resources/benchmarks/gang-scheduling/workflows/run-test.yml
@@ -4,7 +4,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/k8s/job.yml"
# template: "resources/benchmarks/templates/k8s/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-[0-9]-.*"
# podCount: "{{.replicas}}"
@@ -13,7 +13,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/k8s/jobset.yml"
# template: "resources/benchmarks/templates/k8s/jobset.yml"
# nameFormat: "jobset{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-workers-[0-9]+-[0-9]+-.+"
# podCount: "{{.replicas}}"
@@ -34,7 +34,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/kueue/job.yml"
# template: "resources/benchmarks/templates/kueue/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-[0-9]-.*"
# podCount: "{{.replicas}}"
@@ -69,39 +69,61 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/volcano/job.yml"
# template: "resources/benchmarks/templates/volcano/job.yml"
# nameFormat: "j{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-test-[0-9]+"
# podCount: "{{.replicas}}"
#- id: configure
# type: Configure
# params:
# configmaps:
# - name: volcano-scheduler-configmap
# namespace: volcano-system
# op: create
# data:
# volcano-scheduler.conf: |
# actions: "enqueue, allocate, backfill"
# tiers:
# - plugins:
# - name: priority
# - name: gang
# - name: conformance
# - plugins:
# - name: drf
# - name: predicates
# - name: proportion
# - name: nodeorder
# - name: binpack
# timeout: 1m

### Yunikorn
- id: register
type: RegisterObj
params:
template: "resources/benchmarks/gang-scheduling/templates/yunikorn/job.yml"
nameFormat: "job{{._ENUM_}}"
podNameFormat: "{{._NAME_}}-.*"
podCount: "{{.replicas}}"
- id: configure
type: Configure
params:
configmaps:
- name: yunikorn-configs
namespace: yunikorn
op: create
data:
queues.yaml: |
partitions:
- name: default
queues:
- name: root
queues:
- name: sandbox
submitacl: '*'
resources:
max:
{memory: 36Gi, vcore: 8000m, nvidia.com/gpu: 256}
timeout: 1m
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/templates/yunikorn/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-.*"
# podCount: "{{.replicas}}"
#- id: configure
# type: Configure
# params:
# configmaps:
# - name: yunikorn-configs
# namespace: yunikorn
# op: create
# data:
# queues.yaml: |
# partitions:
# - name: default
# queues:
# - name: root
# queues:
# - name: sandbox
# submitacl: '*'
# resources:
# max:
# {memory: 36Gi, vcore: 8000m, nvidia.com/gpu: 256}
# timeout: 1m
#
### Benchmark test
#
@@ -112,101 +134,116 @@ tasks:
count: 1
params:
replicas: 32
ttl: 30s
- id: job2
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 16
ttl: 30s
- id: job3
type: SubmitObj
params:
refTaskId: register
count: 3
params:
replicas: 10
ttl: 30s
- id: job3.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
replicas: 2
ttl: 30s
- id: job4
type: SubmitObj
params:
refTaskId: register
count: 4
params:
replicas: 8
replicas: 8
ttl: 30s
- id: job5
type: SubmitObj
params:
refTaskId: register
count: 5
params:
replicas: 6
ttl: 30s
- id: job5.1
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 1
ttl: 30s
- id: job6
type: SubmitObj
params:
refTaskId: register
count: 6
params:
replicas: 5
ttl: 30s
- id: job6.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
ttl: 30s
- id: job7
type: SubmitObj
params:
refTaskId: register
count: 7
params:
replicas: 4
ttl: 30s
- id: job7.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
ttl: 30s
- id: job7.2
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 1
ttl: 30s
- id: job8
type: SubmitObj
params:
refTaskId: register
count: 8
params:
replicas: 4
ttl: 30s
- id: job9
type: SubmitObj
params:
refTaskId: register
count: 9
params:
replicas: 3
ttl: 30s
- id: job9.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 5
ttl: 30s
@@ -25,7 +25,8 @@ spec:
template:
metadata:
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: "30s"
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
spec:
schedulerName: default-scheduler
containers:
@@ -36,7 +36,8 @@ spec:
template:
metadata:
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: "30s"
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
spec:
schedulerName: default-scheduler
containers:
@@ -26,7 +26,8 @@ spec:
template:
metadata:
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: "30s"
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
spec:
containers:
- name: test
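
With these edits the simulated completion time is controlled by the workflow's `ttl` parameter rather than a fixed 30s. When a SubmitObj task passes `ttl: 30s` (as the benchmark workflows above do), the annotations render roughly as follows (a sketch of the rendered output, not a file in this commit):
```yaml
annotations:
  pod-complete.stage.kwok.x-k8s.io/delay: 30s
  pod-complete.stage.kwok.x-k8s.io/jitter-delay: 30s
```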
64 changes: 64 additions & 0 deletions resources/benchmarks/templates/runai/distributedworkload.yml
@@ -0,0 +1,64 @@
apiVersion: run.ai/v2alpha1
kind: DistributedWorkload
metadata:
name: "{{._NAME_}}"
namespace: runai-<RUNAI_PROJECT>
annotations:
clusterId: <RUNAI_CLUSTER_ID>
labels:
project: <RUNAI_PROJECT>
masterSpec:
name:
value: "{{._NAME_}}"
image:
value: ubuntu
imagePullPolicy:
value: Always
cpu:
value: 100m
memory:
value: 256M
gpuDevices:
value: 8
largeShm:
value: false
nodePools:
value: default
runAsUser:
value: true
usage: Submit
autoDeletionTimeAfterCompletionSeconds:
value: 2592000
spec:
annotations:
items:
clusterId:
value: <RUNAI_CLUSTER_ID>
pod-complete.stage.kwok.x-k8s.io/delay:
value: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay:
value: {{.ttl}}
name:
value: "{{._NAME_}}"
jobType: MPIJob
image:
value: ubuntu
imagePullPolicy:
value: Always
cpu:
value: 100m
memory:
value: 256M
gpuDevices:
value: 8
workers:
value: {{.workers}}
largeShm:
value: false
nodePools:
value: default
runAsUser:
value: true
usage: Submit
autoDeletionTimeAfterCompletionSeconds:
value: 2592000
35 changes: 35 additions & 0 deletions resources/benchmarks/templates/runai/trainingworkload.yml
@@ -0,0 +1,35 @@
apiVersion: run.ai/v2alpha1
kind: TrainingWorkload
metadata:
name: "{{._NAME_}}"
namespace: runai-<RUNAI_PROJECT>
annotations:
clusterId: <RUNAI_CLUSTER_ID>
labels:
project: <RUNAI_PROJECT>
spec:
name:
value: "{{._NAME_}}"
image:
value: ubuntu
imagePullPolicy:
value: IfNotPresent
active:
value: true
annotations:
items:
clusterId:
value: <RUNAI_CLUSTER_ID>
pod-complete.stage.kwok.x-k8s.io/delay:
value: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay:
value: {{.ttl}}
cpu:
value: 100m
memory:
value: 256M
gpuDevices:
value: 8
nodePools:
value: default
usage: Submit
@@ -36,7 +36,8 @@ spec:
metadata:
name: test
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: "30s"
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
spec:
containers:
- name: job
@@ -51,4 +52,3 @@ spec:
cpu: 100m
memory: 256M
nvidia.com/gpu: "8"
restartPolicy: OnFailure
@@ -1,4 +1,3 @@

# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,7 +26,23 @@ spec:
applicationId: "test-{{._NAME_}}"
queue: root.sandbox
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: "30s"
yunikorn.apache.org/task-group-name: group-{{._NAME_}}
yunikorn.apache.org/schedulingPolicyParameters: "gangSchedulingStyle=Hard"
yunikorn.apache.org/task-groups: |-
[{
"name": "group-{{._NAME_}}",
"minMember": {{.replicas}},
"minResource": {
"cpu": "1",
"memory": "500M",
"nvidia.com/gpu": "8"
},
"nodeSelector": {},
"tolerations": [],
"affinity": {}
}]
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
spec:
schedulerName: yunikorn
containers:
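
These annotations opt the pods into YuniKorn gang scheduling: the task group declares how many members must be placeable before the job starts, and `gangSchedulingStyle=Hard` means the job fails rather than falling back to regular scheduling if the placeholders cannot be reserved. For a job registered with `replicas: 32`, the annotation renders roughly as below, i.e. YuniKorn must reserve 32 × 8 = 256 GPUs up front, which lines up with the 256-GPU cap on the root.sandbox queue in run-test.yml (a sketch with illustrative values; "job1" follows the `job{{._ENUM_}}` name format):
```yaml
annotations:
  yunikorn.apache.org/task-group-name: group-job1
  yunikorn.apache.org/schedulingPolicyParameters: "gangSchedulingStyle=Hard"
  yunikorn.apache.org/task-groups: |-
    [{
      "name": "group-job1",
      "minMember": 32,
      "minResource": {"cpu": "1", "memory": "500M", "nvidia.com/gpu": "8"}
    }]
```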
1 change: 1 addition & 0 deletions scripts/create-test-cluster.sh
@@ -42,6 +42,7 @@ fi
deploy_prometheus

deploy_kwok
kubectl apply -f $REPO_HOME/charts/overrides/kwok/pod-complete.yaml

echo ""
printYellow "Select workload manager or leave it blank to skip:"
14 changes: 8 additions & 6 deletions scripts/env.sh
@@ -60,7 +60,7 @@ function deploy_kwok() {
kubectl apply -f https://github.com/${KWOK_REPO}/releases/download/${KWOK_RELEASE}/stage-fast.yaml
kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/chaos/pod-init-container-running-failed.yaml
kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/chaos/pod-container-running-failed.yaml
kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/general/pod-complete.yaml
#kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/general/pod-complete.yaml
}

# Prometheus
@@ -85,7 +85,7 @@ function deploy_prometheus() {
--set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
--set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false

kubectl -n monitoring wait --for=condition=ready pod -l app.kubernetes.io/instance=kube-prometheus-stack --timeout=3m
kubectl -n monitoring wait --for=condition=ready pod -l app.kubernetes.io/instance=kube-prometheus-stack --timeout=300s
}

# Tested workload managers
@@ -98,7 +98,8 @@ function deploy_jobset() {
printGreen Deploying jobset

kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION}/manifests.yaml
kubectl -n jobset-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=60s

kubectl -n jobset-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=300s
}

# https://github.com/kubernetes-sigs/kueue
@@ -108,7 +109,8 @@ function deploy_kueue() {
printGreen Deploying kueue

kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
kubectl -n kueue-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=60s

kubectl -n kueue-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=300s
}

# https://github.com/volcano-sh/volcano
@@ -123,7 +125,7 @@ function deploy_volcano() {
--version=$VOLCANO_VERSION --wait

for app in volcano-admission volcano-controller volcano-scheduler; do
kubectl -n volcano-system wait --for=condition=ready pod -l app=$app --timeout=60s
kubectl -n volcano-system wait --for=condition=ready pod -l app=$app --timeout=300s
done

# Wait until volcano webhook is ready
@@ -142,7 +144,7 @@ function deploy_yunikorn() {
helm upgrade --install yunikorn yunikorn/yunikorn -n yunikorn --create-namespace \
--version=$YUNIKORN_VERSION --wait

kubectl -n yunikorn wait --for=condition=ready pod -l app=yunikorn --timeout=60s
kubectl -n yunikorn wait --for=condition=ready pod -l app=yunikorn --timeout=300s
}

# https://www.run.ai/
