
Commit

update benchmarks (#89)
Signed-off-by: Dmitry Shmulevich <[email protected]>
dmitsh authored Aug 2, 2024

1 parent 52c3059 commit c718051
Showing 12 changed files with 342 additions and 54 deletions.
10 changes: 2 additions & 8 deletions docs/examples/runai/runai.md
@@ -23,18 +23,12 @@ Navigate to the Run:ai portal and create a new project. Upon creating the projec

This script deploys a `kind` cluster if necessary, followed by `KWOK` and `Prometheus`, and then prompts you to select a workload manager; choose the `run:ai` option.

4. **Update KWOK stage**:

Update the pod-complete stage by running the following command
```bash
kubectl apply -f ./charts/overrides/kwok/pod-complete.yaml
```

5. **Replace cluster UID and project name in the sample workflow files**:
4. **Replace cluster UID and project name in the sample workflow files**:

Update the sample workflow files [test-trainingworkload.yml](../../../resources/workflows/runai/test-trainingworkload.yml#L40-L41) and [test-distributedworkload.yml](../../../resources/workflows/runai/test-distributedworkload.yml#L40-L41) by replacing `<RUNAI_CLUSTER_ID>` with the cluster UID and `<RUNAI_PROJECT>` with the project name.
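
If you prefer to script the substitution, something along these lines should work (a sketch; the UID and project values below are placeholders, not real ones):
```bash
# Placeholders: substitute the cluster UID and project name from your Run:ai portal.
RUNAI_CLUSTER_ID="00000000-0000-0000-0000-000000000000"
RUNAI_PROJECT="demo-project"

# GNU sed; on macOS use `sed -i ''` instead of `sed -i`.
sed -i \
  -e "s/<RUNAI_CLUSTER_ID>/${RUNAI_CLUSTER_ID}/g" \
  -e "s/<RUNAI_PROJECT>/${RUNAI_PROJECT}/g" \
  resources/workflows/runai/test-trainingworkload.yml \
  resources/workflows/runai/test-distributedworkload.yml
```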

6. **Run the workflows**
5. **Run the workflows**

Run a Run:ai training workload:
```bash
137 changes: 137 additions & 0 deletions resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml
@@ -0,0 +1,137 @@
name: test-gang-scheduling
tasks:
- id: register-trainingworkload
type: RegisterObj
params:
template: "resources/benchmarks/templates/runai/trainingworkload.yml"
nameFormat: "twl{{._ENUM_}}"
podNameFormat: "{{._NAME_}}-0-0"
podCount: 1
- id: register-distributedworkload
type: RegisterObj
params:
template: "resources/benchmarks/templates/runai/distributedworkload.yml"
nameFormat: "dwl{{._ENUM_}}"
podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
podCount: "{{.workers}} + 1"
#
### Benchmark test
#
- id: job1
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 1
params:
workers: 31
ttl: 30s
- id: job2
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 2
params:
workers: 15
ttl: 30s
- id: job3
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 3
params:
workers: 9
ttl: 30s
- id: job3.1
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 1
params:
workers: 1
ttl: 30s
- id: job4
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 4
params:
workers: 7
ttl: 30s
- id: job5
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 5
params:
workers: 5
ttl: 30s
- id: job5.1
type: SubmitObj
params:
refTaskId: register-trainingworkload
count: 2
params:
ttl: 30s
- id: job6
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 6
params:
workers: 4
ttl: 30s
- id: job6.1
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 1
params:
workers: 1
ttl: 30s
- id: job7
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 7
params:
workers: 3
ttl: 30s
- id: job7.1
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 1
params:
workers: 1
ttl: 30s
- id: job7.2
type: SubmitObj
params:
refTaskId: register-trainingworkload
count: 2
params:
ttl: 30s
- id: job8
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 8
params:
workers: 3
ttl: 30s
- id: job9
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 9
params:
workers: 2
ttl: 30s
- id: job9.1
type: SubmitObj
params:
refTaskId: register-distributedworkload
count: 1
params:
workers: 4
ttl: 30s
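
Note that each wave of submissions above totals 32 pods: for example, job3 creates 3 × (9 workers + 1 launcher) = 30 pods and job3.1 adds 2 more, matching the single 32-pod job1. A hypothetical extra wave of the same size could follow the same pattern (illustrative only; this task is not part of the commit):
```yaml
- id: job10
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 16          # 16 jobs x (1 worker + 1 launcher) = 32 pods
    params:
      workers: 1
      ttl: 30s
```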
103 changes: 70 additions & 33 deletions resources/benchmarks/gang-scheduling/workflows/run-test.yml
@@ -4,7 +4,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/k8s/job.yml"
# template: "resources/benchmarks/templates/k8s/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-[0-9]-.*"
# podCount: "{{.replicas}}"
@@ -13,7 +13,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/k8s/jobset.yml"
# template: "resources/benchmarks/templates/k8s/jobset.yml"
# nameFormat: "jobset{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-workers-[0-9]+-[0-9]+-.+"
# podCount: "{{.replicas}}"
@@ -34,7 +34,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/kueue/job.yml"
# template: "resources/benchmarks/templates/kueue/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-[0-9]-.*"
# podCount: "{{.replicas}}"
@@ -69,39 +69,61 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/volcano/job.yml"
# template: "resources/benchmarks/templates/volcano/job.yml"
# nameFormat: "j{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-test-[0-9]+"
# podCount: "{{.replicas}}"
#- id: configure
# type: Configure
# params:
# configmaps:
# - name: volcano-scheduler-configmap
# namespace: volcano-system
# op: create
# data:
# volcano-scheduler.conf: |
# actions: "enqueue, allocate, backfill"
# tiers:
# - plugins:
# - name: priority
# - name: gang
# - name: conformance
# - plugins:
# - name: drf
# - name: predicates
# - name: proportion
# - name: nodeorder
# - name: binpack
# timeout: 1m

### Yunikorn
- id: register
type: RegisterObj
params:
template: "resources/benchmarks/gang-scheduling/templates/yunikorn/job.yml"
nameFormat: "job{{._ENUM_}}"
podNameFormat: "{{._NAME_}}-.*"
podCount: "{{.replicas}}"
- id: configure
type: Configure
params:
configmaps:
- name: yunikorn-configs
namespace: yunikorn
op: create
data:
queues.yaml: |
partitions:
- name: default
queues:
- name: root
queues:
- name: sandbox
submitacl: '*'
resources:
max:
{memory: 36Gi, vcore: 8000m, nvidia.com/gpu: 256}
timeout: 1m
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/templates/yunikorn/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-.*"
# podCount: "{{.replicas}}"
#- id: configure
# type: Configure
# params:
# configmaps:
# - name: yunikorn-configs
# namespace: yunikorn
# op: create
# data:
# queues.yaml: |
# partitions:
# - name: default
# queues:
# - name: root
# queues:
# - name: sandbox
# submitacl: '*'
# resources:
# max:
# {memory: 36Gi, vcore: 8000m, nvidia.com/gpu: 256}
# timeout: 1m
#
### Benchmark test
#
@@ -112,101 +134,116 @@ tasks:
count: 1
params:
replicas: 32
ttl: 30s
- id: job2
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 16
ttl: 30s
- id: job3
type: SubmitObj
params:
refTaskId: register
count: 3
params:
replicas: 10
ttl: 30s
- id: job3.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
replicas: 2
ttl: 30s
- id: job4
type: SubmitObj
params:
refTaskId: register
count: 4
params:
replicas: 8
replicas: 8
ttl: 30s
- id: job5
type: SubmitObj
params:
refTaskId: register
count: 5
params:
replicas: 6
ttl: 30s
- id: job5.1
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 1
ttl: 30s
- id: job6
type: SubmitObj
params:
refTaskId: register
count: 6
params:
replicas: 5
ttl: 30s
- id: job6.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
ttl: 30s
- id: job7
type: SubmitObj
params:
refTaskId: register
count: 7
params:
replicas: 4
ttl: 30s
- id: job7.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
ttl: 30s
- id: job7.2
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 1
ttl: 30s
- id: job8
type: SubmitObj
params:
refTaskId: register
count: 8
params:
replicas: 4
ttl: 30s
- id: job9
type: SubmitObj
params:
refTaskId: register
count: 9
params:
replicas: 3
ttl: 30s
- id: job9.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 5
ttl: 30s
@@ -25,7 +25,8 @@ spec:
template:
metadata:
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: "30s"
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
spec:
schedulerName: default-scheduler
containers:
@@ -36,7 +36,8 @@ spec:
template:
metadata:
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: "30s"
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
spec:
schedulerName: default-scheduler
containers:
@@ -26,7 +26,8 @@ spec:
template:
metadata:
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: "30s"
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
spec:
containers:
- name: test
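
With these edits the simulated completion time is controlled by the workflow's `ttl` parameter rather than a fixed 30s. When a SubmitObj task passes `ttl: 30s` (as the benchmark workflows above do), the annotations render roughly as follows (a sketch of the rendered output, not a file in this commit):
```yaml
annotations:
  pod-complete.stage.kwok.x-k8s.io/delay: 30s
  pod-complete.stage.kwok.x-k8s.io/jitter-delay: 30s
```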
64 changes: 64 additions & 0 deletions resources/benchmarks/templates/runai/distributedworkload.yml
@@ -0,0 +1,64 @@
apiVersion: run.ai/v2alpha1
kind: DistributedWorkload
metadata:
name: "{{._NAME_}}"
namespace: runai-<RUNAI_PROJECT>
annotations:
clusterId: <RUNAI_CLUSTER_ID>
labels:
project: <RUNAI_PROJECT>
masterSpec:
name:
value: "{{._NAME_}}"
image:
value: ubuntu
imagePullPolicy:
value: Always
cpu:
value: 100m
memory:
value: 256M
gpuDevices:
value: 8
largeShm:
value: false
nodePools:
value: default
runAsUser:
value: true
usage: Submit
autoDeletionTimeAfterCompletionSeconds:
value: 2592000
spec:
annotations:
items:
clusterId:
value: <RUNAI_CLUSTER_ID>
pod-complete.stage.kwok.x-k8s.io/delay:
value: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay:
value: {{.ttl}}
name:
value: "{{._NAME_}}"
jobType: MPIJob
image:
value: ubuntu
imagePullPolicy:
value: Always
cpu:
value: 100m
memory:
value: 256M
gpuDevices:
value: 8
workers:
value: {{.workers}}
largeShm:
value: false
nodePools:
value: default
runAsUser:
value: true
usage: Submit
autoDeletionTimeAfterCompletionSeconds:
value: 2592000
35 changes: 35 additions & 0 deletions resources/benchmarks/templates/runai/trainingworkload.yml
@@ -0,0 +1,35 @@
apiVersion: run.ai/v2alpha1
kind: TrainingWorkload
metadata:
name: "{{._NAME_}}"
namespace: runai-<RUNAI_PROJECT>
annotations:
clusterId: <RUNAI_CLUSTER_ID>
labels:
project: <RUNAI_PROJECT>
spec:
name:
value: "{{._NAME_}}"
image:
value: ubuntu
imagePullPolicy:
value: IfNotPresent
active:
value: true
annotations:
items:
clusterId:
value: <RUNAI_CLUSTER_ID>
pod-complete.stage.kwok.x-k8s.io/delay:
value: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay:
value: {{.ttl}}
cpu:
value: 100m
memory:
value: 256M
gpuDevices:
value: 8
nodePools:
value: default
usage: Submit
@@ -36,7 +36,8 @@ spec:
metadata:
name: test
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: "30s"
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
spec:
containers:
- name: job
@@ -51,4 +52,3 @@ spec:
cpu: 100m
memory: 256M
nvidia.com/gpu: "8"
restartPolicy: OnFailure
@@ -1,4 +1,3 @@

# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,7 +26,23 @@ spec:
applicationId: "test-{{._NAME_}}"
queue: root.sandbox
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: "30s"
yunikorn.apache.org/task-group-name: group-{{._NAME_}}
yunikorn.apache.org/schedulingPolicyParameters: "gangSchedulingStyle=Hard"
yunikorn.apache.org/task-groups: |-
[{
"name": "group-{{._NAME_}}",
"minMember": {{.replicas}},
"minResource": {
"cpu": "1",
"memory": "500M",
"nvidia.com/gpu": "8"
},
"nodeSelector": {},
"tolerations": [],
"affinity": {}
}]
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
spec:
schedulerName: yunikorn
containers:
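
These annotations opt the pods into YuniKorn gang scheduling: the task group declares how many members must be placeable before the job starts, and `gangSchedulingStyle=Hard` means the job fails rather than falling back to regular scheduling if the placeholders cannot be reserved. For a job registered with `replicas: 32`, the annotation renders roughly as below, i.e. YuniKorn must reserve 32 × 8 = 256 GPUs up front, which lines up with the 256-GPU cap on the root.sandbox queue in run-test.yml (a sketch with illustrative values; "job1" follows the `job{{._ENUM_}}` name format):
```yaml
annotations:
  yunikorn.apache.org/task-group-name: group-job1
  yunikorn.apache.org/schedulingPolicyParameters: "gangSchedulingStyle=Hard"
  yunikorn.apache.org/task-groups: |-
    [{
      "name": "group-job1",
      "minMember": 32,
      "minResource": {"cpu": "1", "memory": "500M", "nvidia.com/gpu": "8"}
    }]
```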
1 change: 1 addition & 0 deletions scripts/create-test-cluster.sh
@@ -42,6 +42,7 @@ fi
deploy_prometheus

deploy_kwok
kubectl apply -f $REPO_HOME/charts/overrides/kwok/pod-complete.yaml

echo ""
printYellow "Select workload manager or leave it blank to skip:"
14 changes: 8 additions & 6 deletions scripts/env.sh
@@ -60,7 +60,7 @@ function deploy_kwok() {
kubectl apply -f https://github.com/${KWOK_REPO}/releases/download/${KWOK_RELEASE}/stage-fast.yaml
kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/chaos/pod-init-container-running-failed.yaml
kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/chaos/pod-container-running-failed.yaml
kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/general/pod-complete.yaml
#kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/general/pod-complete.yaml
}

# Prometheus
@@ -85,7 +85,7 @@ function deploy_prometheus() {
--set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
--set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false

kubectl -n monitoring wait --for=condition=ready pod -l app.kubernetes.io/instance=kube-prometheus-stack --timeout=3m
kubectl -n monitoring wait --for=condition=ready pod -l app.kubernetes.io/instance=kube-prometheus-stack --timeout=300s
}

# Tested workload managers
@@ -98,7 +98,8 @@ function deploy_jobset() {
printGreen Deploying jobset

kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION}/manifests.yaml
kubectl -n jobset-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=60s

kubectl -n jobset-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=300s
}

# https://github.com/kubernetes-sigs/kueue
@@ -108,7 +109,8 @@ function deploy_kueue() {
printGreen Deploying kueue

kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
kubectl -n kueue-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=60s

kubectl -n kueue-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=300s
}

# https://github.com/volcano-sh/volcano
@@ -123,7 +125,7 @@ function deploy_volcano() {
--version=$VOLCANO_VERSION --wait

for app in volcano-admission volcano-controller volcano-scheduler; do
kubectl -n volcano-system wait --for=condition=ready pod -l app=$app --timeout=60s
kubectl -n volcano-system wait --for=condition=ready pod -l app=$app --timeout=300s
done

# Wait until volcano webhook is ready
@@ -142,7 +144,7 @@ function deploy_yunikorn() {
helm upgrade --install yunikorn yunikorn/yunikorn -n yunikorn --create-namespace \
--version=$YUNIKORN_VERSION --wait

kubectl -n yunikorn wait --for=condition=ready pod -l app=yunikorn --timeout=60s
kubectl -n yunikorn wait --for=condition=ready pod -l app=yunikorn --timeout=300s
}

# https://www.run.ai/
