Skip to content

Commit

Permalink
update gang-scheduling benchmark
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Shmulevich <[email protected]>
  • Loading branch information
dmitsh committed Aug 10, 2024
1 parent 5c10d62 commit 502a024
Show file tree
Hide file tree
Showing 9 changed files with 289 additions and 263 deletions.
29 changes: 29 additions & 0 deletions resources/benchmarks/gang-scheduling/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Gang Scheduling Benchmark Test

This directory contains benchmark tests for various workload managers and schedulers. The supported systems include:

- Jobset
- Kueue
- Volcano
- Yunikorn
- Run:ai

## Usage

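For all workload managers except Run:ai, the benchmark test consists of two workflows that are run sequentially in a single invocation. The first workflow (`config-<name>.yml`) registers the job template for the selected workload manager and applies any scheduler-specific configuration; the second workflow (`run-test-common.yml`) runs the common part of the test.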

### Example

To run the benchmark test for Kueue:

```bash
./bin/knavigator -workflow resources/benchmarks/gang-scheduling/workflows/config-kueue.yml,resources/benchmarks/gang-scheduling/workflows/run-test-common.yml
```

### Run:ai

Run:ai requires additional customization and thus has a separate workflow:

```bash
./bin/knavigator -workflow resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# JobSet configuration workflow for the gang-scheduling benchmark.
#
# Registers the JobSet object template under the task id `register`.
# run-test-common.yml submits its jobs through `refTaskId: register`, so this
# workflow must run before it.
name: config-jobset
tasks:
  - id: register
    type: RegisterObj
    params:
      template: "resources/benchmarks/gang-scheduling/templates/k8s/jobset.yml"
      # `_ENUM_`, `_NAME_` and `replicas` are template placeholders filled in
      # by the tool at submit time.
      nameFormat: "jobset{{._ENUM_}}"
      # Regular expression that the names of the pods created for one object
      # are expected to match.
      podNameFormat: "{{._NAME_}}-workers-[0-9]+-[0-9]+-.+"
      podCount: "{{.replicas}}"
50 changes: 50 additions & 0 deletions resources/benchmarks/gang-scheduling/workflows/config-kueue.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---
# Kueue configuration workflow for the gang-scheduling benchmark.
#
# 1. Registers the templates for the Kueue queue objects and for the job.
# 2. Creates one ResourceFlavor, one ClusterQueue and one LocalQueue.
#
# The job template is registered under the task id `register`, which is the id
# that run-test-common.yml references through `refTaskId: register`.
name: config-kueue
tasks:
  - id: register-cluster-queue
    type: RegisterObj
    params:
      template: "resources/templates/kueue/cluster-queue.yml"
  - id: register-local-queue
    type: RegisterObj
    params:
      template: "resources/templates/kueue/local-queue.yml"
  - id: register-resource-flavor
    type: RegisterObj
    params:
      template: "resources/templates/kueue/resource-flavor.yml"
  - id: register
    type: RegisterObj
    params:
      template: "resources/benchmarks/templates/kueue/job.yml"
      nameFormat: "job{{._ENUM_}}"
      # `[0-9]+` rather than `[0-9]`: run-test-common.yml submits jobs with up
      # to 32 replicas, so the numeric segment can have two digits. This also
      # matches the JobSet and Volcano patterns.
      # NOTE(review): presumably the numeric segment is the pod's completion
      # index - confirm against the job template.
      podNameFormat: "{{._NAME_}}-[0-9]+-.*"
      podCount: "{{.replicas}}"
  - id: create-resource-flavor
    type: SubmitObj
    params:
      refTaskId: register-resource-flavor
      # Do not fail if the object is already present in the cluster.
      canExist: true
      params:
        name: "gpu-node"
        nodeLabels:
          # Same label that run-test-common.yml puts on the nodes it configures.
          nvidia.com/gpu.count: "8"
  - id: create-cluster-queue
    type: SubmitObj
    params:
      refTaskId: register-cluster-queue
      canExist: true
      params:
        name: team
        flavor: gpu-node
        cpu: 8
        memory: 36Gi
        # 256 = 32 nodes x 8 GPUs, the node set configured in
        # run-test-common.yml.
        gpu: 256
  - id: create-local-queue
    type: SubmitObj
    params:
      refTaskId: register-local-queue
      canExist: true
      params:
        name: team-queue
        namespace: default
        clusterQueue: team
13 changes: 0 additions & 13 deletions resources/benchmarks/gang-scheduling/workflows/config-nodes.yml

This file was deleted.

31 changes: 31 additions & 0 deletions resources/benchmarks/gang-scheduling/workflows/config-volcano.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
---
# Volcano configuration workflow for the gang-scheduling benchmark.
#
# Registers the Volcano job template under the task id `register` (the id that
# run-test-common.yml references through `refTaskId: register`) and creates
# the scheduler config map.
name: config-volcano
tasks:
  - id: register
    type: RegisterObj
    params:
      template: "resources/benchmarks/templates/volcano/job.yml"
      nameFormat: "j{{._ENUM_}}"
      podNameFormat: "{{._NAME_}}-test-[0-9]+"
      podCount: "{{.replicas}}"
  - id: configure
    type: Configure
    params:
      configmaps:
        - name: volcano-scheduler-configmap
          namespace: volcano-system
          op: create
          data:
            # Literal block scalar: the text below is stored verbatim as the
            # value of the `volcano-scheduler.conf` key.
            volcano-scheduler.conf: |
              actions: "enqueue, allocate, backfill"
              tiers:
              - plugins:
                - name: priority
                - name: gang
                - name: conformance
              - plugins:
                - name: drf
                - name: predicates
                - name: proportion
                - name: nodeorder
                - name: binpack
      timeout: 1m
29 changes: 29 additions & 0 deletions resources/benchmarks/gang-scheduling/workflows/config-yunikorn.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
# YuniKorn configuration workflow for the gang-scheduling benchmark.
#
# Registers the YuniKorn job template under the task id `register` (the id that
# run-test-common.yml references through `refTaskId: register`) and creates
# the `yunikorn-configs` config map with the queue definition.
name: config-yunikorn
tasks:
  - id: register
    type: RegisterObj
    params:
      template: "resources/benchmarks/templates/yunikorn/job.yml"
      nameFormat: "job{{._ENUM_}}"
      podNameFormat: "{{._NAME_}}-.*"
      podCount: "{{.replicas}}"
  - id: configure
    type: Configure
    params:
      configmaps:
        - name: yunikorn-configs
          namespace: yunikorn
          op: create
          data:
            # Literal block scalar: the text below is stored verbatim as the
            # value of the `queues.yaml` key. It defines root.sandbox in the
            # default partition with the resource limits listed under `max`.
            queues.yaml: |
              partitions:
                - name: default
                  queues:
                    - name: root
                      queues:
                        - name: sandbox
                          submitacl: '*'
                          resources:
                            max:
                              {memory: 36Gi, vcore: 8000m, nvidia.com/gpu: 256}
      timeout: 1m
131 changes: 131 additions & 0 deletions resources/benchmarks/gang-scheduling/workflows/run-test-common.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
---
# Common part of the gang-scheduling benchmark.
#
# Every SubmitObj task below references `refTaskId: register`; that task is not
# defined in this file. It is provided by the scheduler-specific config-*.yml
# workflow that is run before this one.
#
# Workload shape: the `configure` task sets up 32 nodes. Each job group (jobN
# together with its jobN.x companions) requests count x replicas = 32 replicas
# in total, equal to the node count:
#   job1: 1x32        job2: 2x16          job3: 3x10 + 1x2
#   job4: 4x8         job5: 5x6 + 2x1     job6: 6x5 + 1x2
#   job7: 7x4 + 1x2 + 2x1                 job8: 8x4
#   job9: 9x3 + 1x5
name: test-gang-scheduling
tasks:
  - id: configure
    type: Configure
    params:
      nodes:
        - type: dgxa100.80g
          count: 32
          labels:
            nvidia.com/gpu.count: "8"
      timeout: 1m

  # Group 1: 1 x 32 replicas.
  - id: job1
    type: SubmitObj
    params:
      refTaskId: register
      count: 1
      params:
        replicas: 32
        ttl: 30s

  # Group 2: 2 x 16 replicas.
  - id: job2
    type: SubmitObj
    params:
      refTaskId: register
      count: 2
      params:
        replicas: 16
        ttl: 30s

  # Group 3: 3 x 10 + 1 x 2 replicas.
  - id: job3
    type: SubmitObj
    params:
      refTaskId: register
      count: 3
      params:
        replicas: 10
        ttl: 30s
  - id: job3.1
    type: SubmitObj
    params:
      refTaskId: register
      count: 1
      params:
        replicas: 2
        ttl: 30s

  # Group 4: 4 x 8 replicas.
  - id: job4
    type: SubmitObj
    params:
      refTaskId: register
      count: 4
      params:
        replicas: 8
        ttl: 30s

  # Group 5: 5 x 6 + 2 x 1 replicas.
  - id: job5
    type: SubmitObj
    params:
      refTaskId: register
      count: 5
      params:
        replicas: 6
        ttl: 30s
  - id: job5.1
    type: SubmitObj
    params:
      refTaskId: register
      count: 2
      params:
        replicas: 1
        ttl: 30s

  # Group 6: 6 x 5 + 1 x 2 replicas.
  - id: job6
    type: SubmitObj
    params:
      refTaskId: register
      count: 6
      params:
        replicas: 5
        ttl: 30s
  - id: job6.1
    type: SubmitObj
    params:
      refTaskId: register
      count: 1
      params:
        replicas: 2
        ttl: 30s

  # Group 7: 7 x 4 + 1 x 2 + 2 x 1 replicas.
  - id: job7
    type: SubmitObj
    params:
      refTaskId: register
      count: 7
      params:
        replicas: 4
        ttl: 30s
  - id: job7.1
    type: SubmitObj
    params:
      refTaskId: register
      count: 1
      params:
        replicas: 2
        ttl: 30s
  - id: job7.2
    type: SubmitObj
    params:
      refTaskId: register
      count: 2
      params:
        replicas: 1
        ttl: 30s

  # Group 8: 8 x 4 replicas.
  - id: job8
    type: SubmitObj
    params:
      refTaskId: register
      count: 8
      params:
        replicas: 4
        ttl: 30s

  # Group 9: 9 x 3 + 1 x 5 replicas.
  - id: job9
    type: SubmitObj
    params:
      refTaskId: register
      count: 9
      params:
        replicas: 3
        ttl: 30s
  - id: job9.1
    type: SubmitObj
    params:
      refTaskId: register
      count: 1
      params:
        replicas: 5
        ttl: 30s
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
name: test-gang-scheduling
name: test-gang-scheduling-runai
tasks:
- id: configure
type: Configure
params:
nodes:
- type: dgxa100.80g
count: 32
labels:
nvidia.com/gpu.count: "8"
timeout: 1m
- id: register-trainingworkload
type: RegisterObj
params:
Expand Down
Loading

0 comments on commit 502a024

Please sign in to comment.