Skip to content

Commit

Permalink
add gang-scheduling benchmark for coscheduling plugin with jobset and…
Browse files Browse the repository at this point in the history
… kueue

Signed-off-by: Dmitry Shmulevich <[email protected]>
  • Loading branch information
dmitsh committed Aug 19, 2024
1 parent 7836225 commit 06e2c0c
Show file tree
Hide file tree
Showing 23 changed files with 458 additions and 27 deletions.
2 changes: 1 addition & 1 deletion charts/virtual-nodes/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.2.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
2 changes: 1 addition & 1 deletion charts/virtual-nodes/templates/nodes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@
{{- end }}

{{- $count := ($node.count | int) }}
{{- $suffix := ( randAlphaNum 6 | lower ) }}
{{- range until $count }}
{{- $suffix := ( randAlphaNum 6 | lower ) }}
---
apiVersion: v1
kind: Node
Expand Down
4 changes: 2 additions & 2 deletions resources/benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ The gang-scheduling benchmark workflow operates on 32 virtual GPU nodes, submitt
To run the benchmark test for Kueue:

```bash
./bin/knavigator -workflow 'resources/benchmarks/gang-scheduling/workflows/{config-kueue.yaml,run-test.yaml}'
./bin/knavigator -workflow 'resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,config-kueue.yaml,run-test.yaml}'
```

#### Run:ai

```bash
./bin/knavigator -workflow resources/benchmarks/gang-scheduling/workflows/runai-test.yaml
./bin/knavigator -workflow 'resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,runai-test.yaml}'
```

## Scaling Benchmark Test
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-kueue
tasks:
- id: register-cluster-queue
type: RegisterObj
params:
template: "resources/templates/kueue/cluster-queue.yml"
- id: register-local-queue
type: RegisterObj
params:
template: "resources/templates/kueue/local-queue.yml"
- id: register-resource-flavor
type: RegisterObj
params:
template: "resources/templates/kueue/resource-flavor.yml"
- id: register
type: RegisterObj
params:
template: "resources/benchmarks/templates/jobset/jobset-coscheduling.yaml"
nameFormat: "jobset{{._ENUM_}}"
podNameFormat: "{{._NAME_}}-workers-[0-9]+-[0-9]+-.+"
podCount: "{{.replicas}}"
- id: create-resource-flavor
type: SubmitObj
params:
refTaskId: register-resource-flavor
canExist: true
params:
name: "gpu-node"
nodeLabels:
nvidia.com/gpu.count: "8"
- id: create-cluster-queue
type: SubmitObj
params:
refTaskId: register-cluster-queue
canExist: true
params:
name: team
flavor: gpu-node
cpu: 8
memory: 36Gi
pods: 32
gpu: 256
- id: create-local-queue
type: SubmitObj
params:
refTaskId: register-local-queue
canExist: true
params:
name: team-queue
namespace: default
clusterQueue: team
- id: job1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 4
ttl: 30s
14 changes: 14 additions & 0 deletions resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-kueue
description: register, deploy and configure kueue custom resources
tasks:
Expand Down
25 changes: 25 additions & 0 deletions resources/benchmarks/gang-scheduling/workflows/config-nodes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-nodes
tasks:
- id: configure
type: Configure
params:
nodes:
- type: dgxa100.80g
count: 32
labels:
nvidia.com/gpu.count: "8"
timeout: 1m
14 changes: 14 additions & 0 deletions resources/benchmarks/gang-scheduling/workflows/config-volcano.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-volcano
description: register, deploy and configure volcano custom resources
tasks:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-yunikorn
description: register, deploy and configure yunikorn custom resources
tasks:
Expand Down
27 changes: 14 additions & 13 deletions resources/benchmarks/gang-scheduling/workflows/run-test.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: test-gang-scheduling
tasks:
- id: configure
type: Configure
params:
nodes:
- type: dgxa100.80g
count: 32
labels:
nvidia.com/gpu.count: "8"
timeout: 1m
- id: sleep
type: Sleep
params:
timeout: 5s
- id: job1
type: SubmitObj
params:
Expand Down
23 changes: 14 additions & 9 deletions resources/benchmarks/gang-scheduling/workflows/runai-test.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: test-gang-scheduling-runai
tasks:
- id: configure
type: Configure
params:
nodes:
- type: dgxa100.80g
count: 32
labels:
nvidia.com/gpu.count: "8"
timeout: 1m
- id: register-trainingworkload
type: RegisterObj
params:
Expand Down
14 changes: 14 additions & 0 deletions resources/benchmarks/scaling/workflows/config-kueue.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-kueue
description: register, deploy and configure kueue custom resources
tasks:
Expand Down
14 changes: 14 additions & 0 deletions resources/benchmarks/scaling/workflows/config-nodes.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-nodes
description: create 500 virtual GPU nodes
tasks:
Expand Down
14 changes: 14 additions & 0 deletions resources/benchmarks/scaling/workflows/config-runai.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-runai
description: register, deploy and configure run:ai custom resources
tasks:
Expand Down
14 changes: 14 additions & 0 deletions resources/benchmarks/scaling/workflows/config-volcano.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-volcano
description: register, deploy and configure volcano custom resources
tasks:
Expand Down
14 changes: 14 additions & 0 deletions resources/benchmarks/scaling/workflows/config-yunikorn.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-yunikorn
description: register, deploy and configure yunikorn custom resources
tasks:
Expand Down
14 changes: 14 additions & 0 deletions resources/benchmarks/scaling/workflows/run-test-multi.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: test-scaling-multi-node-job
description: deploy a 500-replicas job
tasks:
Expand Down
14 changes: 14 additions & 0 deletions resources/benchmarks/scaling/workflows/run-test-single.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: test-scaling-single-node-jobs
description: deploy 500 single-replica jobs
tasks:
Expand Down
14 changes: 14 additions & 0 deletions resources/benchmarks/scaling/workflows/runai-test-multi.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: test-scaling
description: deploy a 500-replicas job
tasks:
Expand Down
Loading

0 comments on commit 06e2c0c

Please sign in to comment.