diff --git a/charts/virtual-nodes/Chart.yaml b/charts/virtual-nodes/Chart.yaml index 1ebb94e..d69dd15 100644 --- a/charts/virtual-nodes/Chart.yaml +++ b/charts/virtual-nodes/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 +version: 0.2.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/charts/virtual-nodes/templates/nodes.yaml b/charts/virtual-nodes/templates/nodes.yaml index e77594d..72c1e99 100644 --- a/charts/virtual-nodes/templates/nodes.yaml +++ b/charts/virtual-nodes/templates/nodes.yaml @@ -63,8 +63,8 @@ {{- end }} {{- $count := ($node.count | int) }} -{{- $suffix := ( randAlphaNum 6 | lower ) }} {{- range until $count }} +{{- $suffix := ( randAlphaNum 6 | lower ) }} --- apiVersion: v1 kind: Node diff --git a/resources/benchmarks/README.md b/resources/benchmarks/README.md index 5fef533..fbfbec3 100644 --- a/resources/benchmarks/README.md +++ b/resources/benchmarks/README.md @@ -25,13 +25,13 @@ The gang-scheduling benchmark workflow operates on 32 virtual GPU nodes, submitt To run the benchmark test for Kueue: ```bash -./bin/knavigator -workflow 'resources/benchmarks/gang-scheduling/workflows/{config-kueue.yaml,run-test.yaml}' +./bin/knavigator -workflow 'resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,config-kueue.yaml,run-test.yaml}' ``` #### Run:ai ```bash -./bin/knavigator -workflow resources/benchmarks/gang-scheduling/workflows/runai-test.yaml +./bin/knavigator -workflow 'resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,runai-test.yaml}' ``` ## Scaling Benchmark Test diff --git 
a/resources/benchmarks/gang-scheduling/workflows/config-combo-coscheduling.yaml b/resources/benchmarks/gang-scheduling/workflows/config-combo-coscheduling.yaml new file mode 100644 index 0000000..9a5dd4b --- /dev/null +++ b/resources/benchmarks/gang-scheduling/workflows/config-combo-coscheduling.yaml @@ -0,0 +1,73 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: config-combo-coscheduling +tasks: +- id: register-cluster-queue + type: RegisterObj + params: + template: "resources/templates/kueue/cluster-queue.yml" +- id: register-local-queue + type: RegisterObj + params: + template: "resources/templates/kueue/local-queue.yml" +- id: register-resource-flavor + type: RegisterObj + params: + template: "resources/templates/kueue/resource-flavor.yml" +- id: register + type: RegisterObj + params: + template: "resources/benchmarks/templates/jobset/jobset-coscheduling.yaml" + nameFormat: "jobset{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-workers-[0-9]+-[0-9]+-.+" + podCount: "{{.replicas}}" +- id: create-resource-flavor + type: SubmitObj + params: + refTaskId: register-resource-flavor + canExist: true + params: + name: "gpu-node" + nodeLabels: + nvidia.com/gpu.count: "8" +- id: create-cluster-queue + type: SubmitObj + params: + refTaskId: register-cluster-queue + canExist: true + params: + name: team + flavor: gpu-node + cpu: 8 + memory: 36Gi + pods: 32 + gpu: 256 +- id: create-local-queue + type: SubmitObj + params: + 
refTaskId: register-local-queue + canExist: true + params: + name: team-queue + namespace: default + clusterQueue: team +- id: job1 + type: SubmitObj + params: + refTaskId: register + count: 1 + params: + replicas: 4 + ttl: 30s diff --git a/resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml b/resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml index 336cb31..8ed0315 100644 --- a/resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml +++ b/resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: config-kueue description: register, deploy and configure kueue custom resources tasks: diff --git a/resources/benchmarks/gang-scheduling/workflows/config-nodes.yaml b/resources/benchmarks/gang-scheduling/workflows/config-nodes.yaml new file mode 100644 index 0000000..6dcdf9f --- /dev/null +++ b/resources/benchmarks/gang-scheduling/workflows/config-nodes.yaml @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: config-nodes +tasks: +- id: configure + type: Configure + params: + nodes: + - type: dgxa100.80g + count: 32 + labels: + nvidia.com/gpu.count: "8" + timeout: 1m diff --git a/resources/benchmarks/gang-scheduling/workflows/config-volcano.yaml b/resources/benchmarks/gang-scheduling/workflows/config-volcano.yaml index 11ecd34..6e0b879 100644 --- a/resources/benchmarks/gang-scheduling/workflows/config-volcano.yaml +++ b/resources/benchmarks/gang-scheduling/workflows/config-volcano.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ name: config-volcano description: register, deploy and configure volcano custom resources tasks: diff --git a/resources/benchmarks/gang-scheduling/workflows/config-yunikorn.yaml b/resources/benchmarks/gang-scheduling/workflows/config-yunikorn.yaml index 905b7f2..f52194e 100644 --- a/resources/benchmarks/gang-scheduling/workflows/config-yunikorn.yaml +++ b/resources/benchmarks/gang-scheduling/workflows/config-yunikorn.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: config-yunikorn description: register, deploy and configure yunikorn custom resources tasks: diff --git a/resources/benchmarks/gang-scheduling/workflows/run-test.yaml b/resources/benchmarks/gang-scheduling/workflows/run-test.yaml index 33cd019..782786c 100644 --- a/resources/benchmarks/gang-scheduling/workflows/run-test.yaml +++ b/resources/benchmarks/gang-scheduling/workflows/run-test.yaml @@ -1,18 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + name: test-gang-scheduling tasks: -- id: configure - type: Configure - params: - nodes: - - type: dgxa100.80g - count: 32 - labels: - nvidia.com/gpu.count: "8" - timeout: 1m -- id: sleep - type: Sleep - params: - timeout: 5s - id: job1 type: SubmitObj params: diff --git a/resources/benchmarks/gang-scheduling/workflows/runai-test.yaml b/resources/benchmarks/gang-scheduling/workflows/runai-test.yaml index a917dda..9320194 100644 --- a/resources/benchmarks/gang-scheduling/workflows/runai-test.yaml +++ b/resources/benchmarks/gang-scheduling/workflows/runai-test.yaml @@ -1,14 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: test-gang-scheduling-runai tasks: -- id: configure - type: Configure - params: - nodes: - - type: dgxa100.80g - count: 32 - labels: - nvidia.com/gpu.count: "8" - timeout: 1m - id: register-trainingworkload type: RegisterObj params: diff --git a/resources/benchmarks/scaling/workflows/config-kueue.yaml b/resources/benchmarks/scaling/workflows/config-kueue.yaml index d785d9e..2233526 100644 --- a/resources/benchmarks/scaling/workflows/config-kueue.yaml +++ b/resources/benchmarks/scaling/workflows/config-kueue.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: config-kueue description: register, deploy and configure kueue custom resources tasks: diff --git a/resources/benchmarks/scaling/workflows/config-nodes.yaml b/resources/benchmarks/scaling/workflows/config-nodes.yaml index 65eb85f..952028e 100644 --- a/resources/benchmarks/scaling/workflows/config-nodes.yaml +++ b/resources/benchmarks/scaling/workflows/config-nodes.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: config-nodes description: create 500 virtual GPU nodes tasks: diff --git a/resources/benchmarks/scaling/workflows/config-runai.yaml b/resources/benchmarks/scaling/workflows/config-runai.yaml index f7cf038..dfc47fe 100644 --- a/resources/benchmarks/scaling/workflows/config-runai.yaml +++ b/resources/benchmarks/scaling/workflows/config-runai.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: config-runai description: register, deploy and configure run:ai custom resources tasks: diff --git a/resources/benchmarks/scaling/workflows/config-volcano.yaml b/resources/benchmarks/scaling/workflows/config-volcano.yaml index 11ecd34..6e0b879 100644 --- a/resources/benchmarks/scaling/workflows/config-volcano.yaml +++ b/resources/benchmarks/scaling/workflows/config-volcano.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ name: config-volcano description: register, deploy and configure volcano custom resources tasks: diff --git a/resources/benchmarks/scaling/workflows/config-yunikorn.yaml b/resources/benchmarks/scaling/workflows/config-yunikorn.yaml index cc79a3d..debce1d 100644 --- a/resources/benchmarks/scaling/workflows/config-yunikorn.yaml +++ b/resources/benchmarks/scaling/workflows/config-yunikorn.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: config-yunikorn description: register, deploy and configure yunikorn custom resources tasks: diff --git a/resources/benchmarks/scaling/workflows/run-test-multi.yaml b/resources/benchmarks/scaling/workflows/run-test-multi.yaml index 818a1f8..cbf1e5f 100644 --- a/resources/benchmarks/scaling/workflows/run-test-multi.yaml +++ b/resources/benchmarks/scaling/workflows/run-test-multi.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + name: test-scaling-multi-node-job description: deploy a 500-replicas job tasks: diff --git a/resources/benchmarks/scaling/workflows/run-test-single.yaml b/resources/benchmarks/scaling/workflows/run-test-single.yaml index ff5e42e..54760d7 100644 --- a/resources/benchmarks/scaling/workflows/run-test-single.yaml +++ b/resources/benchmarks/scaling/workflows/run-test-single.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: test-scaling-single-node-jobs description: deploy 500 single-replica jobs tasks: diff --git a/resources/benchmarks/scaling/workflows/runai-test-multi.yaml b/resources/benchmarks/scaling/workflows/runai-test-multi.yaml index 5c45bc7..849bfc9 100644 --- a/resources/benchmarks/scaling/workflows/runai-test-multi.yaml +++ b/resources/benchmarks/scaling/workflows/runai-test-multi.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: test-scaling description: deploy a 500-replicas job tasks: diff --git a/resources/benchmarks/scaling/workflows/runai-test-single.yaml b/resources/benchmarks/scaling/workflows/runai-test-single.yaml index a8a530b..d97a9e9 100644 --- a/resources/benchmarks/scaling/workflows/runai-test-single.yaml +++ b/resources/benchmarks/scaling/workflows/runai-test-single.yaml @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: test-scaling description: deploy 500 single-replica jobs tasks: diff --git a/resources/benchmarks/templates/jobset/jobset-coscheduling.yaml b/resources/benchmarks/templates/jobset/jobset-coscheduling.yaml new file mode 100644 index 0000000..a55fbae --- /dev/null +++ b/resources/benchmarks/templates/jobset/jobset-coscheduling.yaml @@ -0,0 +1,88 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: scheduling.x-k8s.io/v1alpha1 +kind: PodGroup +metadata: + name: {{._NAME_}} +spec: + minMember: {{.replicas}} +--- +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{._NAME_}}" + namespace: default + labels: + kueue.x-k8s.io/queue-name: team-queue +spec: + successPolicy: + operator: All + targetReplicatedJobs: + - workers + replicatedJobs: + - name: workers + template: + metadata: + annotations: + kueue.x-k8s.io/job-min-parallelism: "1" + spec: + backoffLimit: 0 + completions: {{.replicas}} + parallelism: {{.replicas}} + completionMode: NonIndexed + template: + metadata: + annotations: + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} + labels: + scheduling.x-k8s.io/pod-group: {{._NAME_}} + spec: + schedulerName: scheduler-plugins-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/role + operator: In + values: + - agent + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: scheduling.x-k8s.io/pod-group + operator: In + values: + - {{._NAME_}} + topologyKey: kubernetes.io/hostname + restartPolicy: Never + containers: + - name: test + image: ubuntu + resources: + limits: + cpu: 100m + memory: 250M + nvidia.com/gpu: "8" + requests: + cpu: 100m + memory: 250M + nvidia.com/gpu: "8" + securityContext: + runAsUser: 0 diff --git 
a/resources/benchmarks/templates/jobset/jobset.yaml b/resources/benchmarks/templates/jobset/jobset.yaml new file mode 100644 index 0000000..a13ba20 --- /dev/null +++ b/resources/benchmarks/templates/jobset/jobset.yaml @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{._NAME_}}" + namespace: default +spec: + # We want to declare our JobSet successful if workers finish. + # If workers finish we should clean up the remaining replicatedJobs. 
+ successPolicy: + operator: All + targetReplicatedJobs: + - workers + replicatedJobs: + - name: workers + replicas: {{.replicas}} + template: + spec: + backoffLimit: 0 + completions: {{.replicas}} + parallelism: {{.replicas}} + completionMode: NonIndexed + template: + metadata: + annotations: + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} + spec: + schedulerName: default-scheduler + containers: + - name: test + image: ubuntu + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 100m + memory: 250M + nvidia.com/gpu: "8" + requests: + cpu: 100m + memory: 250M + nvidia.com/gpu: "8" diff --git a/scripts/create-test-cluster.sh b/scripts/create-test-cluster.sh index c5471e3..ae54bb7 100755 --- a/scripts/create-test-cluster.sh +++ b/scripts/create-test-cluster.sh @@ -33,7 +33,7 @@ if kind get clusters > /dev/null 2>&1; then read -p "> " choice if [[ "$choice" == "y" ]]; then kind delete cluster - kind create cluster + kind create cluster --image=kindest/node:v1.29.7 fi else - kind create cluster + kind create cluster --image=kindest/node:v1.29.7 @@ -52,6 +52,7 @@ cat << EOF 3: volcano (https://github.com/volcano-sh/volcano) 4: yunikorn (https://github.com/apache/yunikorn-core) 5: run:ai (https://www.run.ai) + 6: combined: coscheduler plugin + jobset + kueue EOF read -p "> " choice @@ -71,6 +72,11 @@ case "$choice" in 5) deploy_runai ;; + 6) + deploy_scheduler_plugins + deploy_jobset + deploy_kueue + ;; esac printYellow Cluster is ready diff --git a/scripts/env.sh b/scripts/env.sh index 3a8fbdf..a7cedbb 100644 --- a/scripts/env.sh +++ b/scripts/env.sh @@ -224,3 +224,13 @@ Run:ai deployment requires environment variables: --set cluster.url=https://example.com \ --set-json 'affinity={"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"type","operator":"NotIn","values":["kwok"]}]}]}}}' } + +SCHEDULER_PLUGINS_VERSION=v0.29.7 +function deploy_scheduler_plugins() { + printGreen Deploying 
scheduler-plugins + + helm upgrade --install --repo https://scheduler-plugins.sigs.k8s.io scheduler-plugins scheduler-plugins \ + -n scheduler-plugins --create-namespace --version $SCHEDULER_PLUGINS_VERSION \ + --set-json 'scheduler.affinity={"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"type","operator":"NotIn","values":["kwok"]}]}]}}}' \ + --set-json 'controller.affinity={"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"type","operator":"NotIn","values":["kwok"]}]}]}}}' +}