From 80a434be07a8154c18f10e03c468a9e21e23edcc Mon Sep 17 00:00:00 2001
From: Dmitry Shmulevich
Date: Thu, 9 May 2024 20:36:31 -0700
Subject: [PATCH] added kueue example

Add a walkthrough for running Kueue with knavigator:
- docs/examples/kueue: instructions, cluster/local queue definitions,
  a KWOK pod-complete stage and virtual-node Helm values
- resources/templates/kueue/job.yml: Job template labelled with the
  kueue queue name
- resources/tests/kueue/test-job.yml: task that submits that Job

Signed-off-by: Dmitry Shmulevich
---
 docs/examples/kueue/kueue.md              | 39 +++++++++++++++++++++++
 docs/examples/kueue/kwok-pod-complete.yml | 36 +++++++++++++++++++++
 docs/examples/kueue/queues.yml            | 30 +++++++++++++++++
 docs/examples/kueue/values.yaml           | 30 +++++++++++++++++
 docs/getting_started.md                   |  4 +++
 resources/templates/kueue/job.yml         | 27 ++++++++++++++++
 resources/tests/kueue/test-job.yml        | 23 +++++++++++++
 7 files changed, 189 insertions(+)
 create mode 100644 docs/examples/kueue/kueue.md
 create mode 100644 docs/examples/kueue/kwok-pod-complete.yml
 create mode 100644 docs/examples/kueue/queues.yml
 create mode 100644 docs/examples/kueue/values.yaml
 create mode 100644 resources/templates/kueue/job.yml
 create mode 100644 resources/tests/kueue/test-job.yml

diff --git a/docs/examples/kueue/kueue.md b/docs/examples/kueue/kueue.md
new file mode 100644
index 0000000..975ac01
--- /dev/null
+++ b/docs/examples/kueue/kueue.md
@@ -0,0 +1,39 @@
+# Example of running Kueue with knavigator
+
+## Install kueue
+
+Install kueue by following these [instructions](https://kueue.sigs.k8s.io/docs/installation/):
+
+```bash
+VERSION=v0.6.2
+kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/$VERSION/manifests.yaml
+kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/$VERSION/prometheus.yaml
+```
+
+## Deploy cluster and local queues
+
+```bash
+kubectl apply -f docs/examples/kueue/queues.yml
+```
+
+## Update KWOK stage
+
+Update KWOK's [pod-complete](kwok-pod-complete.yml) stage. This change is required to successfully run pods initiated by `kueue`.
+
+```bash
+kubectl apply -f docs/examples/kueue/kwok-pod-complete.yml
+```
+
+## Deploy virtual nodes
+
+In this example we deploy 4 GPU nodes. Refer to [values.yaml](values.yaml) for more details.
+
+```bash
+helm upgrade --install virtual-nodes charts/virtual-nodes -f docs/examples/kueue/values.yaml
+```
+
+## Run kueue job
+
+```bash
+./bin/knavigator -tasks resources/tests/kueue/test-job.yml
+```
diff --git a/docs/examples/kueue/kwok-pod-complete.yml b/docs/examples/kueue/kwok-pod-complete.yml
new file mode 100644
index 0000000..2a4e8db
--- /dev/null
+++ b/docs/examples/kueue/kwok-pod-complete.yml
@@ -0,0 +1,36 @@
+apiVersion: kwok.x-k8s.io/v1alpha1
+kind: Stage
+metadata:
+  name: pod-complete
+spec:
+  next:
+    statusTemplate: |
+      {{`{{ $now := Now }}
+      {{ $root := . }}
+      containerStatuses:
+      {{ range $index, $item := .spec.containers }}
+      {{ $origin := index $root.status.containerStatuses $index }}
+      - image: {{ $item.image | Quote }}
+        name: {{ $item.name | Quote }}
+        ready: false
+        restartCount: 0
+        started: false
+        state:
+          terminated:
+            exitCode: 0
+            finishedAt: {{ $now | Quote }}
+            reason: Completed
+            startedAt: {{ $now | Quote }}
+      {{ end }}
+      phase: Succeeded`}}
+  resourceRef:
+    apiGroup: v1
+    kind: Pod
+  selector:
+    matchExpressions:
+    - key: .metadata.deletionTimestamp
+      operator: DoesNotExist
+    - key: .status.phase
+      operator: In
+      values:
+      - Running
diff --git a/docs/examples/kueue/queues.yml b/docs/examples/kueue/queues.yml
new file mode 100644
index 0000000..9ad7514
--- /dev/null
+++ b/docs/examples/kueue/queues.yml
@@ -0,0 +1,30 @@
+# ClusterQueue, ResourceFlavor and LocalQueue used by the kueue example
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: "cluster-queue"
+spec:
+  namespaceSelector: {}  # match all.
+  resourceGroups:
+  - coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
+    flavors:
+    - name: "default-flavor"
+      resources:
+      - name: "cpu"
+        nominalQuota: 4
+      - name: "memory"
+        nominalQuota: 36Gi
+      - name: "nvidia.com/gpu"
+        nominalQuota: 4
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ResourceFlavor
+metadata:
+  name: "default-flavor"
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: LocalQueue
+metadata:
+  name: team-a-queue
+spec:
+  clusterQueue: cluster-queue
diff --git a/docs/examples/kueue/values.yaml b/docs/examples/kueue/values.yaml
new file mode 100644
index 0000000..fe75c7b
--- /dev/null
+++ b/docs/examples/kueue/values.yaml
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+nodes:
+- type: dgxa100.80g
+  count: 4
+  annotations: {}
+  labels:
+    nvidia.com/gpu.count: "8"
+    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  conditions:
+  - message: Filesystem is not read-only
+    reason: FilesystemIsNotReadOnly
+    status: "False"
+    type: ReadonlyFilesystem
+  - message: kernel has no deadlock
+    reason: KernelHasNoDeadlock
+    status: "False"
+    type: KernelDeadlock
diff --git a/docs/getting_started.md b/docs/getting_started.md
index e4fc1e2..386b478 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -58,3 +58,7 @@ Run a test jobset with a driver and workers:
 ```shell
 ./bin/knavigator -tasks ./resources/tests/k8s/test-jobset-with-driver.yml
 ```
+
+### Kueue
+
+Refer to [this document](./examples/kueue/kueue.md) for detailed instructions on how to run `kueue` with `knavigator`.
diff --git a/resources/templates/kueue/job.yml b/resources/templates/kueue/job.yml
new file mode 100644
index 0000000..c19619a
--- /dev/null
+++ b/resources/templates/kueue/job.yml
@@ -0,0 +1,27 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: "{{._NAME_}}"
+  namespace: "{{.namespace}}"
+  labels:
+    kueue.x-k8s.io/queue-name: "{{.queueName}}"
+spec:
+  completions: {{.completions}}
+  parallelism: {{.parallelism}}
+  completionMode: "{{.completionMode}}"
+  template:
+    spec:
+      containers:
+      - name: test
+        image: "{{.image}}"
+        imagePullPolicy: IfNotPresent
+        resources:
+          limits:
+            cpu: "{{.cpu}}"
+            memory: "{{.memory}}"
+            nvidia.com/gpu: "{{.gpu}}"
+          requests:
+            cpu: "{{.cpu}}"
+            memory: "{{.memory}}"
+            nvidia.com/gpu: "{{.gpu}}"
+      restartPolicy: Never
diff --git a/resources/tests/kueue/test-job.yml b/resources/tests/kueue/test-job.yml
new file mode 100644
index 0000000..3e5959e
--- /dev/null
+++ b/resources/tests/kueue/test-job.yml
@@ -0,0 +1,23 @@
+name: test-kueue-job
+description: submit and validate a kueue job
+tasks:
+- id: job
+  type: SubmitObj
+  params:
+    count: 1
+    grv:
+      group: batch
+      version: v1
+      resource: jobs
+    template: "resources/templates/kueue/job.yml"
+    nameformat: "job{{._ENUM_}}"
+    overrides:
+      queueName: team-a-queue
+      namespace: default
+      parallelism: 3
+      completions: 3
+      completionMode: Indexed
+      image: ubuntu
+      cpu: 100m
+      memory: 512M
+      gpu: 1