[kubectl-plugin][feat] support specifying number of head GPUs
when creating a RayCluster with `kubectl ray create cluster NAME --head-gpu N`.
This mirrors the existing `--worker-gpu` flag.
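For example, the following requests one GPU for the head Pod and two GPUs
per worker (the cluster name and counts are illustrative):

    kubectl ray create cluster my-cluster --head-gpu 1 --worker-gpu 2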

Signed-off-by: David Xia <[email protected]>
davidxia committed Feb 4, 2025
1 parent c8d34f4 commit dd07f38
Showing 4 changed files with 32 additions and 5 deletions.
kubectl-plugin/pkg/cmd/create/create_cluster.go (4 additions, 1 deletion)

@@ -22,6 +22,7 @@ type CreateClusterOptions struct {
     image        string
     headCPU      string
     headMemory   string
+    headGPU      string
     workerCPU    string
     workerMemory string
     workerGPU    string
@@ -75,10 +76,11 @@ func NewCreateClusterCommand(streams genericclioptions.IOStreams) *cobra.Command
     cmd.Flags().StringVar(&options.image, "image", options.image, "Ray image to use in the Ray Cluster yaml")
     cmd.Flags().StringVar(&options.headCPU, "head-cpu", "2", "Number of CPU for the ray head. Default to 2")
     cmd.Flags().StringVar(&options.headMemory, "head-memory", "4Gi", "Amount of memory to use for the ray head. Default to 4Gi")
+    cmd.Flags().StringVar(&options.headGPU, "head-gpu", "0", "Number of GPUs for the ray head. Default to 0")
     cmd.Flags().Int32Var(&options.workerReplicas, "worker-replicas", 1, "Number of the worker group replicas. Default of 1")
     cmd.Flags().StringVar(&options.workerCPU, "worker-cpu", "2", "Number of CPU for the ray worker. Default to 2")
     cmd.Flags().StringVar(&options.workerMemory, "worker-memory", "4Gi", "Amount of memory to use for the ray worker. Default to 4Gi")
-    cmd.Flags().StringVar(&options.workerGPU, "worker-gpu", "0", "Number of GPU for the ray worker. Default to 0")
+    cmd.Flags().StringVar(&options.workerGPU, "worker-gpu", "0", "Number of GPUs for the ray worker. Default to 0")
     cmd.Flags().BoolVar(&options.dryRun, "dry-run", false, "Will not apply the generated cluster and will print out the generated yaml")

     options.configFlags.AddFlags(cmd.Flags())
@@ -129,6 +131,7 @@ func (options *CreateClusterOptions) Run(ctx context.Context, factory cmdutil.Fa
         Image:          options.image,
         HeadCPU:        options.headCPU,
         HeadMemory:     options.headMemory,
+        HeadGPU:        options.headGPU,
         WorkerReplicas: options.workerReplicas,
         WorkerCPU:      options.workerCPU,
         WorkerMemory:   options.workerMemory,
kubectl-plugin/pkg/cmd/create/create_cluster_test.go (1 addition, 0 deletions)

@@ -92,6 +92,7 @@ func TestRayCreateClusterValidate(t *testing.T) {
         rayVersion:     "ray-version",
         image:          "ray-image",
         headCPU:        "5",
+        headGPU:        "1",
         headMemory:     "5Gi",
         workerReplicas: 3,
         workerCPU:      "4",
kubectl-plugin/pkg/util/generation/generation.go (21 additions, 4 deletions)

@@ -12,10 +12,15 @@ import (
     rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1"
 )

+const (
+    resourceNvidiaGPU = "nvidia.com/gpu"
+)
+
 type RayClusterSpecObject struct {
     RayVersion string
     Image      string
     HeadCPU    string
+    HeadGPU    string
     HeadMemory string
     WorkerCPU  string
     WorkerGPU  string
@@ -96,13 +101,25 @@ func (rayClusterSpecObject *RayClusterSpecObject) generateRayClusterSpec() *rayv
                 corev1.ResourceMemory: resource.MustParse(rayClusterSpecObject.WorkerMemory),
             }))))))

-    gpuResource := resource.MustParse(rayClusterSpecObject.WorkerGPU)
-    if !gpuResource.IsZero() {
+    headGPUResource := resource.MustParse(rayClusterSpecObject.HeadGPU)
+    if !headGPUResource.IsZero() {
+        var requests, limits corev1.ResourceList
+        requests = *rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests
+        limits = *rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Limits
+        requests[corev1.ResourceName(resourceNvidiaGPU)] = headGPUResource
+        limits[corev1.ResourceName(resourceNvidiaGPU)] = headGPUResource
+
+        rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests = &requests
+        rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Limits = &limits
+    }
+
+    workerGPUResource := resource.MustParse(rayClusterSpecObject.WorkerGPU)
+    if !workerGPUResource.IsZero() {
         var requests, limits corev1.ResourceList
         requests = *rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests
         limits = *rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Limits
-        requests[corev1.ResourceName("nvidia.com/gpu")] = gpuResource
-        limits[corev1.ResourceName("nvidia.com/gpu")] = gpuResource
+        requests[corev1.ResourceName(resourceNvidiaGPU)] = workerGPUResource
+        limits[corev1.ResourceName(resourceNvidiaGPU)] = workerGPUResource

         rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests = &requests
         rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Limits = &limits
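For readers less familiar with the resource API used above, here is a
minimal, self-contained sketch of the same pattern, assuming the
k8s.io/api and k8s.io/apimachinery modules. setGPU is an illustrative
helper, not part of the plugin; a zero quantity leaves the ResourceList
untouched, so clusters created without --head-gpu or --worker-gpu keep
requesting only CPU and memory.

    package main

    import (
        "fmt"

        corev1 "k8s.io/api/core/v1"
        "k8s.io/apimachinery/pkg/api/resource"
    )

    // setGPU mirrors the generator's approach: parse the flag value into a
    // resource.Quantity and, only when it is nonzero, record it as both the
    // request and the limit under the extended-resource key nvidia.com/gpu.
    func setGPU(requests, limits corev1.ResourceList, gpu string) {
        q := resource.MustParse(gpu) // panics on malformed input, as in the generator
        if q.IsZero() {
            return
        }
        requests[corev1.ResourceName("nvidia.com/gpu")] = q
        limits[corev1.ResourceName("nvidia.com/gpu")] = q
    }

    func main() {
        requests := corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("2")}
        limits := corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("2")}
        setGPU(requests, limits, "1")

        gpu := requests[corev1.ResourceName("nvidia.com/gpu")]
        fmt.Println(gpu.String()) // prints "1"
    }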
kubectl-plugin/pkg/util/generation/generation_test.go (6 additions, 0 deletions)

@@ -20,6 +20,7 @@ func TestGenerateRayCluterApplyConfig(t *testing.T) {
         Image:          "rayproject/ray:2.39.0",
         HeadCPU:        "1",
         HeadMemory:     "5Gi",
+        HeadGPU:        "1",
         WorkerReplicas: 3,
         WorkerCPU:      "2",
         WorkerMemory:   "10Gi",
@@ -34,6 +35,7 @@
     assert.Equal(t, testRayClusterYamlObject.RayVersion, *result.Spec.RayVersion)
     assert.Equal(t, testRayClusterYamlObject.Image, *result.Spec.HeadGroupSpec.Template.Spec.Containers[0].Image)
     assert.Equal(t, resource.MustParse(testRayClusterYamlObject.HeadCPU), *result.Spec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests.Cpu())
+    assert.Equal(t, resource.MustParse(testRayClusterYamlObject.HeadGPU), *result.Spec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests.Name(corev1.ResourceName("nvidia.com/gpu"), resource.DecimalSI))
     assert.Equal(t, resource.MustParse(testRayClusterYamlObject.HeadMemory), *result.Spec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests.Memory())
     assert.Equal(t, "default-group", *result.Spec.WorkerGroupSpecs[0].GroupName)
     assert.Equal(t, testRayClusterYamlObject.WorkerReplicas, *result.Spec.WorkerGroupSpecs[0].Replicas)
@@ -51,6 +53,7 @@ func TestGenerateRayJobApplyConfig(t *testing.T) {
         RayVersion:     "2.39.0",
         Image:          "rayproject/ray:2.39.0",
         HeadCPU:        "1",
+        HeadGPU:        "1",
         HeadMemory:     "5Gi",
         WorkerReplicas: 3,
         WorkerCPU:      "2",
@@ -83,6 +86,7 @@ func TestConvertRayClusterApplyConfigToYaml(t *testing.T) {
         Image:          "rayproject/ray:2.39.0",
         HeadCPU:        "1",
         HeadMemory:     "5Gi",
+        HeadGPU:        "1",
         WorkerReplicas: 3,
         WorkerCPU:      "2",
         WorkerMemory:   "10Gi",
@@ -119,9 +123,11 @@ spec:
           limits:
             cpu: "1"
             memory: 5Gi
+            nvidia.com/gpu: "1"
           requests:
             cpu: "1"
             memory: 5Gi
+            nvidia.com/gpu: "1"
   rayVersion: 2.39.0
   workerGroupSpecs:
   - groupName: default-group
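The generated spec can be checked without applying it by combining the new
flag with the existing --dry-run flag (the cluster name is illustrative):

    kubectl ray create cluster my-cluster --head-gpu 1 --dry-run

The printed YAML should contain the nvidia.com/gpu request and limit on the
head container, matching the expected output in the test above.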
