Skip to content

Commit 2a65edf

Browse files
authored
Updated new general yaml generation (#9)
* Updated new general yaml generation * Revert to empty default
1 parent 42139cf commit 2a65edf

File tree

4 files changed

+330
-36
lines changed

4 files changed

+330
-36
lines changed

src/codeflare_sdk/cluster/cluster.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,20 @@ def __init__(self, config: ClusterConfiguration):
1212
self.app_wrapper_yaml = self.create_app_wrapper()
1313

1414
def create_app_wrapper(self):
15-
cpu=self.config.max_cpus
16-
memory=self.config.memory
15+
min_cpu=self.config.min_cpus
16+
max_cpu=self.config.max_cpus
17+
min_memory=self.config.min_memory
18+
max_memory=self.config.max_memory
1719
gpu=self.config.gpu
1820
workers=self.config.max_worker
1921
template=self.config.template
20-
return generate_appwrapper(cpu=cpu, memory=memory,
21-
gpu=gpu,workers=workers,
22-
template=template)
22+
image=self.config.image
23+
instascale=self.config.instascale
24+
instance_types=self.config.machine_types
25+
env=self.config.envs
26+
return generate_appwrapper(min_cpu=min_cpu, max_cpu=max_cpu, min_memory=min_memory,
27+
max_memory=max_memory, gpu=gpu, workers=workers, template=template,
28+
image=image, instascale=instascale, instance_types=instance_types, env=env)
2329

2430
# creates a new cluster with the provided or default spec
2531
def up(self, namespace='default'):

src/codeflare_sdk/cluster/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@
44
class ClusterConfiguration:
55
name: str
66
head_info: list = []
7-
machine_types: list = []
7+
machine_types: list = [] #["m4.xlarge", "g4dn.xlarge"]
88
min_cpus: int = 1
99
max_cpus: int = 1
1010
min_worker: int = 1
1111
max_worker: int = 1
1212
min_memory: int = 2
1313
max_memory: int = 2
1414
gpu: int = 0
15-
template: str = "src/codeflare_sdk/templates/base-template.yaml"
15+
template: str = "src/codeflare_sdk/templates/new-template.yaml"
1616
instascale: bool = False
1717
envs: dict = {}
1818
image: str = "rayproject/ray:latest"
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
apiVersion: mcad.ibm.com/v1beta1
2+
kind: AppWrapper
3+
metadata:
4+
name: aw-kuberay
5+
namespace: default
6+
#new addition
7+
labels:
8+
orderedinstance: "m4.xlarge_g4dn.xlarge"
9+
spec:
10+
priority: 9
11+
resources:
12+
Items: []
13+
GenericItems:
14+
- replicas: 1
15+
#new addition
16+
custompodresources:
17+
- replicas: 1
18+
requests:
19+
cpu: 2
20+
memory: 12G
21+
nvidia.com/gpu: 0
22+
limits:
23+
cpu: 2
24+
memory: 12G
25+
nvidia.com/gpu: 0
26+
- replicas: 3
27+
requests:
28+
cpu: 2
29+
memory: 12G
30+
nvidia.com/gpu: 1
31+
limits:
32+
cpu: 2
33+
memory: 12G
34+
nvidia.com/gpu: 1
35+
generictemplate:
36+
# This config demonstrates KubeRay's Ray autoscaler integration.
37+
# The resource requests and limits in this config are too small for production!
38+
# For an example with more realistic resource configuration, see
39+
# ray-cluster.autoscaler.large.yaml.
40+
apiVersion: ray.io/v1alpha1
41+
kind: RayCluster
42+
metadata:
43+
labels:
44+
appwrapper.mcad.ibm.com: "aw-kuberay"
45+
controller-tools.k8s.io: "1.0"
46+
# A unique identifier for the head node and workers of this cluster.
47+
name: kuberay-cluster
48+
# finalizers:
49+
# - kubernetes
50+
spec:
51+
# The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
52+
rayVersion: '1.12.0'
53+
# If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
54+
# Ray autoscaler integration is supported only for Ray versions >= 1.11.0
55+
# Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
56+
enableInTreeAutoscaling: false
57+
# autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
58+
# The example configuration shown below represents the DEFAULT values.
59+
# (You may delete autoscalerOptions if the defaults are suitable.)
60+
autoscalerOptions:
61+
# upscalingMode is "Default" or "Aggressive."
62+
# Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
63+
# Default: Upscaling is not rate-limited.
64+
# Aggressive: An alias for Default; upscaling is not rate-limited.
65+
upscalingMode: Default
66+
# idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
67+
idleTimeoutSeconds: 60
68+
# image optionally overrides the autoscaler's container image.
69+
# If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
70+
# the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
71+
## image: "my-repo/my-custom-autoscaler-image:tag"
72+
# imagePullPolicy optionally overrides the autoscaler container's image pull policy.
73+
imagePullPolicy: Always
74+
# resources specifies optional resource request and limit overrides for the autoscaler container.
75+
# For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
76+
resources:
77+
limits:
78+
cpu: "500m"
79+
memory: "512Mi"
80+
requests:
81+
cpu: "500m"
82+
memory: "512Mi"
83+
######################headGroupSpec#################################
84+
# head group template and specs, (perhaps 'group' is not needed in the name)
85+
headGroupSpec:
86+
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
87+
serviceType: ClusterIP
88+
# logical group name, for this called head-group, also can be functional
89+
# pod type head or worker
90+
# rayNodeType: head # Not needed since it is under the headgroup
91+
# the following params are used to complete the ray start: ray start --head --block ...
92+
rayStartParams:
93+
# Flag "no-monitor" will be automatically set when autoscaling is enabled.
94+
dashboard-host: '0.0.0.0'
95+
block: 'true'
96+
# num-cpus: '1' # can be auto-completed from the limits
97+
# Use `resources` to optionally specify custom resource annotations for the Ray node.
98+
# The value of `resources` is a string-integer mapping.
99+
# Currently, `resources` must be provided in the specific format demonstrated below:
100+
# resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
101+
num-gpus: 0
102+
#pod template
103+
template:
104+
spec:
105+
#new addition
106+
affinity:
107+
nodeAffinity:
108+
requiredDuringSchedulingIgnoredDuringExecution:
109+
nodeSelectorTerms:
110+
- matchExpressions:
111+
- key: aw-kuberay
112+
operator: In
113+
values:
114+
- "aw-kuberay"
115+
containers:
116+
# The Ray head pod
117+
- name: ray-head
118+
image: rayproject/ray:latest
119+
env:
120+
- name: AWS_ACCESS_KEY_ID
121+
valueFrom:
122+
secretKeyRef:
123+
name: s3-creds
124+
key: AWS_ACCESS_KEY_ID
125+
- name: AWS_SECRET_ACCESS_KEY
126+
valueFrom:
127+
secretKeyRef:
128+
name: s3-creds
129+
key: AWS_SECRET_ACCESS_KEY
130+
- name: ENDPOINT_URL
131+
valueFrom:
132+
secretKeyRef:
133+
name: s3-creds
134+
key: ENDPOINT_URL
135+
imagePullPolicy: Always
136+
ports:
137+
- containerPort: 6379
138+
name: gcs
139+
- containerPort: 8265
140+
name: dashboard
141+
- containerPort: 10001
142+
name: client
143+
lifecycle:
144+
preStop:
145+
exec:
146+
command: ["/bin/sh","-c","ray stop"]
147+
resources:
148+
limits:
149+
cpu: "2"
150+
memory: "12G"
151+
nvidia.com/gpu: "0"
152+
requests:
153+
cpu: "2"
154+
memory: "12G"
155+
nvidia.com/gpu: "0"
156+
workerGroupSpecs:
157+
# the pod replicas in this group typed worker
158+
- replicas: 3
159+
minReplicas: 3
160+
maxReplicas: 3
161+
# logical group name, for this called small-group, also can be functional
162+
groupName: small-group
163+
# if worker pods need to be added, we can simply increment the replicas
164+
# if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
165+
# the operator will remove pods from the list until the number of replicas is satisfied
166+
# when a pod is confirmed to be deleted, its name will be removed from the list below
167+
#scaleStrategy:
168+
# workersToDelete:
169+
# - raycluster-complete-worker-small-group-bdtwh
170+
# - raycluster-complete-worker-small-group-hv457
171+
# - raycluster-complete-worker-small-group-k8tj7
172+
# the following params are used to complete the ray start: ray start --block ...
173+
rayStartParams:
174+
block: 'true'
175+
num-gpus: 1
176+
#pod template
177+
template:
178+
metadata:
179+
labels:
180+
key: value
181+
# annotations for pod
182+
annotations:
183+
key: value
184+
# finalizers:
185+
# - kubernetes
186+
spec:
187+
affinity:
188+
nodeAffinity:
189+
requiredDuringSchedulingIgnoredDuringExecution:
190+
nodeSelectorTerms:
191+
- matchExpressions:
192+
- key: aw-kuberay
193+
operator: In
194+
values:
195+
- "aw-kuberay"
196+
initContainers:
197+
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
198+
- name: init-myservice
199+
image: busybox:1.28
200+
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
201+
containers:
202+
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
203+
image: rayproject/ray:latest
204+
env:
205+
- name: AWS_ACCESS_KEY_ID
206+
valueFrom:
207+
secretKeyRef:
208+
name: s3-creds
209+
key: AWS_ACCESS_KEY_ID
210+
- name: AWS_SECRET_ACCESS_KEY
211+
valueFrom:
212+
secretKeyRef:
213+
name: s3-creds
214+
key: AWS_SECRET_ACCESS_KEY
215+
- name: ENDPOINT_URL
216+
valueFrom:
217+
secretKeyRef:
218+
name: s3-creds
219+
key: ENDPOINT_URL
220+
# environment variables to set in the container.Optional.
221+
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
222+
lifecycle:
223+
preStop:
224+
exec:
225+
command: ["/bin/sh","-c","ray stop"]
226+
resources:
227+
limits:
228+
cpu: "2"
229+
memory: "12G"
230+
nvidia.com/gpu: "1"
231+
requests:
232+
cpu: "2"
233+
memory: "12G"
234+
nvidia.com/gpu: "1"

0 commit comments

Comments
 (0)