Skip to content

Commit 42139cf

Browse files
draft appwrapper templating (#8)
1 parent 0c7fe27 commit 42139cf

File tree

3 files changed

+183
-7
lines changed

3 files changed

+183
-7
lines changed

src/codeflare_sdk/cluster/cluster.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,34 @@
11
from .config import ClusterConfiguration
22
from .model import RayCluster, AppWrapper
33
from ..utils import pretty_print
4+
from ..utils.generate_yaml import generate_appwrapper
45
import openshift as oc
56
from typing import List, Optional
67

78

89
class Cluster:
    """Represents a Ray cluster managed through an MCAD AppWrapper.

    On construction, the AppWrapper YAML for the given configuration is
    rendered immediately and kept for later `up`/`down` calls.
    """

    def __init__(self, config: ClusterConfiguration):
        """Store the configuration and render its AppWrapper YAML.

        :param config: desired cluster spec (sizes, template path, etc.)
        """
        self.config = config
        # Rendered once up front so `up()` can simply apply the file.
        self.app_wrapper_yaml = self.create_app_wrapper()
def create_app_wrapper(self):
15+
cpu=self.config.max_cpus
16+
memory=self.config.memory
17+
gpu=self.config.gpu
18+
workers=self.config.max_worker
19+
template=self.config.template
20+
return generate_appwrapper(cpu=cpu, memory=memory,
21+
gpu=gpu,workers=workers,
22+
template=template)
23+
24+
# creates a new cluster with the provided or default spec
1325
def up(self, namespace='default'):
1426
with oc.project(namespace):
15-
oc.invoke("apply", ["-f",
16-
"https://raw.githubusercontent.com/IBM/multi-cluster-app-dispatcher/quota-management/doc/usage/examples/kuberay/config/aw-raycluster.yaml"])
27+
oc.invoke("apply", ["-f", self.app_wrapper_yaml ])
1728

18-
def down(self, name, namespace='default'):
29+
def down(self, namespace='default'):
1930
with oc.project(namespace):
20-
oc.invoke("delete",["AppWrapper", self.config.name])
31+
oc.invoke("delete",["AppWrapper", self.app_wrapper_yaml])
2132

2233
def status(self, print_to_console=True):
2334
cluster = _ray_cluster_status(self.config.name)

src/codeflare_sdk/cluster/config.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,16 @@
33
@dataclass
class ClusterConfiguration:
    """User-facing sizing and placement spec for a Ray cluster AppWrapper.

    CPU/worker/memory fields come in min/max pairs for autoscaling bounds;
    `template` points at the base AppWrapper YAML that gets filled in.
    """
    # NOTE: requires `field` imported from dataclasses alongside `dataclass`.
    name: str
    # BUG FIX: bare mutable defaults (`[]` / `{}`) make @dataclass raise
    # "ValueError: mutable default" at class-creation time, and would be
    # shared between instances even if they didn't — use default_factory.
    head_info: list = field(default_factory=list)
    machine_types: list = field(default_factory=list)
    min_cpus: int = 1
    max_cpus: int = 1
    min_worker: int = 1
    max_worker: int = 1
    min_memory: int = 2
    max_memory: int = 2
    gpu: int = 0
    template: str = "src/codeflare_sdk/templates/base-template.yaml"
    instascale: bool = False
    envs: dict = field(default_factory=dict)
    image: str = "rayproject/ray:latest"
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: raycluster-autoscaler
  namespace: default
spec:
  resources:
    Items: []
    GenericItems:
    - replicas: 1
      custompodresources:
      - replicas: 2
        requests:
          cpu: 10
          memory: 512Mi
        limits:
          cpu: 10
          memory: 1G
      generictemplate:
        # This config demonstrates KubeRay's Ray autoscaler integration.
        # The resource requests and limits in this config are too small for production!
        # For an example with more realistic resource configuration, see
        # ray-cluster.autoscaler.large.yaml.
        apiVersion: ray.io/v1alpha1
        kind: RayCluster
        metadata:
          labels:
            controller-tools.k8s.io: "1.0"
          # A unique identifier for the head node and workers of this cluster.
          name: raycluster-autoscaler
        spec:
          # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
          rayVersion: '2.0.0'
          # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
          # Ray autoscaler integration is supported only for Ray versions >= 1.11.0
          # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
          enableInTreeAutoscaling: true
          # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
          # The example configuration shown below below represents the DEFAULT values.
          # (You may delete autoscalerOptions if the defaults are suitable.)
          autoscalerOptions:
            # upscalingMode is "Default" or "Aggressive."
            # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
            # Default: Upscaling is not rate-limited.
            # Aggressive: An alias for Default; upscaling is not rate-limited.
            upscalingMode: Default
            # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
            idleTimeoutSeconds: 60
            # image optionally overrides the autoscaler's container image.
            # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
            # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
            ## image: "my-repo/my-custom-autoscaler-image:tag"
            # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
            imagePullPolicy: Always
            # resources specifies optional resource request and limit overrides for the autoscaler container.
            # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
            resources:
              limits:
                cpu: "500m"
                memory: "512Mi"
              requests:
                cpu: "500m"
                memory: "512Mi"
          ######################headGroupSpec#################################
          # head group template and specs, (perhaps 'group' is not needed in the name)
          headGroupSpec:
            # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
            serviceType: ClusterIP
            # logical group name, for this called head-group, also can be functional
            # pod type head or worker
            # rayNodeType: head # Not needed since it is under the headgroup
            # the following params are used to complete the ray start: ray start --head --block ...
            rayStartParams:
              # Flag "no-monitor" will be automatically set when autoscaling is enabled.
              dashboard-host: '0.0.0.0'
              block: 'true'
              # num-cpus: '1' # can be auto-completed from the limits
              # Use `resources` to optionally specify custom resource annotations for the Ray node.
              # The value of `resources` is a string-integer mapping.
              # Currently, `resources` must be provided in the specific format demonstrated below:
              # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
            #pod template
            template:
              spec:
                containers:
                # The Ray head pod
                - name: ray-head
                  image: rayproject/ray:2.0.0
                  imagePullPolicy: Always
                  ports:
                  - containerPort: 6379
                    name: gcs
                  - containerPort: 8265
                    name: dashboard
                  - containerPort: 10001
                    name: client
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "1"
                      memory: "1G"
                    requests:
                      cpu: "500m"
                      memory: "512Mi"
          workerGroupSpecs:
          # the pod replicas in this group typed worker
          - replicas: 1
            minReplicas: 1
            maxReplicas: 300
            # logical group name, for this called small-group, also can be functional
            groupName: small-group
            # if worker pods need to be added, we can simply increment the replicas
            # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
            # the operator will remove pods from the list until the number of replicas is satisfied
            # when a pod is confirmed to be deleted, its name will be removed from the list below
            #scaleStrategy:
            # workersToDelete:
            # - raycluster-complete-worker-small-group-bdtwh
            # - raycluster-complete-worker-small-group-hv457
            # - raycluster-complete-worker-small-group-k8tj7
            # the following params are used to complete the ray start: ray start --block ...
            rayStartParams:
              block: 'true'
            #pod template
            template:
              metadata:
                labels:
                  key: value
                # annotations for pod
                annotations:
                  key: value
              spec:
                initContainers:
                # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
                - name: init-myservice
                  image: busybox:1.28
                  command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
                containers:
                - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
                  image: rayproject/ray:2.0.0
                  # environment variables to set in the container.Optional.
                  # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "1"
                      memory: "512Mi"
                    requests:
                      cpu: "500m"
                      memory: "256Mi"

0 commit comments

Comments
 (0)