1
- from .config import ClusterConfiguration
2
- from .model import RayCluster , AppWrapper
1
+ from os import stat
2
+ from typing import List , Optional , Tuple
3
+
4
+ import openshift as oc
5
+
3
6
from ..utils import pretty_print
4
7
from ..utils .generate_yaml import generate_appwrapper
5
- import openshift as oc
6
- from typing import List , Optional
8
+ from .config import ClusterConfiguration
9
+ from .model import (AppWrapper , AppWrapperStatus , CodeFlareClusterStatus ,
10
+ RayCluster , RayClusterStatus )
7
11
8
12
9
13
class Cluster :
10
14
def __init__ (self , config : ClusterConfiguration ):
11
15
self .config = config
12
- self .app_wrapper_yaml = self .create_app_wrapper ()
16
+ self .app_wrapper_yaml = self .create_app_wrapper ()
13
17
14
18
def create_app_wrapper (self ):
15
19
min_cpu = self .config .min_cpus
16
20
max_cpu = self .config .max_cpus
17
21
min_memory = self .config .min_memory
18
- max_memory = self .config , max_memory
22
+ max_memory = self .config . max_memory
19
23
gpu = self .config .gpu
20
24
workers = self .config .max_worker
21
25
template = self .config .template
@@ -30,12 +34,12 @@ def create_app_wrapper(self):
30
34
# creates a new cluster with the provided or default spec
31
35
def up (self , namespace = 'default' ):
32
36
with oc .project (namespace ):
33
- oc .invoke ("apply" , ["-f" , self .app_wrapper_yaml ])
37
+ oc .invoke ("apply" , ["-f" , self .app_wrapper_yaml ])
34
38
35
39
def down (self , namespace = 'default' ):
36
40
with oc .project (namespace ):
37
- oc .invoke ("delete" ,["AppWrapper" , self .app_wrapper_yaml ])
38
-
41
+ oc .invoke ("delete" , ["AppWrapper" , self .app_wrapper_yaml ])
42
+
39
43
def status (self , print_to_console = True ):
40
44
cluster = _ray_cluster_status (self .config .name )
41
45
if cluster :
@@ -45,6 +49,37 @@ def status(self, print_to_console=True):
45
49
else :
46
50
return None
47
51
52
+ # checks whether the ray cluster is ready
53
+ def is_ready (self ):
54
+ ready = False
55
+ status = CodeFlareClusterStatus .UNKNOWN
56
+ # check the app wrapper status
57
+ appwrapper = _app_wrapper_status (self .config .name )
58
+ if appwrapper :
59
+ if appwrapper .status in [AppWrapperStatus .RUNNING , AppWrapperStatus .COMPLETED , AppWrapperStatus .RUNNING_HOLD_COMPLETION ]:
60
+ ready = False
61
+ status = CodeFlareClusterStatus .QUEUED
62
+ elif appwrapper .status in [AppWrapperStatus .FAILED , AppWrapperStatus .DELETED ]:
63
+ ready = False
64
+ status = CodeFlareClusterStatus .FAILED #should deleted be separate
65
+ return ready , status #exit early, no need to check ray status
66
+ elif appwrapper .status in [AppWrapperStatus .PENDING ]:
67
+ ready = False
68
+ status = CodeFlareClusterStatus .QUEUED
69
+ return ready , status # no need to check the ray status since still in queue
70
+
71
+ # check the ray cluster status
72
+ cluster = _ray_cluster_status (self .config .name )
73
+ if cluster :
74
+ if cluster .status == RayClusterStatus .READY :
75
+ ready = True
76
+ status = CodeFlareClusterStatus .READY
77
+ elif cluster .status in [RayClusterStatus .UNHEALTHY , RayClusterStatus .FAILED ]:
78
+ ready = False
79
+ status = CodeFlareClusterStatus .FAILED
80
+
81
+ return status , ready
82
+
48
83
49
84
def list_all_clusters (print_to_console = True ):
50
85
clusters = _get_ray_clusters ()
@@ -60,13 +95,14 @@ def _get_appwrappers(namespace='default'):
60
95
app_wrappers = oc .selector ('appwrappers' ).qnames ()
61
96
return app_wrappers
62
97
63
-
98
+
64
99
def _app_wrapper_status (name , namespace = 'default' ) -> Optional [AppWrapper ]:
65
100
with oc .project (namespace ), oc .timeout (10 * 60 ):
66
101
cluster = oc .selector (f'appwrapper/{ name } ' ).object ()
67
102
if cluster :
68
103
return _map_to_app_wrapper (cluster )
69
-
104
+
105
+
70
106
def _ray_cluster_status (name , namespace = 'default' ) -> Optional [RayCluster ]:
71
107
# FIXME should we check the appwrapper first
72
108
with oc .project (namespace ), oc .timeout (10 * 60 ):
@@ -87,10 +123,10 @@ def _get_ray_clusters(namespace='default') -> List[RayCluster]:
87
123
return list_of_clusters
88
124
89
125
90
- def _map_to_ray_cluster (cluster )-> RayCluster :
126
+ def _map_to_ray_cluster (cluster ) -> RayCluster :
91
127
cluster_model = cluster .model
92
128
return RayCluster (
93
- name = cluster .name (), status = cluster_model .status .state ,
129
+ name = cluster .name (), status = RayClusterStatus ( cluster_model .status .state . lower ()) ,
94
130
min_workers = cluster_model .spec .workerGroupSpecs [0 ].replicas ,
95
131
max_workers = cluster_model .spec .workerGroupSpecs [0 ].replicas ,
96
132
worker_mem_max = cluster_model .spec .workerGroupSpecs [
@@ -101,9 +137,9 @@ def _map_to_ray_cluster(cluster)->RayCluster:
101
137
worker_gpu = 0 )
102
138
103
139
104
- def _map_to_app_wrapper (cluster )-> AppWrapper :
140
+ def _map_to_app_wrapper (cluster ) -> AppWrapper :
105
141
cluster_model = cluster .model
106
142
return AppWrapper (
107
- name = cluster .name (), status = cluster_model .status .state ,
143
+ name = cluster .name (), status = AppWrapperStatus ( cluster_model .status .state . lower ()) ,
108
144
can_run = cluster_model .status .canrun ,
109
145
job_state = cluster_model .status .queuejobstate )
0 commit comments