5
5
pull_request:
  # Pull requests must target one of these base branches to trigger a run.
  branches:
    - main
    - "release-*"
    - ray-jobs-feature
    - kueue-integration
  # Changes that touch only these paths do not trigger a run.
  paths-ignore:
    - "docs/**"
    - "**.adoc"
    - "**.md"
    - "LICENSE"
15
16
16
17
concurrency:
  # Runs sharing the same head ref and workflow name form one group;
  # starting a new run cancels the one already in progress.
  cancel-in-progress: true
  group: ${{ github.head_ref }}-${{ github.workflow }}
19
20
20
21
env:
  # Workflow-wide environment variables, visible to every job and step.
  # NOTE(review): neither variable is referenced in the visible part of this
  # workflow - presumably they are consumed by the deploy step or by make
  # targets; confirm before renaming.
  KUEUE_VERSION: "v0.13.4"
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
22
24
23
25
jobs :
24
26
kubernetes :
33
35
- name: Checkout common repo code
  uses: actions/checkout@v4
  with:
    # Checked out into ./common; later steps load local actions from
    # ./common/github-actions/*.
    path: "common"
    repository: "project-codeflare/codeflare-common"
    ref: "main"
39
41
40
42
- name : Checkout CodeFlare operator repository
41
43
uses : actions/checkout@v4
46
48
- name: Set Go
  uses: actions/setup-go@v5
  with:
    # Both the Go version and the dependency-cache key are taken from files
    # inside the ./codeflare-operator checkout.
    cache-dependency-path: "./codeflare-operator/go.sum"
    go-version-file: "./codeflare-operator/go.mod"
51
53
52
54
- name : Set up gotestfmt
57
59
- name: Set up specific Python version
  uses: actions/setup-python@v5
  with:
    cache: "pip" # caching pip dependencies
    # Quoted so the version stays a string rather than a YAML float.
    python-version: "3.12"
62
64
63
65
- name : Setup NVidia GPU environment for KinD
64
66
uses : ./common/github-actions/nvidia-gpu-setup
71
73
# Local action loaded from the ./common checkout of codeflare-common.
- name: Install NVidia GPU operator for KinD
  uses: ./common/github-actions/nvidia-gpu-operator
73
75
76
- name: Wait for nodes to be ready
  run: |
    # Block until every node reports Ready. The runner's default shell is
    # `bash -e`, so a timeout here already fails the step.
    echo "Waiting for all nodes to be ready..."
    kubectl wait --for=condition=Ready nodes --all --timeout=300s

    echo "Checking node status..."
    kubectl get nodes -o wide

    # Re-check the Ready condition for up to 30 attempts, 10s apart.
    # A node counts as ready only if its status line contains "True".
    # A failed or empty kubectl call is treated as "not ready" instead of
    # being mistaken for "no unready nodes".
    echo "Checking for CNI readiness..."
    nodes_ready=false
    for i in {1..30}; do
      if node_status=$(kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}') \
        && [ -n "${node_status}" ] \
        && ! grep -v "True" <<<"${node_status}"; then
        echo "All nodes are ready!"
        nodes_ready=true
        break
      fi
      echo "Waiting for CNI to initialize (attempt $i/30)..."
      sleep 10
    done

    # Fail loudly instead of falling through when the retries are exhausted.
    if [ "${nodes_ready}" != "true" ]; then
      echo "ERROR: nodes did not report Ready after 30 attempts" >&2
      kubectl get nodes -o wide || true
      exit 1
    fi

    # Final verification (diagnostic only: a non-matching grep must not
    # fail the step)
    kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR" || true
97
+
74
98
- name : Deploy CodeFlare stack
75
99
id : deploy
76
100
run : |
@@ -82,27 +106,62 @@ jobs:
82
106
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
83
107
cd ..
84
108
109
- name: Verify CodeFlare deployment
  run: |
    # wait_for_deployment <wait label> <status label> <namespace> <deployment>
    # Waits up to 300s for the deployment to become Available; on failure
    # it lists everything in the namespace and aborts the step.
    wait_for_deployment() {
      local wait_label="$1" status_label="$2" namespace="$3" deployment="$4"
      echo "Waiting for ${wait_label} to be ready..."
      if ! kubectl wait --for=condition=Available --timeout=300s deployment -n "${namespace}" "${deployment}"; then
        echo "${status_label} deployment status:"
        kubectl get all -n "${namespace}"
        exit 1
      fi
    }

    # Kueue controller, then the KubeRay operator
    wait_for_deployment "Kueue controller" "Kueue" kueue-system kueue-controller-manager
    wait_for_deployment "KubeRay operator" "KubeRay" default kuberay-operator

    # Decode and print the webhook CA certificate. A failure anywhere in
    # this pipeline only produces a warning; it does not fail the step.
    echo "Checking CodeFlare operator webhook certificates..."
    if ! kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text; then
      echo "Warning: Webhook certificate might be missing or invalid"
    fi
132
+
85
133
# `sdk-user` is the identity the RBAC step binds its cluster roles to and the
# kubectl context that step switches to before the tests run.
- name: Add user to KinD
  with:
    user-name: sdk-user
  uses: ./common/github-actions/kind-add-user
89
137
90
138
- name : Configure RBAC for sdk user with limited permissions
91
139
run : |
140
+ # CRD permissions for discovering resource types
141
+ kubectl create clusterrole crd-reader --verb=get,list,watch --resource=customresourcedefinitions.apiextensions.k8s.io
142
+ kubectl create clusterrolebinding sdk-user-crd-reader --clusterrole=crd-reader --user=sdk-user
143
+
144
+ # AppWrapper permissions for CodeFlare workloads
145
+ kubectl create clusterrole appwrapper-creator --verb=get,list,watch,create,update,patch,delete --resource=appwrappers.workload.codeflare.dev
146
+ kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
147
+
148
+ # Existing permissions
92
149
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
93
150
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
94
151
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
95
152
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
96
- kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
153
+ kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters.ray.io
97
154
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
98
- kubectl create clusterrole appwrapper -creator --verb=get,list,create,delete,patch --resource=appwrappers
99
- kubectl create clusterrolebinding sdk-user-appwrapper -creator --clusterrole=appwrapper -creator --user=sdk-user
100
- kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
155
+ kubectl create clusterrole rayjob -creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs/status
156
+ kubectl create clusterrolebinding sdk-user-rayjob -creator --clusterrole=rayjob -creator --user=sdk-user
157
+ kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors.kueue.x-k8s.io
101
158
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
102
- kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
159
+ kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues.kueue.x-k8s.io
103
160
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
104
- kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
161
+ kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues.kueue.x-k8s.io
105
162
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
163
+ kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
164
+ kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
106
165
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
107
166
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
108
167
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -111,30 +170,72 @@ jobs:
111
170
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
112
171
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
113
172
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
173
+ kubectl create clusterrole node-reader --verb=get,list --resource=nodes
174
+ kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
114
175
kubectl config use-context sdk-user
115
176
177
- name: Verify cluster readiness before tests
  run: |
    # Diagnostic snapshot taken under the current kubectl context (the RBAC
    # step switches to sdk-user just before this). Commands guarded with
    # `|| true` / `|| echo` never fail the step; the unguarded ones do.
    heading() { echo -e "\n$1"; }

    echo "=== Pre-test cluster verification ==="
    echo "Current context:"
    kubectl config current-context

    heading "Node status:"
    kubectl get nodes -o wide

    heading "System pods status:"
    kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true

    # Anything neither Running nor Succeeded is listed here.
    heading "Checking for any pods in error state:"
    kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"

    heading "Kueue resources:"
    kubectl get resourceflavors,clusterqueues,localqueues -A || true

    heading "Ray CRDs:"
    kubectl get crd | grep ray || true
197
+
116
198
- name: Run e2e tests
  run: |
    # Strict mode first, so a failing mkdir or GITHUB_ENV write aborts the
    # step before any test runs.
    set -euo pipefail

    # Log directory for this run. It is exported through GITHUB_ENV so the
    # later log-printing steps can find the same path.
    export CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
    mkdir -p "${CODEFLARE_TEST_OUTPUT_DIR}"
    echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> "${GITHUB_ENV}"

    pip install poetry
    poetry install --with test,docs
    echo "Running e2e tests..."
    # stdout and stderr both go to the log file; a later step prints it.
    poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > "${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log" 2>&1
  env:
    GRPC_DNS_RESOLVER: "native"
211
+
212
- name: Run RayJob e2e tests
  run: |
    set -euo pipefail
    echo "Running RayJob e2e tests..."
    # Exported for the pytest process.
    # NOTE(review): per the original comment this prevents default Kueue
    # queue assignment; confirm how the SDK/tests read it.
    export DISABLE_DEFAULT_KUEUE_QUEUE=true

    # Runs every test under tests/e2e/rayjob/; -x stops at the first failure.
    # CODEFLARE_TEST_OUTPUT_DIR arrives via GITHUB_ENV from the "Run e2e
    # tests" step; `set -u` aborts here if it is missing.
    poetry run pytest -v -s ./tests/e2e/rayjob/ -x > "${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log" 2>&1
  env:
    GRPC_DNS_RESOLVER: "native"
128
223
129
224
# always() keeps this step running after a test failure; it is skipped only
# when the `deploy` step did not succeed.
- name: Switch to kind-cluster context to print logs
  run: kubectl config use-context kind-cluster
  if: always() && steps.deploy.outcome == 'success'
132
227
133
- - name : Print Pytest output log
228
- name: Print RayJob E2E Pytest output log
  run: |
    echo "Printing RayJob Pytest output logs"
    # A missing log (e.g. the RayJob step never ran) is reported, not fatal.
    if ! cat ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log; then
      echo "No RayJob test output found"
    fi
  if: always() && steps.deploy.outcome == 'success'
233
+
234
- name: Print E2E Pytest output log
  if: always() && steps.deploy.outcome == 'success'
  run: |
    echo "Printing E2E Pytest output logs"
    # The file name must match the redirect target used by the "Run e2e
    # tests" step (e2e-pytest_output.log). The path is quoted so it is
    # passed to cat as a single argument.
    cat "${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log" || echo "No E2E test output found"
138
239
139
240
- name : Print CodeFlare operator logs
140
241
if : always() && steps.deploy.outcome == 'success'
@@ -152,7 +253,7 @@ jobs:
152
253
uses : ./common/github-actions/kind-export-logs
153
254
if : always() && steps.deploy.outcome == 'success'
154
255
with :
155
- output-directory : ${CODEFLARE_TEST_OUTPUT_DIR}
256
+ output-directory : ${{ env. CODEFLARE_TEST_OUTPUT_DIR } }
156
257
157
258
- name : Upload logs
158
259
uses : actions/upload-artifact@v4
@@ -162,3 +263,4 @@ jobs:
162
263
retention-days : 10
163
264
path : |
164
265
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
266
+ if-no-files-found : warn
0 commit comments