Skip to content

Commit 22f507d

Browse files
committed
RHOAIENG-32532: Fix broken E2E tests
1 parent 33fa535 commit 22f507d

File tree

15 files changed

+193
-318
lines changed

15 files changed

+193
-318
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 125 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -5,20 +5,22 @@ on:
55
pull_request:
66
branches:
77
- main
8-
- 'release-*'
8+
- "release-*"
99
- ray-jobs-feature
10+
- kueue-integration
1011
paths-ignore:
11-
- 'docs/**'
12-
- '**.adoc'
13-
- '**.md'
14-
- 'LICENSE'
12+
- "docs/**"
13+
- "**.adoc"
14+
- "**.md"
15+
- "LICENSE"
1516

1617
concurrency:
1718
group: ${{ github.head_ref }}-${{ github.workflow }}
1819
cancel-in-progress: true
1920

2021
env:
2122
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
23+
KUEUE_VERSION: "v0.13.4"
2224

2325
jobs:
2426
kubernetes:
@@ -33,9 +35,9 @@ jobs:
3335
- name: Checkout common repo code
3436
uses: actions/checkout@v4
3537
with:
36-
repository: 'project-codeflare/codeflare-common'
37-
ref: 'main'
38-
path: 'common'
38+
repository: "project-codeflare/codeflare-common"
39+
ref: "main"
40+
path: "common"
3941

4042
- name: Checkout CodeFlare operator repository
4143
uses: actions/checkout@v4
@@ -46,7 +48,7 @@ jobs:
4648
- name: Set Go
4749
uses: actions/setup-go@v5
4850
with:
49-
go-version-file: './codeflare-operator/go.mod'
51+
go-version-file: "./codeflare-operator/go.mod"
5052
cache-dependency-path: "./codeflare-operator/go.sum"
5153

5254
- name: Set up gotestfmt
@@ -57,8 +59,8 @@ jobs:
5759
- name: Set up specific Python version
5860
uses: actions/setup-python@v5
5961
with:
60-
python-version: '3.11'
61-
cache: 'pip' # caching pip dependencies
62+
python-version: "3.12"
63+
cache: "pip" # caching pip dependencies
6264

6365
- name: Setup NVidia GPU environment for KinD
6466
uses: ./common/github-actions/nvidia-gpu-setup
@@ -71,6 +73,28 @@ jobs:
7173
- name: Install NVidia GPU operator for KinD
7274
uses: ./common/github-actions/nvidia-gpu-operator
7375

76+
# Gate the rest of the job on every cluster node reporting Ready.
- name: Wait for nodes to be ready
77+
run: |
78+
echo "Waiting for all nodes to be ready..."
79+
# kubectl wait exits non-zero if any node is still not Ready after 300s.
kubectl wait --for=condition=Ready nodes --all --timeout=300s
80+
81+
echo "Checking node status..."
82+
kubectl get nodes -o wide
83+
84+
echo "Checking for CNI readiness..."
85+
# Re-check up to 30 times, 10s apart. The jsonpath prints one
# "<node-name>\t<Ready status>" line per node; grep -v "True" succeeds only
# while at least one node's Ready status is something other than "True".
for i in {1..30}; do
86+
if kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -v "True"; then
87+
echo "Waiting for CNI to initialize (attempt $i/30)..."
88+
# Retry budget spent: fail the step instead of silently falling through
# to later steps with nodes that never became Ready.
if [ "$i" -eq 30 ]; then
echo "ERROR: nodes still not Ready after 30 attempts" >&2
exit 1
fi
sleep 10
89+
else
90+
echo "All nodes are ready!"
91+
break
92+
fi
93+
done
94+
95+
# Final verification
96+
kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"
97+
7498
- name: Deploy CodeFlare stack
7599
id: deploy
76100
run: |
@@ -82,27 +106,62 @@ jobs:
82106
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
83107
cd ..
84108
109+
# Post-deploy gate: block until the Kueue and KubeRay operator Deployments are
# Available, then inspect the CodeFlare operator webhook certificate.
- name: Verify CodeFlare deployment
110+
run: |
111+
# Wait for Kueue to be ready
# If the wait fails (300s timeout or error), dump everything in kueue-system
# for debugging and fail the step.
112+
echo "Waiting for Kueue controller to be ready..."
113+
kubectl wait --for=condition=Available --timeout=300s deployment -n kueue-system kueue-controller-manager || {
114+
echo "Kueue deployment status:"
115+
kubectl get all -n kueue-system
116+
exit 1
117+
}
118+
119+
# Wait for KubeRay to be ready
# Same pattern as above, but the operator is looked up in the default namespace.
120+
echo "Waiting for KubeRay operator to be ready..."
121+
kubectl wait --for=condition=Available --timeout=300s deployment -n default kuberay-operator || {
122+
echo "KubeRay deployment status:"
123+
kubectl get all -n default
124+
exit 1
125+
}
126+
127+
# Verify webhook certificates
# Decodes ca.crt from the secret and prints it with openssl. Non-fatal: the
# fallback block only prints a warning and does not exit.
128+
echo "Checking CodeFlare operator webhook certificates..."
129+
kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text || {
130+
echo "Warning: Webhook certificate might be missing or invalid"
131+
}
132+
85133
# Runs the kind-add-user action from the checked-out common repo (./common)
# with user-name sdk-user; the following RBAC step binds roles to that user.
# NOTE(review): presumably this also creates the sdk-user kubeconfig context
# that the job later switches to - confirm against the action.
- name: Add user to KinD
86134
uses: ./common/github-actions/kind-add-user
87135
with:
88136
user-name: sdk-user
89137

90138
- name: Configure RBAC for sdk user with limited permissions
91139
run: |
140+
# CRD permissions for discovering resource types
141+
kubectl create clusterrole crd-reader --verb=get,list,watch --resource=customresourcedefinitions.apiextensions.k8s.io
142+
kubectl create clusterrolebinding sdk-user-crd-reader --clusterrole=crd-reader --user=sdk-user
143+
144+
# AppWrapper permissions for CodeFlare workloads
145+
kubectl create clusterrole appwrapper-creator --verb=get,list,watch,create,update,patch,delete --resource=appwrappers.workload.codeflare.dev
146+
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
147+
148+
# Existing permissions
92149
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
93150
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
94151
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
95152
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
96-
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
153+
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters.ray.io
97154
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
98-
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
99-
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
100-
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
155+
# RayJob permissions for sdk-user. The status subresource is written with its
# API group (rayjobs.ray.io/status) so the rule does not depend on kubectl
# resolving an unqualified resource name, matching the group-qualified form
# used by the other custom-resource --resource flags in this step.
kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs.ray.io/status
156+
kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
157+
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors.kueue.x-k8s.io
101158
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
102-
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
159+
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues.kueue.x-k8s.io
103160
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
104-
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
161+
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues.kueue.x-k8s.io
105162
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
163+
kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
164+
kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
106165
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
107166
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
108167
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -111,30 +170,72 @@ jobs:
111170
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
112171
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
113172
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
173+
kubectl create clusterrole node-reader --verb=get,list --resource=nodes
174+
kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
114175
kubectl config use-context sdk-user
115176
177+
# Diagnostic snapshot printed just before the tests; every query after the
# node listing is guarded with `|| true` or an `|| echo` fallback, so empty
# results there do not fail the step.
# NOTE(review): the preceding RBAC step ends with
# `kubectl config use-context sdk-user`, so these commands presumably run with
# sdk-user's limited permissions - confirm that is intended.
- name: Verify cluster readiness before tests
178+
run: |
179+
echo "=== Pre-test cluster verification ==="
180+
echo "Current context:"
181+
kubectl config current-context
182+
183+
echo -e "\nNode status:"
184+
kubectl get nodes -o wide
185+
186+
echo -e "\nSystem pods status:"
187+
# NOTE(review): the "Verify CodeFlare deployment" step waits for
# kuberay-operator in the default namespace, so the ray-system alternative
# here may match nothing - confirm.
kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true
188+
189+
echo -e "\nChecking for any pods in error state:"
190+
# grep -v drops the header row; if no lines remain, grep exits non-zero and
# the fallback message is printed instead.
kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"
191+
192+
echo -e "\nKueue resources:"
193+
kubectl get resourceflavors,clusterqueues,localqueues -A || true
194+
195+
echo -e "\nRay CRDs:"
196+
kubectl get crd | grep ray || true
197+
116198
# Installs the test dependencies with Poetry and runs the e2e tests selected by
# the 'kind and nvidia_gpu' marker expression, redirecting stdout and stderr
# to a log file under CODEFLARE_TEST_OUTPUT_DIR.
- name: Run e2e tests
117199
run: |
118-
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
200+
export CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
201+
mkdir -p ${CODEFLARE_TEST_OUTPUT_DIR}
119202
# Appending to $GITHUB_ENV makes the variable available to later steps.
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
120203
121204
# Strict mode applies only from this point on; the export/mkdir/echo above
# run under whatever options the step's shell started with.
set -euo pipefail
122205
pip install poetry
123206
poetry install --with test,docs
124207
echo "Running e2e tests..."
125-
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
208+
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
209+
env:
210+
GRPC_DNS_RESOLVER: "native"
211+
212+
# Second pytest pass covering ./tests/e2e/rayjob/. It reuses the Poetry
# environment installed in "Run e2e tests" and reads CODEFLARE_TEST_OUTPUT_DIR,
# which that step wrote to $GITHUB_ENV. There is no `if:` condition, so this
# step is skipped when an earlier step fails.
- name: Run RayJob e2e tests
213+
run: |
214+
set -euo pipefail
215+
echo "Running RayJob e2e tests..."
216+
# Set DISABLE_DEFAULT_KUEUE_QUEUE=true for the test process.
# NOTE(review): presumably this stops the SDK from assigning a default Kueue
# queue - confirm against the code that reads this variable.
217+
export DISABLE_DEFAULT_KUEUE_QUEUE=true
218+
219+
# Run every test under ./tests/e2e/rayjob/; -x stops at the first failure.
# stdout and stderr go to rayjob_pytest_output.log.
220+
poetry run pytest -v -s ./tests/e2e/rayjob/ -x > ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log 2>&1
126221
env:
127222
GRPC_DNS_RESOLVER: "native"
128223

129224
# Runs even when earlier steps failed (always()), provided the deploy step
# succeeded. The RBAC step left kubectl on the sdk-user context; this switches
# to kind-cluster before the log-printing steps.
# NOTE(review): presumably kind-cluster is the admin context - confirm.
- name: Switch to kind-cluster context to print logs
130225
if: always() && steps.deploy.outcome == 'success'
131226
run: kubectl config use-context kind-cluster
132227

133-
- name: Print Pytest output log
228+
# Prints the RayJob pytest log into the job output. Runs after failures too
# (always()), as long as the deploy step succeeded.
- name: Print RayJob E2E Pytest output log
229+
if: always() && steps.deploy.outcome == 'success'
230+
run: |
231+
echo "Printing RayJob Pytest output logs"
232+
# If cat fails (for example, the log file was never written), print a
# notice instead of failing the step.
cat ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log || echo "No RayJob test output found"
233+
234+
# Prints the main e2e pytest log into the job output. Runs after failures too
# (always()), as long as the deploy step succeeded.
- name: Print E2E Pytest output log
134235
if: always() && steps.deploy.outcome == 'success'
135236
run: |
136-
echo "Printing Pytest output logs"
137-
cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
237+
echo "Printing E2E Pytest output logs"
238+
# If cat fails (for example, the log file was never written), print a
# notice instead of failing the step.
cat ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log || echo "No E2E test output found"
138239
139240
- name: Print CodeFlare operator logs
140241
if: always() && steps.deploy.outcome == 'success'
@@ -152,7 +253,7 @@ jobs:
152253
uses: ./common/github-actions/kind-export-logs
153254
if: always() && steps.deploy.outcome == 'success'
154255
with:
155-
output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
256+
output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
156257

157258
- name: Upload logs
158259
uses: actions/upload-artifact@v4
@@ -162,3 +263,4 @@ jobs:
162263
retention-days: 10
163264
path: |
164265
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
266+
if-no-files-found: warn

0 commit comments

Comments
 (0)