Skip to content

Commit d3f9a51

Browse files
committed
test: revert e2e workflow
1 parent: dfeeb39 · commit: d3f9a51

File tree

1 file changed: +23 additions, -124 deletions

.github/workflows/e2e_tests.yaml

Lines changed: 23 additions & 124 deletions
Original file line number | Diff line number | Diff line change
@@ -5,14 +5,13 @@ on:
55
pull_request:
66
branches:
77
- main
8-
- "release-*"
8+
- 'release-*'
99
- ray-jobs-feature
10-
- kueue-integration
1110
paths-ignore:
12-
- "docs/**"
13-
- "**.adoc"
14-
- "**.md"
15-
- "LICENSE"
11+
- 'docs/**'
12+
- '**.adoc'
13+
- '**.md'
14+
- 'LICENSE'
1615

1716
concurrency:
1817
group: ${{ github.head_ref }}-${{ github.workflow }}
@@ -35,9 +34,9 @@ jobs:
3534
- name: Checkout common repo code
3635
uses: actions/checkout@v4
3736
with:
38-
repository: "project-codeflare/codeflare-common"
39-
ref: "main"
40-
path: "common"
37+
repository: 'project-codeflare/codeflare-common'
38+
ref: 'main'
39+
path: 'common'
4140

4241
- name: Checkout CodeFlare operator repository
4342
uses: actions/checkout@v4
@@ -48,7 +47,7 @@ jobs:
4847
- name: Set Go
4948
uses: actions/setup-go@v5
5049
with:
51-
go-version-file: "./codeflare-operator/go.mod"
50+
go-version-file: './codeflare-operator/go.mod'
5251
cache-dependency-path: "./codeflare-operator/go.sum"
5352

5453
- name: Set up gotestfmt
@@ -59,8 +58,8 @@ jobs:
5958
- name: Set up specific Python version
6059
uses: actions/setup-python@v5
6160
with:
62-
python-version: "3.12"
63-
cache: "pip" # caching pip dependencies
61+
python-version: '3.11'
62+
cache: 'pip' # caching pip dependencies
6463

6564
- name: Setup NVidia GPU environment for KinD
6665
uses: ./common/github-actions/nvidia-gpu-setup
@@ -73,28 +72,6 @@ jobs:
7372
- name: Install NVidia GPU operator for KinD
7473
uses: ./common/github-actions/nvidia-gpu-operator
7574

76-
- name: Wait for nodes to be ready
77-
run: |
78-
echo "Waiting for all nodes to be ready..."
79-
kubectl wait --for=condition=Ready nodes --all --timeout=300s
80-
81-
echo "Checking node status..."
82-
kubectl get nodes -o wide
83-
84-
echo "Checking for CNI readiness..."
85-
for i in {1..30}; do
86-
if kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -v "True"; then
87-
echo "Waiting for CNI to initialize (attempt $i/30)..."
88-
sleep 10
89-
else
90-
echo "All nodes are ready!"
91-
break
92-
fi
93-
done
94-
95-
# Final verification
96-
kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"
97-
9875
- name: Deploy CodeFlare stack
9976
id: deploy
10077
run: |
@@ -106,62 +83,27 @@ jobs:
10683
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
10784
cd ..
10885
109-
- name: Verify CodeFlare deployment
110-
run: |
111-
# Wait for Kueue to be ready
112-
echo "Waiting for Kueue controller to be ready..."
113-
kubectl wait --for=condition=Available --timeout=300s deployment -n kueue-system kueue-controller-manager || {
114-
echo "Kueue deployment status:"
115-
kubectl get all -n kueue-system
116-
exit 1
117-
}
118-
119-
# Wait for KubeRay to be ready
120-
echo "Waiting for KubeRay operator to be ready..."
121-
kubectl wait --for=condition=Available --timeout=300s deployment -n default kuberay-operator || {
122-
echo "KubeRay deployment status:"
123-
kubectl get all -n default
124-
exit 1
125-
}
126-
127-
# Verify webhook certificates
128-
echo "Checking CodeFlare operator webhook certificates..."
129-
kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text || {
130-
echo "Warning: Webhook certificate might be missing or invalid"
131-
}
132-
13386
- name: Add user to KinD
13487
uses: ./common/github-actions/kind-add-user
13588
with:
13689
user-name: sdk-user
13790

13891
- name: Configure RBAC for sdk user with limited permissions
13992
run: |
140-
# CRD permissions for discovering resource types
141-
kubectl create clusterrole crd-reader --verb=get,list,watch --resource=customresourcedefinitions.apiextensions.k8s.io
142-
kubectl create clusterrolebinding sdk-user-crd-reader --clusterrole=crd-reader --user=sdk-user
143-
144-
# AppWrapper permissions for CodeFlare workloads
145-
kubectl create clusterrole appwrapper-creator --verb=get,list,watch,create,update,patch,delete --resource=appwrappers.workload.codeflare.dev
146-
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
147-
148-
# Existing permissions
14993
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
15094
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
15195
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
15296
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
153-
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters.ray.io
97+
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
15498
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
155-
kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs/status
156-
kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
157-
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors.kueue.x-k8s.io
99+
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
100+
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
101+
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
158102
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
159-
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues.kueue.x-k8s.io
103+
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
160104
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
161-
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues.kueue.x-k8s.io
105+
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
162106
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
163-
kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
164-
kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
165107
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
166108
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
167109
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -170,72 +112,30 @@ jobs:
170112
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
171113
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
172114
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
173-
kubectl create clusterrole node-reader --verb=get,list --resource=nodes
174-
kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
175115
kubectl config use-context sdk-user
176116
177-
- name: Verify cluster readiness before tests
178-
run: |
179-
echo "=== Pre-test cluster verification ==="
180-
echo "Current context:"
181-
kubectl config current-context
182-
183-
echo -e "\nNode status:"
184-
kubectl get nodes -o wide
185-
186-
echo -e "\nSystem pods status:"
187-
kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true
188-
189-
echo -e "\nChecking for any pods in error state:"
190-
kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"
191-
192-
echo -e "\nKueue resources:"
193-
kubectl get resourceflavors,clusterqueues,localqueues -A || true
194-
195-
echo -e "\nRay CRDs:"
196-
kubectl get crd | grep ray || true
197-
198117
- name: Run e2e tests
199118
run: |
200-
export CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
201-
mkdir -p ${CODEFLARE_TEST_OUTPUT_DIR}
119+
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
202120
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
203121
204122
set -euo pipefail
205123
pip install poetry
206124
poetry install --with test,docs
207125
echo "Running e2e tests..."
208-
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
209-
env:
210-
GRPC_DNS_RESOLVER: "native"
211-
212-
- name: Run RayJob e2e tests
213-
run: |
214-
set -euo pipefail
215-
echo "Running RayJob e2e tests..."
216-
# Set environment variable to prevent default queue assignment for non-Kueue tests
217-
export DISABLE_DEFAULT_KUEUE_QUEUE=true
218-
219-
# Run only the tests that are designed for Kueue integration
220-
poetry run pytest -v -s ./tests/e2e/rayjob/ -x > ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log 2>&1
126+
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
221127
env:
222128
GRPC_DNS_RESOLVER: "native"
223129

224130
- name: Switch to kind-cluster context to print logs
225131
if: always() && steps.deploy.outcome == 'success'
226132
run: kubectl config use-context kind-cluster
227133

228-
- name: Print RayJob E2E Pytest output log
229-
if: always() && steps.deploy.outcome == 'success'
230-
run: |
231-
echo "Printing RayJob Pytest output logs"
232-
cat ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log || echo "No RayJob test output found"
233-
234-
- name: Print E2E Pytest output log
134+
- name: Print Pytest output log
235135
if: always() && steps.deploy.outcome == 'success'
236136
run: |
237-
echo "Printing E2E Pytest output logs"
238-
cat ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log || echo "No E2E test output found"
137+
echo "Printing Pytest output logs"
138+
cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
239139
240140
- name: Print CodeFlare operator logs
241141
if: always() && steps.deploy.outcome == 'success'
@@ -253,7 +153,7 @@ jobs:
253153
uses: ./common/github-actions/kind-export-logs
254154
if: always() && steps.deploy.outcome == 'success'
255155
with:
256-
output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
156+
output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
257157

258158
- name: Upload logs
259159
uses: actions/upload-artifact@v4
@@ -263,4 +163,3 @@ jobs:
263163
retention-days: 10
264164
path: |
265165
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
266-
if-no-files-found: warn

0 commit comments

Comments (0)