Skip to content

Commit 0c2fe88

Browse files
committed
RHOAIENG-32532: Run RayJob tests in CI
1 parent 398e51b commit 0c2fe88

File tree

1 file changed

+210
-0
lines changed

1 file changed

+210
-0
lines changed
Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
# CI workflow that runs the RayJob end-to-end tests with Kueue.
name: rayjob-e2e-with-kueue

on:
  pull_request:
    # Only pull requests targeting these base branches trigger the workflow.
    branches:
      - 'main'
      - 'release-*'
      - 'ray-jobs-feature'
    # Pull requests that touch nothing but these paths do not trigger a run.
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'
14+
15+
# Runs are grouped by pull-request head branch plus workflow name; starting a
# new run in the same group cancels the one still in progress.
concurrency:
  group: '${{ github.head_ref }}-${{ github.workflow }}'
  cancel-in-progress: true
18+
19+
# Workflow-level environment variables, available to every step.
env:
  # NOTE(review): not referenced by any step visible in this workflow;
  # presumably read by the tests or the composite actions - confirm.
  CODEFLARE_OPERATOR_IMG: 'quay.io/project-codeflare/codeflare-operator:dev'
  # Kueue release whose manifests.yaml is applied by the "Deploy Kueue" step.
  KUEUE_VERSION: 'v0.13.3'
22+
23+
jobs:
  # Stands up a KinD cluster, installs Kueue and the KubeRay operator, then
  # runs the RayJob e2e tests under the restricted `sdk-user` kube context.
  kubernetes:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      # Runs before any step that can fail so that CODEFLARE_TEST_OUTPUT_DIR is
      # always defined for the `if: always()` log steps below. With an empty
      # value they would write to "/" and the upload glob would be "/**/*.log".
      - name: Setup test output directory
        run: |
          CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
          mkdir -p "${CODEFLARE_TEST_OUTPUT_DIR}"
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> "$GITHUB_ENV"

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'  # caching pip dependencies

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 2  # Multiple nodes for testing Kueue scheduling

      - name: Verify Kind cluster
        run: |
          echo "Checking Kind clusters..."
          kind get clusters
          echo "Current kubectl context:"
          kubectl config current-context
          echo "Checking nodes:"
          kubectl get nodes

      - name: Deploy Kueue
        run: |
          echo "Deploying Kueue ${KUEUE_VERSION}"
          kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml"

          # Poll until every pod in kueue-system reports Ready=True, but give up
          # after a deadline instead of spinning until the job-level timeout.
          KUEUE_READY_TIMEOUT_SECONDS=300
          deadline=$((SECONDS + KUEUE_READY_TIMEOUT_SECONDS))
          echo "Waiting for pods in the kueue-system namespace to become ready"
          until [[ "$(kubectl get pods -n kueue-system -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u)" == "True" ]]
          do
            if (( SECONDS >= deadline )); then
              echo ""
              echo "Timed out after ${KUEUE_READY_TIMEOUT_SECONDS}s waiting for Kueue pods to become ready"
              kubectl get pods -n kueue-system || true
              exit 1
            fi
            echo -n "." && sleep 1
          done
          echo ""

          sleep 5

      - name: Deploy KubeRay operator
        run: |
          KUBERAY_VERSION="v1.4.0"
          echo "Deploying KubeRay ${KUBERAY_VERSION}"

          # Create namespace first
          kubectl create namespace ray-system || true

          kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}&timeout=180s"

          # Check all namespaces to see where KubeRay was deployed
          echo "Checking for KubeRay deployment in all namespaces..."
          kubectl get deployments -A | grep kuberay || true

          # Locate the namespace that actually holds the operator deployment.
          kuberay_ns=""
          for candidate in ray-system default kuberay-system; do
            if kubectl get deployment/kuberay-operator -n "${candidate}" > /dev/null 2>&1; then
              kuberay_ns="${candidate}"
              break
            fi
          done

          if [[ -z "${kuberay_ns}" ]]; then
            echo "KubeRay operator deployment not found in ray-system, default or kuberay-system"
            kubectl get deployments -A || true
            exit 1
          fi

          # Fail the step when the operator never becomes Available, rather
          # than letting the tests start against a cluster without it.
          echo "Waiting for KubeRay operator to become ready in namespace ${kuberay_ns}..."
          if ! kubectl wait --for=condition=Available --timeout=300s deployment/kuberay-operator -n "${kuberay_ns}"; then
            echo "KubeRay operator did not become Available in ${kuberay_ns}"
            kubectl get pods -n "${kuberay_ns}" || true
            exit 1
          fi

          # Remember the namespace for the log-collection step.
          echo "KUBERAY_NAMESPACE=${kuberay_ns}" >> "$GITHUB_ENV"

      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      - name: Configure RBAC for sdk user with limited permissions
        run: |
          # Basic permissions
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
          kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
          kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
          kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user

          # Ray permissions
          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters
          kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
          kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs,rayjobs/status
          kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user

          # Kueue permissions
          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
          kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
          kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
          kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
          kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads
          kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user

          # Additional permissions
          kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
          kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
          kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
          kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
          kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
          kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
          kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
          kubectl create clusterrole node-reader --verb=get,list --resource=nodes
          kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user

          # Everything after this step talks to the cluster as sdk-user.
          kubectl config use-context sdk-user

      - name: Run RayJob e2e tests
        run: |
          set -euo pipefail
          pip install poetry
          poetry install --with test,docs

          # Install the SDK in editable mode
          pip install -e .

          echo "Running RayJob e2e tests..."
          # Set environment variable to prevent default queue assignment for non-Kueue tests
          export DISABLE_DEFAULT_KUEUE_QUEUE=true

          # Run only the tests that are designed for Kueue integration.
          # Output goes to a file; the "Print Pytest output log" step prints it.
          poetry run pytest -v -s ./tests/e2e/rayjob/rayjob_existing_cluster_test.py ./tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py -x > "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log" 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"

      - name: Switch to kind-cluster context to print logs
        if: always()
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always()
        run: |
          echo "Printing Pytest output logs"
          cat "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log" || true

      - name: Print Kueue operator logs
        if: always()
        run: |
          echo "Printing Kueue operator logs"
          kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee "${CODEFLARE_TEST_OUTPUT_DIR}/kueue-operator.log" || true

      - name: Print KubeRay operator logs
        if: always()
        run: |
          echo "Printing KubeRay operator logs"
          # KUBERAY_NAMESPACE is exported by the deploy step; fall back to
          # ray-system when that step failed before exporting it.
          kuberay_ns="${KUBERAY_NAMESPACE:-ray-system}"
          kuberay_log="${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log"

          echo "Checking ${kuberay_ns} namespace contents:"
          kubectl get all -n "${kuberay_ns}" || true

          echo "Attempting to get KubeRay logs with different selectors:"
          # Try each target in turn and accept the first that yields output.
          # A `kubectl logs | tee || ...` chain cannot do this: without
          # pipefail the pipeline status is tee's, so no fallback ever runs.
          logs_found=false
          for target in \
            "-l app.kubernetes.io/name=kuberay-operator" \
            "-l app.kubernetes.io/component=kuberay-operator" \
            "deployment/kuberay-operator"
          do
            # ${target} is deliberately unquoted so "-l <selector>" splits into two arguments.
            if kubectl logs -n "${kuberay_ns}" --tail -1 ${target} > "${kuberay_log}" && [[ -s "${kuberay_log}" ]]; then
              cat "${kuberay_log}"
              logs_found=true
              break
            fi
          done

          if [[ "${logs_found}" != "true" ]]; then
            echo "Could not find KubeRay operator logs"
          fi

      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always()
        with:
          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
          if-no-files-found: warn

0 commit comments

Comments
 (0)