# RHOAIENG-32532: Update kueue integration #1929
# Workflow file for this run
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# e2e tests workflow for CodeFlare-SDK
name: e2e

# Triggered by pull requests that target one of the listed branches.
# A PR whose changed files all match `paths-ignore` does not start a run.
on:
  pull_request:
    branches:
      - main
      - 'release-*'
      - ray-jobs-feature
      - kueue-integration
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'

# Runs are grouped by PR head branch plus workflow name; a newer run in the
# same group cancels the one still in progress.
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

# Workflow-level environment variables.
env:
  # Operator image passed to `make deploy` in the deploy step.
  CODEFLARE_OPERATOR_IMG: 'quay.io/project-codeflare/codeflare-operator:dev'
  # Not referenced directly in this file; presumably consumed by the
  # operator's make targets - confirm.
  KUEUE_VERSION: 'v0.13.4'
jobs:
  # Brings up a KinD cluster with NVidia GPU support, deploys the CodeFlare
  # operator, runs the SDK e2e suites as the limited-permission `sdk-user`,
  # then prints and uploads logs.
  kubernetes:
    runs-on: gpu-t4-4-core

    steps:
      # --- Source checkouts ------------------------------------------------
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Provides the local composite actions referenced as ./common/... below.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: "project-codeflare/codeflare-common"
          ref: "main"
          path: "common"

      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator

      # --- Toolchains ------------------------------------------------------
      # Go version and module cache key both come from the operator checkout.
      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: "./codeflare-operator/go.mod"
          cache-dependency-path: "./codeflare-operator/go.sum"

      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: "pip" # caching pip dependencies

      # --- Cluster provisioning --------------------------------------------
      - name: Setup NVidia GPU environment for KinD
        uses: ./common/github-actions/nvidia-gpu-setup

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 1

      - name: Install NVidia GPU operator for KinD
        uses: ./common/github-actions/nvidia-gpu-operator

      # Blocks until every node reports Ready, then polls up to 30 times
      # (10 s apart) while any node's Ready condition is not "True".
      - name: Wait for nodes to be ready
        run: |
          echo "Waiting for all nodes to be ready..."
          kubectl wait --for=condition=Ready nodes --all --timeout=300s
          echo "Checking node status..."
          kubectl get nodes -o wide
          echo "Checking for CNI readiness..."
          for i in {1..30}; do
            if kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -v "True"; then
              echo "Waiting for CNI to initialize (attempt $i/30)..."
              sleep 10
            else
              echo "All nodes are ready!"
              break
            fi
          done
          # Final verification
          kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"

      # --- CodeFlare stack -------------------------------------------------
      # This step and "Add user to KinD" used to be listed twice. Step ids
      # must be unique within a job, so the second `id: deploy` made the
      # workflow invalid. A single copy is kept, after the node-readiness
      # wait. The log steps below key off `steps.deploy.outcome`.
      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e
          echo Deploying CodeFlare operator
          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..

      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      # Creates one ClusterRole + ClusterRoleBinding pair per resource type
      # for `sdk-user`, then switches the kubectl context to that user so the
      # following steps run with these permissions.
      - name: Configure RBAC for sdk user with limited permissions
        run: |
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
          kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
          kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
          kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters
          kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
          kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs,rayjobs/status
          kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
          kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
          kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
          kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
          kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads
          kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
          kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
          kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
          kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
          kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
          kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
          kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
          kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
          kubectl create clusterrole node-reader --verb=get,list --resource=nodes
          kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
          kubectl config use-context sdk-user

      # Diagnostic snapshot only: the listing commands end in `|| true` or an
      # `|| echo` fallback so empty results do not fail the step.
      - name: Verify cluster readiness before tests
        run: |
          echo "=== Pre-test cluster verification ==="
          echo "Current context:"
          kubectl config current-context
          echo -e "\nNode status:"
          kubectl get nodes -o wide
          echo -e "\nSystem pods status:"
          kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true
          echo -e "\nChecking for any pods in error state:"
          kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"
          echo -e "\nKueue resources:"
          kubectl get resourceflavors,clusterqueues,localqueues -A || true
          echo -e "\nRay CRDs:"
          kubectl get crd | grep ray || true

      # --- Tests -----------------------------------------------------------
      # Publishes CODEFLARE_TEST_OUTPUT_DIR through GITHUB_ENV so later steps
      # can find the logs. TEMP_DIR is not defined in this file; presumably
      # it is exported by an earlier action - confirm.
      - name: Run e2e tests
        run: |
          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
          set -euo pipefail
          pip install poetry
          poetry install --with test,docs
          echo "Running e2e tests..."
          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"

      # `-x` stops pytest at the first failing test.
      - name: Run RayJob e2e tests
        run: |
          set -euo pipefail
          echo "Running RayJob e2e tests..."
          # Set environment variable to prevent default queue assignment for non-Kueue tests
          export DISABLE_DEFAULT_KUEUE_QUEUE=true
          # Run only the tests that are designed for Kueue integration
          poetry run pytest -v -s ./tests/e2e/rayjob/ -x > ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"

      # --- Log collection --------------------------------------------------
      # Every step below runs even after a test failure (`always()`), but
      # only when the deploy step itself succeeded.
      - name: Switch to kind-cluster context to print logs
        if: always() && steps.deploy.outcome == 'success'
        run: kubectl config use-context kind-cluster

      - name: Print RayJob E2E Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          cat ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log

      - name: Print E2E Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          cat ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log

      - name: Print CodeFlare operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing CodeFlare operator logs"
          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log

      # The directory is passed as the literal text ${CODEFLARE_TEST_OUTPUT_DIR};
      # presumably the composite action expands it in its own shell - confirm.
      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
          if-no-files-found: warn