diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml
new file mode 100644
index 0000000..b13e9b9
--- /dev/null
+++ b/.github/workflows/examples-ci.yml
@@ -0,0 +1,218 @@
+name: Iris examples CI
+
+on:
+  pull_request:
+    branches: [ main ]
+  push:
+    branches: [ main ]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-iris:
+    runs-on: ubuntu-latest
+    env:
+      DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install doctl
+        uses: digitalocean/action-doctl@v2
+        with:
+          token: ${{ secrets.DEV_CLOUD_KEY }}
+
+      - name: Install jq
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y jq
+
+      - name: Create Droplet
+        id: create
+        uses: nick-fields/retry@v3
+        env:
+          DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }}
+        with:
+          timeout_minutes: 5
+          max_attempts: 10
+          retry_wait_seconds: 60
+          command: |
+            DROPLET_NAME="iris-$(date +%s)"
+
+            # Best-effort removal of this attempt's droplet so a failed
+            # attempt does not leave a billable machine behind before a retry.
+            cleanup_droplet() {
+              doctl compute droplet delete "$DROPLET_NAME" --force >/dev/null 2>&1 || true
+            }
+
+            # Create droplet and capture error output
+            DROPLET_JSON=$(doctl compute droplet create \
+              --image 188571990 \
+              --size "${{ secrets.DIGITALOCEAN_SIZE }}" \
+              --region atl1 \
+              --ssh-keys "${{ secrets.SSH_KEY_ID }}" \
+              "$DROPLET_NAME" \
+              -o json \
+              --wait 2>&1)
+
+            DROPLET_EXIT_CODE=$?
+
+            # Check if droplet creation was successful
+            if [ $DROPLET_EXIT_CODE -ne 0 ]; then
+              echo "$DROPLET_JSON"
+              cleanup_droplet
+              exit 1
+            fi
+
+            # Validate that we got valid JSON
+            if ! echo "$DROPLET_JSON" | jq . >/dev/null 2>&1; then
+              echo "$DROPLET_JSON"
+              cleanup_droplet
+              exit 1
+            fi
+
+            # Extract droplet ID and IP with error checking
+            DROPLET_ID=$(echo "$DROPLET_JSON" | jq -r '.[0].id // empty')
+            PUBLIC_IP=$(echo "$DROPLET_JSON" | jq -r '.[0].networks.v4[] | select(.type=="public") | .ip_address // empty')
+
+            if [ -z "$DROPLET_ID" ] || [ -z "$PUBLIC_IP" ]; then
+              echo "$DROPLET_JSON"
+              cleanup_droplet
+              exit 1
+            fi
+
+            # Set outputs for other steps
+            echo "droplet_id=$DROPLET_ID" >> "$GITHUB_OUTPUT"
+            echo "public_ip=$PUBLIC_IP" >> "$GITHUB_OUTPUT"
+
+            echo "✅ Droplet created successfully!"
+
+      - name: Setup SSH key
+        env:
+          SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
+        run: |
+          mkdir -p ~/.ssh
+          # Read the key from the environment instead of templating the
+          # secret into the script text.
+          printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
+          chmod 600 ~/.ssh/id_rsa
+          ssh-keyscan -H ${{ steps.create.outputs.public_ip }} >> ~/.ssh/known_hosts 2>/dev/null || true
+
+      - name: Wait for SSH to be ready
+        run: |
+          echo "⏳ Waiting for SSH to be ready..."
+          for i in {1..30}; do
+            if ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@${{ steps.create.outputs.public_ip }} "echo 'SSH ready'" 2>/dev/null; then
+              echo "✅ SSH is ready!"
+              exit 0
+            fi
+            echo "Attempt $i/30: SSH not ready yet, waiting 10 seconds..."
+            sleep 10
+          done
+
+          # Fail the job here rather than letting later steps hit an
+          # unreachable host.
+          echo "SSH did not become ready after 30 attempts" >&2
+          exit 1
+
+      - name: Determine commit hash
+        id: commit_hash
+        run: |
+          if [ "${{ github.event_name }}" == "pull_request" ]; then
+            echo "commit_hash=${{ github.event.pull_request.head.sha }}" >> $GITHUB_OUTPUT
+          else
+            echo "commit_hash=${{ github.sha }}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Install Iris and run tests
+        run: |
+          echo "🚀Iris installation..."
+
+          # Copy the git access key with scp so the secret is not embedded
+          # in the remote command line.
+          scp -o StrictHostKeyChecking=no ~/.ssh/id_rsa root@${{ steps.create.outputs.public_ip }}:.ssh/id_rsa
+
+          # Setup SSH, clone repo, and install dependencies
+          ssh -o StrictHostKeyChecking=no root@${{ steps.create.outputs.public_ip }} "
+            set -e
+
+            # Remove any stale dpkg locks
+            sudo rm -f /var/lib/apt/lists/lock
+            sudo rm -f /var/cache/apt/archives/lock
+            sudo rm -f /var/lib/dpkg/lock*
+
+            # Setup SSH key for git access
+            mkdir -p ~/.ssh
+            chmod 600 ~/.ssh/id_rsa
+            ssh-keyscan -H github.com >> ~/.ssh/known_hosts
+
+            # Set environment variables
+            export ROCM_PATH=/opt/rocm
+            export PATH=\$ROCM_PATH/bin:\$PATH
+            export LD_LIBRARY_PATH=\$ROCM_PATH/lib:\$LD_LIBRARY_PATH
+
+            # Install system dependencies
+            sudo apt-get update && sudo apt-get install -y python3-venv cmake openmpi-bin libopenmpi-dev
+
+            # Clone the repository
+            git clone git@github.com:ROCm/iris.git
+            cd iris
+            echo 'Checking out commit ${{ steps.commit_hash.outputs.commit_hash }}'
+            git checkout ${{ steps.commit_hash.outputs.commit_hash }}
+
+            # Setup Python environment
+            python3 -m venv iris_env
+            source iris_env/bin/activate
+
+            pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4
+            pip install -e '.[dev]'
+
+            # Create results directory
+            mkdir -p /iris_results
+
+            # Run pytest tests and keep a machine-readable report for the artifact
+            pytest tests/ -v --junitxml=/iris_results/pytest_results.xml
+          "
+
+      - name: Download test outputs
+        if: always()
+        run: |
+          echo "📥 Downloading test outputs..."
+          mkdir -p test_outputs
+          scp -r -o StrictHostKeyChecking=no root@${{ steps.create.outputs.public_ip }}:/iris_results/ ./test_outputs/ || echo "No results directory found"
+
+          # Create tar artifact
+          tar -czf iris_test_outputs.tar.gz -C test_outputs .
+          echo "✅ Test outputs archived as iris_test_outputs.tar.gz"
+
+          # Print test results summary
+          echo "📊 Iris Test Results Summary:"
+          echo "Pytest tests completed. Check the logs above for detailed results."
+
+      - name: Upload test outputs as artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: iris-test-outputs
+          path: iris_test_outputs.tar.gz
+          retention-days: 15
+
+      - name: Auto-destroy droplet after use
+        if: always()
+        env:
+          DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }}
+          DROPLET_ID: ${{ steps.create.outputs.droplet_id }}
+        run: |
+          # Nothing to clean up when the create step never produced an ID.
+          if [ -z "$DROPLET_ID" ]; then
+            echo "No droplet was created; nothing to destroy."
+            exit 0
+          fi
+
+          echo "🗑️ Auto-destroying droplet $DROPLET_ID..."
+          doctl compute droplet delete "$DROPLET_ID" --force
+          echo "✅ Droplet auto-destroyed successfully!"
diff --git a/.github/workflows/scripts/examples.sh b/.github/workflows/scripts/examples.sh
new file mode 100755
index 0000000..486d48e
--- /dev/null
+++ b/.github/workflows/scripts/examples.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Runs the Iris example benchmarks under mpirun and stores their JSON results.
+# Example paths are relative, so run this from the repository root. Results go
+# to /iris_results unless IRIS_RESULTS_DIR is set.
+set -euo pipefail
+
+export OMPI_ALLOW_RUN_AS_ROOT=1
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+
+RESULTS_DIR="${IRIS_RESULTS_DIR:-/iris_results}"
+
+# Run examples and store outputs
+echo 'Running Iris examples...'
+
+mkdir -p "$RESULTS_DIR"
+
+# Examples
+mpirun -np 8 python examples/00_load/load_bench.py -o "$RESULTS_DIR/load_bench.json"
+mpirun -np 8 python examples/01_store/store_bench.py -o "$RESULTS_DIR/store_bench.json"
+
+mpirun -np 8 python examples/02_all_load/all_load_bench.py -o "$RESULTS_DIR/all_load_bench.json"
+mpirun -np 8 python examples/03_all_store/all_store_bench.py -o "$RESULTS_DIR/all_store_bench.json"
+
+mpirun -np 8 python examples/04_atomic_add/atomic_add_bench.py -o "$RESULTS_DIR/atomic_add_bench.json"
+mpirun -np 8 python examples/05_atomic_xchg/atomic_xchg_bench.py -o "$RESULTS_DIR/atomic_xchg_bench.json"
+
+mpirun -np 2 python examples/06_message_passing/message_passing_load_store.py
+mpirun -np 2 python examples/06_message_passing/message_passing_put.py
+
+mpirun -np 8 python examples/07_gemm_all_scatter/benchmark.py --benchmark --validate -o "$RESULTS_DIR/gemm_all_scatter_bench.json"
+mpirun -np 8 python examples/08_gemm_atomics_all_reduce/benchmark.py --benchmark --validate -o "$RESULTS_DIR/gemm_atomics_all_reduce_bench.json"
+mpirun -np 8 python examples/09_gemm_one_shot_all_reduce/benchmark.py --benchmark --validate -o "$RESULTS_DIR/gemm_one_shot_all_reduce_bench.json"
diff --git a/docs/DEVCLOUD.md b/docs/DEVCLOUD.md
new file mode 100644
index 0000000..d5f3590
--- /dev/null
+++ b/docs/DEVCLOUD.md
@@ -0,0 +1,68 @@
+# AMD Developer Cloud Setup Guide
+
+This guide provides step-by-step instructions for setting up Iris on the AMD Developer Cloud environment.
+
+## Prerequisites
+
+Before starting, ensure you have access to the AMD Developer Cloud and have created a GPU Droplet.
+
+## Environment Setup
+
+### 1. Set ROCm Environment Variables
+
+First, set up the ROCm environment variables:
+
+```bash
+export ROCM_PATH=/opt/rocm
+export PATH=$ROCM_PATH/bin:$PATH
+export LD_LIBRARY_PATH=$ROCM_PATH/lib:$LD_LIBRARY_PATH
+```
+
+**Note**: You may want to add these to your shell profile (`.bashrc`, `.zshrc`, etc.) for persistence across sessions.
+
+### 2. Install System Dependencies
+
+Install the required system packages:
+
+```bash
+sudo apt-get update && sudo apt-get install -y python3-venv cmake openmpi-bin libopenmpi-dev
+```
+
+### 3. Create and Activate Virtual Environment
+
+Create a Python virtual environment to isolate Iris dependencies:
+
+```bash
+# Create virtual environment
+python3 -m venv iris_env
+
+# Activate virtual environment
+source iris_env/bin/activate
+```
+
+### 4. Install Python Dependencies
+
+Install the PyTorch nightly packages built for ROCm 6.4:
+
+```bash
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4
+```
+
+## Iris Installation
+
+### 1. Clone the Repository
+
+```bash
+git clone git@github.com:ROCm/iris.git
+cd iris
+```
+
+### 2. Install Iris
+
+Install Iris in development mode:
+
+```bash
+pip install -e .
+```
+
+Next, you can run the examples! See the [Examples README](../examples/README.md) for detailed information about available examples and how to run them.
diff --git a/examples/README.md b/examples/README.md
index e626558..0b6a8b9 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -42,7 +42,8 @@ mpirun -np 8 python examples/04_atomic_add/atomic_add_bench.py # Atomic add acr
 mpirun -np 8 python examples/05_atomic_xchg/atomic_xchg_bench.py # Atomic exchange across GPUs
 
 # Example command to run message passing
-python examples/06_message_passing/message_passing.py
+mpirun -np 2 python examples/06_message_passing/message_passing_load_store.py
+mpirun -np 2 python examples/06_message_passing/message_passing_put.py
 ```
 
 ### GEMM Operations