Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 199 additions & 0 deletions .github/workflows/build-inference.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# Build llama-server for every supported GGML backend.
#
# Backend matrix:
# cuda — NVIDIA CUDA, built inside Docker on ubuntu-latest, pushed to GHCR
# rocm — AMD ROCm/HIP, built inside Docker on ubuntu-latest, pushed to GHCR
# cpu — CPU-only, built inside Docker on ubuntu-latest, pushed to GHCR
# metal — Apple Metal, built natively on macos-latest (Metal GPU frameworks
# are unavailable inside Linux containers); binaries uploaded as
# workflow artifacts and attached to GitHub Releases.
#
# Images are tagged:
# ghcr.io/<owner>/atlas/llama-server:<branch>-<backend>
# ghcr.io/<owner>/atlas/llama-server:sha-<sha>-<backend>
# ghcr.io/<owner>/atlas/llama-server:<semver>-<backend> (on release)
#
# Trigger conditions:
# • push to main that touches inference/ or this file
# • any pull request that touches inference/ or this file (build only, no push)
# • GitHub Release published (build + push + attach Metal zip to release)
# • workflow_dispatch for ad-hoc builds

name: Build Inference Images

on:
  # Rebuild whenever inference code or this workflow definition changes.
  push:
    branches: [main]
    paths:
      - "inference/**"
      - ".github/workflows/build-inference.yml"
  # PRs build but never push (see the login/push conditions below).
  pull_request:
    paths:
      - "inference/**"
      - ".github/workflows/build-inference.yml"
  # Releases additionally tag images with the semver and attach Metal binaries.
  release:
    types: [published]
  workflow_dispatch:
    inputs:
      push_images:
        description: "Push images to GHCR (linux backends)"
        type: boolean
        default: false
      cuda_architectures:
        description: "CUDA architectures (semicolon-separated, e.g. 89-real;90-real;120-real)"
        type: string
        default: "89-real;90-real;120-real"

env:
  REGISTRY: ghcr.io
  # Image namespace: ghcr.io/<owner>/atlas/llama-server
  IMAGE_NAME: ${{ github.repository_owner }}/atlas/llama-server

jobs:
# ─────────────────────────────────────────────────────────────────────────
# Linux builds: CUDA / ROCm / CPU
# The Dockerfiles contain all compiler toolchains (nvcc, hipcc) so no GPU
# hardware is required on the runner itself — compilation happens inside
# the container image layers.
# ─────────────────────────────────────────────────────────────────────────
build-linux:
name: "${{ matrix.backend }} (ubuntu-latest)"
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
strategy:
fail-fast: false
matrix:
include:
- backend: cuda
# CUDA arch targets:
# 89-real = Ada Lovelace (RTX 4000, L40)
# 90-real = Hopper (H100, H200)
# 120-real = Blackwell (GB200, RTX 5000 series)
# Override via workflow_dispatch input to target a single GPU.
cuda_architectures: "89-real;90-real;120-real"
- backend: rocm
cuda_architectures: ""

steps:
- name: Checkout
uses: actions/checkout@v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Log in to GHCR
# Skip on PRs to avoid credential exposure for untrusted forks
if: >
github.event_name != 'pull_request' &&
(github.event_name != 'workflow_dispatch' || inputs.push_images)
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Resolve CUDA architectures
id: cuda_arch
run: |
# workflow_dispatch input overrides matrix default
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && \
[ -n "${{ inputs.cuda_architectures }}" ]; then
echo "value=${{ inputs.cuda_architectures }}" >> "$GITHUB_OUTPUT"
else
echo "value=${{ matrix.cuda_architectures }}" >> "$GITHUB_OUTPUT"
fi

- name: Docker metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
type=ref,event=branch,suffix=-${{ matrix.backend }}
type=ref,event=pr,suffix=-${{ matrix.backend }}
type=semver,pattern={{version}},suffix=-${{ matrix.backend }}
type=semver,pattern={{major}}.{{minor}},suffix=-${{ matrix.backend }}
type=sha,prefix=sha-,suffix=-${{ matrix.backend }}

- name: Build (and push) Docker image
uses: docker/build-push-action@v6
with:
context: ./inference
file: ./inference/Dockerfile.v31
build-args: |
GGML_BACKEND=${{ matrix.backend }}
CUDA_ARCHITECTURES=${{ steps.cuda_arch.outputs.value }}
push: >-
${{
github.event_name == 'push' ||
github.event_name == 'release' ||
(github.event_name == 'workflow_dispatch' && inputs.push_images)
}}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
# Layer cache scoped per backend so cuda/rocm/cpu don't share
cache-from: type=gha,scope=inference-${{ matrix.backend }}
cache-to: type=gha,scope=inference-${{ matrix.backend }},mode=max

# ─────────────────────────────────────────────────────────────────────────
# Metal build: native macOS
#
# Metal GPU frameworks (Metal.framework, MetalPerformanceShaders, etc.) are
# macOS-only and cannot be accessed from inside a Linux Docker container.
# The binary produced here runs directly on the host — no container needed.
#
# Outputs:
# • workflow artifact: llama-server-metal-macos-arm64
# • on release: zip attached to the GitHub Release
# ─────────────────────────────────────────────────────────────────────────
  build-metal:
    name: "metal (macos-latest)"
    runs-on: macos-latest
    permissions:
      contents: write  # needed to upload release assets

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install build dependencies
        run: brew install cmake

      - name: Clone llama.cpp
        # NOTE(review): clones upstream HEAD, not a pinned tag/commit — builds
        # are not reproducible and may break when upstream changes. Confirm
        # whether pinning is intended.
        run: |
          git clone --depth 1 https://github.com/ggml-org/llama.cpp /tmp/llama.cpp

      - name: Build with GGML_METAL=ON
        # Static-link (BUILD_SHARED_LIBS=OFF) so the uploaded binary has no
        # dylib dependencies beyond system frameworks.
        run: |
          cd /tmp/llama.cpp
          cmake -B build \
            -DGGML_METAL=ON \
            -DBUILD_SHARED_LIBS=OFF \
            -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release -j$(sysctl -n hw.logicalcpu)

      - name: Smoke-test binary
        # Fails the job early if the binary can't even start.
        run: /tmp/llama.cpp/build/bin/llama-server --version

      - name: Upload binaries as workflow artifact
        uses: actions/upload-artifact@v4
        with:
          name: llama-server-metal-macos-arm64
          path: |
            /tmp/llama.cpp/build/bin/llama-server
            /tmp/llama.cpp/build/bin/llama-cli
          if-no-files-found: error
          retention-days: 90

      - name: Attach binaries to GitHub Release
        if: github.event_name == 'release'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cd /tmp/llama.cpp/build/bin
          zip -j llama-server-metal-macos-arm64.zip llama-server llama-cli
          gh release upload "${{ github.ref_name }}" \
            llama-server-metal-macos-arm64.zip \
            --repo "${{ github.repository }}"
117 changes: 92 additions & 25 deletions benchmark/analysis/hardware_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,73 +39,140 @@ def run_command(cmd: str, default: str = "") -> str:
return default


def _get_nvidia_gpu_info(info: Dict[str, Any]) -> bool:
    """
    Populate *info* from nvidia-smi. Returns True if an NVIDIA GPU was found.

    Args:
        info: Mutable dict with "model", "vram_gb", "driver_version", and
            "power_draw_watts" keys; updated in place.

    Returns:
        True when nvidia-smi reported a GPU name, False otherwise.
    """
    # No nvidia-smi on PATH means no NVIDIA driver stack — bail out.
    if not run_command("which nvidia-smi"):
        return False

    # Get GPU name (first GPU only, on multi-GPU hosts)
    name = run_command("nvidia-smi --query-gpu=name --format=csv,noheader,nounits")
    if name:
        info["model"] = name.split('\n')[0].strip()

    # Get VRAM
    vram = run_command("nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits")
    if vram:
        try:
            # Convert MiB to GB
            info["vram_gb"] = float(vram.split('\n')[0].strip()) / 1024
        except ValueError:
            pass

    # Get driver version
    driver = run_command("nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits")
    if driver:
        info["driver_version"] = driver.split('\n')[0].strip()

    # Get current power draw
    power = run_command("nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits")
    if power:
        try:
            info["power_draw_watts"] = float(power.split('\n')[0].strip())
        except ValueError:
            pass

    return bool(info["model"])


def _get_rocm_gpu_info(info: Dict[str, Any]) -> bool:
    """
    Populate *info* from rocm-smi (AMD ROCm). Returns True if AMD GPU was found.

    Args:
        info: Mutable dict updated in place ("model", "vram_gb",
            "driver_version" keys).

    Returns:
        True when rocm-smi reported a product name, False otherwise.
    """
    # Absence of the rocm-smi CLI means no ROCm stack is installed.
    if not run_command("which rocm-smi"):
        return False

    # rocm-smi --showproductname prints lines like
    # "GPU[0] : Product Name: Radeon RX 7900 XTX"
    product = run_command("rocm-smi --showproductname --noheader 2>/dev/null | awk -F': ' '/Product Name/{print $NF; exit}'")
    if product:
        info["model"] = product.strip()

    # rocm-smi reports total VRAM in bytes; convert to GB.
    total_vram = run_command("rocm-smi --showmeminfo vram --noheader 2>/dev/null | awk '/Total Memory/{print $NF; exit}'")
    if total_vram:
        try:
            info["vram_gb"] = float(total_vram.strip()) / (1024 ** 3)
        except ValueError:
            pass

    drv = run_command("rocm-smi --showdriverversion --noheader 2>/dev/null | awk '{print $NF; exit}'")
    if drv:
        info["driver_version"] = drv.strip()

    return bool(info["model"])


def _get_metal_gpu_info(info: Dict[str, Any]) -> bool:
    """
    Populate *info* from system_profiler (Apple Metal / macOS).
    Returns True if a Metal GPU was found.

    Args:
        info: Mutable dict updated in place ("model", "vram_gb" keys).

    Returns:
        True when system_profiler reported a chipset model, False otherwise.
    """
    # system_profiler only exists on macOS; skip everywhere else.
    if platform.system() != "Darwin":
        return False

    profiler_output = run_command("system_profiler SPDisplaysDataType 2>/dev/null")
    if not profiler_output:
        return False

    chipset = re.search(r'Chipset Model:\s*(.+)', profiler_output)
    if chipset:
        info["model"] = chipset.group(1).strip()

    # VRAM line looks like "VRAM (Total): 8 GB" or "... MB"; normalize to GB.
    vram_match = re.search(r'VRAM \([^)]+\):\s*([\d.]+)\s*(MB|GB)', profiler_output, re.IGNORECASE)
    if vram_match:
        try:
            amount = float(vram_match.group(1))
            unit = vram_match.group(2).upper()
            info["vram_gb"] = amount / 1024 if unit == "MB" else amount
        except ValueError:
            pass

    return bool(info["model"])


def get_gpu_info() -> Dict[str, Any]:
    """
    Get GPU information, trying NVIDIA, AMD ROCm, and Apple Metal in order.

    Returns:
        Dictionary with GPU model, VRAM, driver version, and power draw.
        All fields stay at their zero/empty defaults when no GPU is detected.
    """
    info: Dict[str, Any] = {
        "model": "",
        "vram_gb": 0.0,
        "driver_version": "",
        "power_draw_watts": 0.0
    }

    # First probe that reports a GPU wins; later vendors are not consulted.
    for probe in (_get_nvidia_gpu_info, _get_rocm_gpu_info, _get_metal_gpu_info):
        if probe(info):
            break
    return info


def get_cuda_version() -> str:
    """
    Get the GPU accelerator version (CUDA, ROCm, or Metal).

    Returns:
        Version string for the active GPU accelerator, or empty string.
    """
    # CUDA — try nvcc first, then fall back to nvidia-smi's reported version
    nvcc_version = run_command("nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'")
    if nvcc_version:
        return nvcc_version

    nvidia_smi_output = run_command("nvidia-smi")
    match = re.search(r'CUDA Version:\s*(\d+\.\d+)', nvidia_smi_output)
    if match:
        return match.group(1)

    # ROCm — hipconfig
    rocm_version = run_command("hipconfig --version 2>/dev/null | head -1")
    if rocm_version:
        return rocm_version

    # Metal — macOS build version (Metal is always present on modern macOS)
    if platform.system() == "Darwin":
        macos_ver = run_command("sw_vers -productVersion 2>/dev/null")
        if macos_ver:
            return f"Metal (macOS {macos_ver.strip()})"

    return ""


Expand Down
24 changes: 24 additions & 0 deletions docker-compose.rocm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# AMD ROCm override for docker-compose.yml
#
# Usage:
# GGML_BACKEND=rocm docker compose -f docker-compose.yml -f docker-compose.rocm.yml up --build
#
# This override:
# - Sets GGML_BACKEND=rocm for the llama-server build
# - Replaces the NVIDIA deploy block with ROCm device mappings
# - Mounts /dev/kfd (ROCm kernel driver) and /dev/dri (GPU render nodes)

services:
  llama-server:
    build:
      args:
        GGML_BACKEND: rocm
    # remove the nvidia deploy block from the base file
    # NOTE(review): the !reset tag is a Compose override feature — confirm the
    # deployed docker compose version supports it.
    deploy: !reset {}
    devices:
      # /dev/kfd is the ROCm kernel driver; /dev/dri holds the GPU render nodes
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    group_add:
      # container user needs these groups to open the device nodes
      - video
      - render
    environment:
      # Optional gfx override (e.g. for officially-unsupported cards);
      # defaults to empty, which leaves the runtime's own detection in place.
      - HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION:-}
Loading
Loading