Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 199 additions & 0 deletions .github/workflows/build-inference.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# Build llama-server for every supported GGML backend.
#
# Backend matrix:
# cuda — NVIDIA CUDA, built inside Docker on ubuntu-latest, pushed to GHCR
# rocm — AMD ROCm/HIP, built inside Docker on ubuntu-latest, pushed to GHCR
# cpu — CPU-only, built inside Docker on ubuntu-latest, pushed to GHCR
# metal — Apple Metal, built natively on macos-latest (Metal GPU frameworks
# are unavailable inside Linux containers); binaries uploaded as
# workflow artifacts and attached to GitHub Releases.
#
# Images are tagged:
# ghcr.io/<owner>/atlas/llama-server:<branch>-<backend>
# ghcr.io/<owner>/atlas/llama-server:sha-<sha>-<backend>
# ghcr.io/<owner>/atlas/llama-server:<semver>-<backend> (on release)
#
# Trigger conditions:
# • push to main that touches inference/ or this file
# • any pull request that touches inference/ or this file (build only, no push)
# • GitHub Release published (build + push + attach Metal zip to release)
# • workflow_dispatch for ad-hoc builds

name: Build Inference Images

on:
  # Rebuild whenever inference code or this workflow definition changes.
  push:
    branches: [main]
    paths:
      - "inference/**"
      - ".github/workflows/build-inference.yml"
  # PRs build but never push (see the login/push conditions below).
  pull_request:
    paths:
      - "inference/**"
      - ".github/workflows/build-inference.yml"
  # Releases additionally tag images with the semver and attach Metal binaries.
  release:
    types: [published]
  workflow_dispatch:
    inputs:
      push_images:
        description: "Push images to GHCR (linux backends)"
        type: boolean
        default: false
      cuda_architectures:
        description: "CUDA architectures (semicolon-separated, e.g. 89-real;90-real;120-real)"
        type: string
        default: "89-real;90-real;120-real"

env:
  REGISTRY: ghcr.io
  # Image namespace: ghcr.io/<owner>/atlas/llama-server
  IMAGE_NAME: ${{ github.repository_owner }}/atlas/llama-server

jobs:
# ─────────────────────────────────────────────────────────────────────────
# Linux builds: CUDA / ROCm / CPU
# The Dockerfiles contain all compiler toolchains (nvcc, hipcc) so no GPU
# hardware is required on the runner itself — compilation happens inside
# the container image layers.
# ─────────────────────────────────────────────────────────────────────────
build-linux:
name: "${{ matrix.backend }} (ubuntu-latest)"
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
strategy:
fail-fast: false
matrix:
include:
- backend: cuda
# CUDA arch targets:
# 89-real = Ada Lovelace (RTX 4000, L40)
# 90-real = Hopper (H100, H200)
# 120-real = Blackwell (GB200, RTX 5000 series)
# Override via workflow_dispatch input to target a single GPU.
cuda_architectures: "89-real;90-real;120-real"
- backend: rocm
cuda_architectures: ""

steps:
- name: Checkout
uses: actions/checkout@v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Log in to GHCR
# Skip on PRs to avoid credential exposure for untrusted forks
if: >
github.event_name != 'pull_request' &&
(github.event_name != 'workflow_dispatch' || inputs.push_images)
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Resolve CUDA architectures
id: cuda_arch
run: |
# workflow_dispatch input overrides matrix default
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && \
[ -n "${{ inputs.cuda_architectures }}" ]; then
echo "value=${{ inputs.cuda_architectures }}" >> "$GITHUB_OUTPUT"
else
echo "value=${{ matrix.cuda_architectures }}" >> "$GITHUB_OUTPUT"
fi

- name: Docker metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
type=ref,event=branch,suffix=-${{ matrix.backend }}
type=ref,event=pr,suffix=-${{ matrix.backend }}
type=semver,pattern={{version}},suffix=-${{ matrix.backend }}
type=semver,pattern={{major}}.{{minor}},suffix=-${{ matrix.backend }}
type=sha,prefix=sha-,suffix=-${{ matrix.backend }}

- name: Build (and push) Docker image
uses: docker/build-push-action@v6
with:
context: ./inference
file: ./inference/Dockerfile.v31
build-args: |
GGML_BACKEND=${{ matrix.backend }}
CUDA_ARCHITECTURES=${{ steps.cuda_arch.outputs.value }}
push: >-
${{
github.event_name == 'push' ||
github.event_name == 'release' ||
(github.event_name == 'workflow_dispatch' && inputs.push_images)
}}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
# Layer cache scoped per backend so cuda/rocm/cpu don't share
cache-from: type=gha,scope=inference-${{ matrix.backend }}
cache-to: type=gha,scope=inference-${{ matrix.backend }},mode=max

# ─────────────────────────────────────────────────────────────────────────
# Metal build: native macOS
#
# Metal GPU frameworks (Metal.framework, MetalPerformanceShaders, etc.) are
# macOS-only and cannot be accessed from inside a Linux Docker container.
# The binary produced here runs directly on the host — no container needed.
#
# Outputs:
# • workflow artifact: llama-server-metal-macos-arm64
# • on release: zip attached to the GitHub Release
# ─────────────────────────────────────────────────────────────────────────
  build-metal:
    name: "metal (macos-latest)"
    runs-on: macos-latest
    permissions:
      contents: write  # needed to upload release assets

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install build dependencies
        run: brew install cmake

      - name: Clone llama.cpp
        # NOTE(review): clones upstream HEAD, not a pinned tag/commit — builds
        # are not reproducible and may break when upstream changes. Confirm
        # whether pinning is intended.
        run: |
          git clone --depth 1 https://github.com/ggml-org/llama.cpp /tmp/llama.cpp

      - name: Build with GGML_METAL=ON
        # Static-link (BUILD_SHARED_LIBS=OFF) so the uploaded binary has no
        # dylib dependencies beyond system frameworks.
        run: |
          cd /tmp/llama.cpp
          cmake -B build \
            -DGGML_METAL=ON \
            -DBUILD_SHARED_LIBS=OFF \
            -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release -j$(sysctl -n hw.logicalcpu)

      - name: Smoke-test binary
        # Fails the job early if the binary can't even start.
        run: /tmp/llama.cpp/build/bin/llama-server --version

      - name: Upload binaries as workflow artifact
        uses: actions/upload-artifact@v4
        with:
          name: llama-server-metal-macos-arm64
          path: |
            /tmp/llama.cpp/build/bin/llama-server
            /tmp/llama.cpp/build/bin/llama-cli
          if-no-files-found: error
          retention-days: 90

      - name: Attach binaries to GitHub Release
        if: github.event_name == 'release'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cd /tmp/llama.cpp/build/bin
          zip -j llama-server-metal-macos-arm64.zip llama-server llama-cli
          gh release upload "${{ github.ref_name }}" \
            llama-server-metal-macos-arm64.zip \
            --repo "${{ github.repository }}"
117 changes: 92 additions & 25 deletions benchmark/analysis/hardware_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,73 +39,140 @@ def run_command(cmd: str, default: str = "") -> str:
return default


def _get_nvidia_gpu_info(info: Dict[str, Any]) -> bool:
    """
    Populate *info* from nvidia-smi. Returns True if an NVIDIA GPU was found.

    Args:
        info: Mutable dict with "model", "vram_gb", "driver_version", and
            "power_draw_watts" keys; updated in place.

    Returns:
        True when nvidia-smi reported a GPU name, False otherwise.
    """
    # No nvidia-smi on PATH means no NVIDIA driver stack — bail out.
    if not run_command("which nvidia-smi"):
        return False

    # Get GPU name (first GPU only, on multi-GPU hosts)
    name = run_command("nvidia-smi --query-gpu=name --format=csv,noheader,nounits")
    if name:
        info["model"] = name.split('\n')[0].strip()

    # Get VRAM
    vram = run_command("nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits")
    if vram:
        try:
            # Convert MiB to GB
            info["vram_gb"] = float(vram.split('\n')[0].strip()) / 1024
        except ValueError:
            pass

    # Get driver version
    driver = run_command("nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits")
    if driver:
        info["driver_version"] = driver.split('\n')[0].strip()

    # Get current power draw
    power = run_command("nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits")
    if power:
        try:
            info["power_draw_watts"] = float(power.split('\n')[0].strip())
        except ValueError:
            pass

    return bool(info["model"])


def _get_rocm_gpu_info(info: Dict[str, Any]) -> bool:
    """
    Populate *info* from rocm-smi (AMD ROCm). Returns True if AMD GPU was found.

    Args:
        info: Mutable dict updated in place ("model", "vram_gb",
            "driver_version" keys).

    Returns:
        True when rocm-smi reported a product name, False otherwise.
    """
    # Absence of the rocm-smi CLI means no ROCm stack is installed.
    if not run_command("which rocm-smi"):
        return False

    # rocm-smi --showproductname prints lines like
    # "GPU[0] : Product Name: Radeon RX 7900 XTX"
    product = run_command("rocm-smi --showproductname --noheader 2>/dev/null | awk -F': ' '/Product Name/{print $NF; exit}'")
    if product:
        info["model"] = product.strip()

    # rocm-smi reports total VRAM in bytes; convert to GB.
    total_vram = run_command("rocm-smi --showmeminfo vram --noheader 2>/dev/null | awk '/Total Memory/{print $NF; exit}'")
    if total_vram:
        try:
            info["vram_gb"] = float(total_vram.strip()) / (1024 ** 3)
        except ValueError:
            pass

    drv = run_command("rocm-smi --showdriverversion --noheader 2>/dev/null | awk '{print $NF; exit}'")
    if drv:
        info["driver_version"] = drv.strip()

    return bool(info["model"])


def _get_metal_gpu_info(info: Dict[str, Any]) -> bool:
    """
    Populate *info* from system_profiler (Apple Metal / macOS).
    Returns True if a Metal GPU was found.

    Args:
        info: Mutable dict updated in place ("model", "vram_gb" keys).

    Returns:
        True when system_profiler reported a chipset model, False otherwise.
    """
    # system_profiler only exists on macOS; skip everywhere else.
    if platform.system() != "Darwin":
        return False

    profiler_output = run_command("system_profiler SPDisplaysDataType 2>/dev/null")
    if not profiler_output:
        return False

    chipset = re.search(r'Chipset Model:\s*(.+)', profiler_output)
    if chipset:
        info["model"] = chipset.group(1).strip()

    # VRAM line looks like "VRAM (Total): 8 GB" or "... MB"; normalize to GB.
    vram_match = re.search(r'VRAM \([^)]+\):\s*([\d.]+)\s*(MB|GB)', profiler_output, re.IGNORECASE)
    if vram_match:
        try:
            amount = float(vram_match.group(1))
            unit = vram_match.group(2).upper()
            info["vram_gb"] = amount / 1024 if unit == "MB" else amount
        except ValueError:
            pass

    return bool(info["model"])


def get_gpu_info() -> Dict[str, Any]:
    """
    Get GPU information, trying NVIDIA, AMD ROCm, and Apple Metal in order.

    Returns:
        Dictionary with GPU model, VRAM, driver version, and power draw.
        All fields stay at their zero/empty defaults when no GPU is detected.
    """
    info: Dict[str, Any] = {
        "model": "",
        "vram_gb": 0.0,
        "driver_version": "",
        "power_draw_watts": 0.0
    }

    # First probe that reports a GPU wins; later vendors are not consulted.
    for probe in (_get_nvidia_gpu_info, _get_rocm_gpu_info, _get_metal_gpu_info):
        if probe(info):
            break
    return info


def get_cuda_version() -> str:
    """
    Get the GPU accelerator version (CUDA, ROCm, or Metal).

    Returns:
        Version string for the active GPU accelerator, or empty string.
    """
    # CUDA — try nvcc first, then fall back to nvidia-smi's reported version
    nvcc_version = run_command("nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'")
    if nvcc_version:
        return nvcc_version

    nvidia_smi_output = run_command("nvidia-smi")
    match = re.search(r'CUDA Version:\s*(\d+\.\d+)', nvidia_smi_output)
    if match:
        return match.group(1)

    # ROCm — hipconfig
    rocm_version = run_command("hipconfig --version 2>/dev/null | head -1")
    if rocm_version:
        return rocm_version

    # Metal — macOS build version (Metal is always present on modern macOS)
    if platform.system() == "Darwin":
        macos_ver = run_command("sw_vers -productVersion 2>/dev/null")
        if macos_ver:
            return f"Metal (macOS {macos_ver.strip()})"

    return ""


Expand Down
24 changes: 24 additions & 0 deletions docker-compose.rocm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# AMD ROCm override for docker-compose.yml
#
# Usage:
# GGML_BACKEND=rocm docker compose -f docker-compose.yml -f docker-compose.rocm.yml up --build
#
# This override:
# - Sets GGML_BACKEND=rocm for the llama-server build
# - Replaces the NVIDIA deploy block with ROCm device mappings
# - Mounts /dev/kfd (ROCm kernel driver) and /dev/dri (GPU render nodes)

services:
  llama-server:
    build:
      args:
        GGML_BACKEND: rocm
    # remove the nvidia deploy block from the base file
    # NOTE(review): the !reset tag is a Compose override feature — confirm the
    # deployed docker compose version supports it.
    deploy: !reset {}
    devices:
      # /dev/kfd is the ROCm kernel driver; /dev/dri holds the GPU render nodes
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    group_add:
      # container user needs these groups to open the device nodes
      - video
      - render
    environment:
      # Optional gfx override (e.g. for officially-unsupported cards);
      # defaults to empty, which leaves the runtime's own detection in place.
      - HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION:-}
Loading
Loading