runtime/Dockerfile.cuda at develop · muxi-ai/runtime · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# MUXI Runtime - CUDA variant (EXPERIMENTAL)
# ============================================================================
#
# Status: experimental. The GPU path is only buildable on linux/amd64 with
# NVIDIA tooling (arm64 builds get a CPU-only fallback, see Platforms
# below). It is not exercised in the macOS dev loop and was not smoke-tested
# against live GPUs in CI at the time of the 0.20260422.x cut. Treat this
# variant as a preview -- the image builds, but the full runtime path (CUDA
# torch + faiss-gpu + onnxruntime-gpu inside a SIF) has not been end-to-end
# validated. Use the pytorch variant for CPU knowledge workloads until this
# variant graduates.
#
# Inherits from the lean runtime image and swaps in the GPU stack:
#   - PyTorch with CUDA 12.x (default PyPI wheels on linux/amd64)
#   - onellm[local-cuda,local-pytorch] -> onnxruntime-gpu, faiss-gpu-cu12,
#     transformers, numpy, sentence-transformers
#   - faissx-gpu replacing faissx (GPU-enabled client for MUXI's FAISSx;
#     drop-in API, swaps the local-mode FAISS backend to the CUDA build)
#
# When to use this variant:
#   - Host has NVIDIA GPU + CUDA 12.x driver
#   - Formations benefit from GPU-accelerated ONNX inference (large
#     embedding models, batch workloads) OR run PyTorch-only embedding
#     models that need GPU (e.g. Nomic v2 MoE on GPU)
#   - Vector memory is in local mode (faissx.client local backend) on
#     large working sets; remote-mode deployments get no benefit (the
#     GPU path is in-process) but nothing is broken either.
#
# Platforms: linux/amd64 (CUDA GPU support) and linux/arm64 (e.g. Jetson / Graviton).
# No macOS (no CUDA driver there). On arm64 the CUDA-specific libraries are not
# installed: the build keeps the CPU onnxruntime and faiss-cpu from the base image
# and installs CPU-only PyTorch wheels, so execution there is CPU-only.
#
# Size caveat: the CUDA stack (torch-cuda + onnxruntime-gpu + nvidia-*-cu12
# transitives + faiss-gpu-cu12) totals 4-6 GB. SIF builds exceed the 2 GB
# GitHub release upload ceiling. Distribution is via muxi-server's CDN
# (not GitHub) so this is not a release blocker.
#
# Selected by muxi-server when the formation declares
#   muxi_runtime: "<version>:cuda"
#
# ============================================================================

# Base image selection. Both ARGs are declared before FROM, so they are only
# visible to the FROM line below; override them with
#   --build-arg BASE_IMAGE=... --build-arg BASE_TAG=...
# NOTE(review): BASE_TAG defaults to `latest`, so a build that does not pass
# an explicit tag is not reproducible -- confirm release builds always set it.
ARG BASE_TAG=latest
ARG BASE_IMAGE=ghcr.io/muxi-ai/runtime
FROM ${BASE_IMAGE}:${BASE_TAG}

# Image metadata, declared in a single LABEL instruction.
LABEL maintainer="Ran Aroussi <[email protected]>" \
      description="MUXI Runtime - CUDA variant with GPU ONNX, GPU FAISS, and CUDA PyTorch" \
      muxi.runtime.variant="cuda"

# Steps 1+2: arch-aware GPU stack installation.
#
# onnxruntime-gpu and faiss-gpu-cu12 only publish amd64 wheels.
# On arm64 (Graviton / Jetson) we keep the CPU onnxruntime and faiss-cpu
# from the base image; PyTorch installs CPU-only wheels.
#
# amd64:
#   - uninstall CPU onnxruntime/faiss/faissx (conflict with GPU equivalents)
#   - torch+torchvision: default PyPI index (CUDA 12.x wheels)
#   - onellm[local-cuda,local-pytorch]: onnxruntime-gpu, faiss-gpu-cu12,
#     sentence-transformers, transformers, numpy
#   - faissx-gpu: drop-in for faissx client (uses faiss-gpu-cu12 locally)
#
# arm64:
#   - keep CPU onnxruntime + faiss-cpu (no GPU wheels on arm64)
#   - torch+torchvision: CPU-only wheels from PyTorch index
#   - onellm[local-pytorch]: sentence-transformers, transformers, numpy
#   - keep faissx CPU client
#
# Failure handling: in sh, `&&` and `||` have equal precedence and associate
# left-to-right, so a bare `... || true` inside the chain would also swallow
# a failed pip step and let this layer succeed. The tolerant `|| true` is
# therefore confined to a `{ ...; }` group around the __pycache__ sweep (the
# only command it is meant to cover); any pip failure aborts the build here.
#
# The amd64 onellm extras are local-cuda + local-pytorch, matching the list
# above; the Step 3 import check requires sentence_transformers on both
# architectures.
#
# Cleanup (*.pyc, __pycache__, pip/tmp caches) runs in the same layer as the
# installs so the removed files never persist in the image.
RUN ARCH="$(uname -m)" && \
    if [ "$ARCH" = "x86_64" ]; then \
        pip uninstall -y faiss-cpu faissx onnxruntime && \
        pip install --no-cache-dir torch torchvision && \
        pip install --no-cache-dir "onellm[local-cuda,local-pytorch]" && \
        pip install --no-cache-dir "faissx-gpu>=0.20260422.2"; \
    else \
        pip install --no-cache-dir torch torchvision \
            --index-url https://download.pytorch.org/whl/cpu && \
        pip install --no-cache-dir "onellm[local-pytorch]"; \
    fi && \
    find /usr/local -name "*.pyc" -delete && \
    { find /usr/local -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true; } && \
    rm -rf /root/.cache /tmp/*

# Step 3: import smoke test. No GPU is available at build time on either
# arch, so this only proves that the packages import.
#   1. Core check: torch, sentence_transformers and faissx.client must
#      import; a failure here fails the build.
#   2. onnxruntime probe: runs as a second interpreter invocation so the
#      core check always runs first. An ImportError is reported but
#      tolerated (the printed message treats a missing onnxruntime as
#      expected on arm64). The probe script is assembled line by line with
#      printf and read by python from stdin.
RUN python -c "import torch, sentence_transformers; from faissx import client; print(f'torch {torch.__version__} | st {sentence_transformers.__version__} | faissx OK')" && \
    printf '%s\n' \
        'try:' \
        '    import onnxruntime as ort' \
        "    print(f'ort {ort.__version__} providers={ort.get_available_providers()}')" \
        'except ImportError:' \
        "    print('onnxruntime not installed (expected on arm64)')" \
    | python -

# Inherited from the lean base (no override needed):
#   - HF_HOME=/opt/hf-cache
#   - HF_HUB_CACHE=/opt/hf-cache
#   - PYTHONPATH, PYTHONDONTWRITEBYTECODE, PYTHONUNBUFFERED, LC_ALL, LANG
#   - /opt/hf-cache stub directory
#   - docker-entrypoint.sh + ENTRYPOINT (incl. SIF-mode HF_HUB_OFFLINE + cache assertion)
#   - EXPOSE 8000, HEALTHCHECK
#   - Source at /app, packages at /usr/local