-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathDockerfile.cuda
More file actions
96 lines (91 loc) · 4.77 KB
/
Dockerfile.cuda
File metadata and controls
96 lines (91 loc) · 4.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# MUXI Runtime - CUDA variant (EXPERIMENTAL)
# ============================================================================
#
# Status: experimental. The GPU stack only installs on linux/amd64 with
# NVIDIA tooling (other architectures get the CPU fallback described under
# "Platforms" below); it is not exercised in the macOS dev loop and was not
# smoke-tested against live GPUs in CI at the time of the 0.20260422.x cut.
# Treat this variant as a preview -- the image builds, but the full runtime
# path (CUDA torch + faiss-gpu + onnxruntime-gpu inside a SIF) has not been
# end-to-end validated. Use the pytorch variant for CPU knowledge workloads
# until this variant graduates.
#
# Inherits from the lean runtime image and swaps in the GPU stack:
# - PyTorch with CUDA 12.x (default PyPI wheels on linux/amd64)
# - onellm[local-cuda,local-pytorch] -> onnxruntime-gpu, faiss-gpu-cu12,
# transformers, numpy, sentence-transformers
# - faissx-gpu replacing faissx (GPU-enabled client for MUXI's FAISSx;
# drop-in API, swaps the local-mode FAISS backend to the CUDA build)
#
# When to use this variant:
# - Host has NVIDIA GPU + CUDA 12.x driver
# - Formations benefit from GPU-accelerated ONNX inference (large
# embedding models, batch workloads) OR run PyTorch-only embedding
# models that need GPU (e.g. Nomic v2 MoE on GPU)
# - Vector memory is in local mode (faissx.client local backend) on
# large working sets; remote-mode deployments get no benefit (the
# GPU path is in-process) but nothing is broken either.
#
# Platforms: linux/amd64 (CUDA GPU support) and linux/arm64 (e.g. Jetson / Graviton).
# No macOS (no CUDA driver there). On arm64 the GPU wheels are not installed
# (onnxruntime-gpu and faiss-gpu-cu12 only publish amd64 wheels), so the image
# keeps the CPU onnxruntime/faiss from the base image plus CPU-only PyTorch.
#
# Size caveat: the CUDA stack (torch-cuda + onnxruntime-gpu + nvidia-*-cu12
# transitives + faiss-gpu-cu12) totals 4-6 GB. SIF builds exceed the 2 GB
# GitHub release upload ceiling. Distribution is via muxi-server's CDN
# (not GitHub) so this is not a release blocker.
#
# Selected by muxi-server when the formation declares
# muxi_runtime: "<version>:cuda"
#
# ============================================================================
# Base image coordinates. Both values can be overridden with --build-arg to
# layer the CUDA stack on top of a specific runtime build.
ARG BASE_IMAGE=ghcr.io/muxi-ai/runtime
ARG BASE_TAG=latest

FROM ${BASE_IMAGE}:${BASE_TAG}

# Image metadata, declared in a single LABEL instruction.
LABEL maintainer="Ran Aroussi <[email protected]>" \
      description="MUXI Runtime - CUDA variant with GPU ONNX, GPU FAISS, and CUDA PyTorch" \
      muxi.runtime.variant="cuda"
# Steps 1+2: arch-aware GPU stack installation.
#
# onnxruntime-gpu and faiss-gpu-cu12 only publish amd64 wheels.
# On arm64 (Graviton / Jetson) we keep the CPU onnxruntime and faiss-cpu
# from the base image; PyTorch installs CPU-only wheels.
#
# amd64:
# - uninstall CPU onnxruntime/faiss/faissx (conflict with GPU equivalents)
# - torch+torchvision: default PyPI index (CUDA 12.x wheels)
# - onellm[local-cuda,local-pytorch]: onnxruntime-gpu, faiss-gpu-cu12,
# sentence-transformers, transformers, numpy
# - faissx-gpu: drop-in for faissx client (uses faiss-gpu-cu12 locally)
#
# arm64:
# - keep CPU onnxruntime + faiss-cpu (no GPU wheels on arm64)
# - torch+torchvision: CPU-only wheels from PyTorch index
# - onellm[local-pytorch]: sentence-transformers, transformers, numpy
# - keep faissx CPU client
#
# Failure handling: every install step is chained with `&&`, so a failed pip
# command aborts the build. Only the __pycache__ sweep is best-effort; its
# `|| true` is confined to a brace group because `&&` and `||` share the same
# precedence in sh, and an unscoped `|| true` would mask any earlier failure
# in the chain (including the pip installs).
#
# The amd64 branch installs both onellm extras (local-cuda + local-pytorch),
# matching the package list documented above; the import check that follows
# requires sentence-transformers on every architecture.
RUN ARCH="$(uname -m)" && \
    if [ "$ARCH" = "x86_64" ]; then \
        pip uninstall -y faiss-cpu faissx onnxruntime && \
        pip install --no-cache-dir torch torchvision && \
        pip install --no-cache-dir "onellm[local-cuda,local-pytorch]" && \
        pip install --no-cache-dir "faissx-gpu>=0.20260422.2"; \
    else \
        pip install --no-cache-dir torch torchvision \
            --index-url https://download.pytorch.org/whl/cpu && \
        pip install --no-cache-dir "onellm[local-pytorch]"; \
    fi && \
    find /usr/local -name "*.pyc" -delete && \
    { find /usr/local -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true; } && \
    rm -rf /root/.cache /tmp/*
# Step 3: sanity-check imports (no GPU at build time on either arch).
#   - Core stack (torch, sentence-transformers, faissx client) must import on
#     every architecture; a failure aborts the build.
#   - x86_64: the install step uninstalls the CPU onnxruntime, so the GPU
#     build is the only ONNX runtime left. Its import is mandatory here so a
#     broken GPU install cannot slip through as "expected on arm64".
#   - other architectures: onnxruntime is treated as optional; a missing
#     module is reported but does not fail the build.
RUN python -c "import torch, sentence_transformers; from faissx import client; print(f'torch {torch.__version__} | st {sentence_transformers.__version__} | faissx OK')" \
    && if [ "$(uname -m)" = "x86_64" ]; then \
        python -c "import onnxruntime as ort; print(f'ort {ort.__version__} providers={ort.get_available_providers()}')"; \
    else \
        python -c "exec(\"try:\\n import onnxruntime as ort\\n print(f'ort {ort.__version__} providers={ort.get_available_providers()}')\\nexcept ImportError:\\n print('onnxruntime not installed (expected on arm64)')\")"; \
    fi
# Inherited from the lean base (no override needed):
# - HF_HOME=/opt/hf-cache
# - HF_HUB_CACHE=/opt/hf-cache
# - PYTHONPATH, PYTHONDONTWRITEBYTECODE, PYTHONUNBUFFERED, LC_ALL, LANG
# - /opt/hf-cache stub directory
# - docker-entrypoint.sh + ENTRYPOINT (incl. SIF-mode HF_HUB_OFFLINE + cache assertion)
# - EXPOSE 8000, HEALTHCHECK
# - Source at /app, packages at /usr/local