From db7a111d85b29d41462b349c399f59b95e9151f0 Mon Sep 17 00:00:00 2001
From: Daniele Salvatore Albano
Date: Mon, 6 Oct 2025 14:44:52 +0200
Subject: [PATCH 1/9] Add support for sm120 (Blackwell / NVIDIA RTX 5090 GPU
 support)

---
 Dockerfile-cuda                    | 5 ++++-
 Dockerfile-cuda-all                | 2 +-
 backends/candle/src/compute_cap.rs | 1 +
 backends/candle/src/flash_attn.rs  | 2 +-
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/Dockerfile-cuda b/Dockerfile-cuda
index 537ad59f..bba1c1b5 100644
--- a/Dockerfile-cuda
+++ b/Dockerfile-cuda
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder
+FROM nvidia/cuda:12.9.0-devel-ubuntu22.04 AS base-builder
 
 ENV SCCACHE=0.10.0
 ENV RUSTC_WRAPPER=/usr/local/bin/sccache
@@ -58,6 +58,9 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     elif [ ${CUDA_COMPUTE_CAP} -eq 90 ]; \
     then \
         nvprune --generate-code code=sm_90 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
+    elif [ ${CUDA_COMPUTE_CAP} -eq 120 ]; \
+    then \
+        nvprune --generate-code code=sm_120 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
     else \
         echo "cuda compute cap ${CUDA_COMPUTE_CAP} is not supported"; exit 1; \
     fi;

diff --git a/Dockerfile-cuda-all b/Dockerfile-cuda-all
index 5dca432a..47298b71 100644
--- a/Dockerfile-cuda-all
+++ b/Dockerfile-cuda-all
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder
+FROM nvidia/cuda:12.9.0-devel-ubuntu22.04 AS base-builder
 
 ENV SCCACHE=0.10.0
 ENV RUSTC_WRAPPER=/usr/local/bin/sccache

diff --git a/backends/candle/src/compute_cap.rs b/backends/candle/src/compute_cap.rs
index ac79fcf1..c4a7802a 100644
--- a/backends/candle/src/compute_cap.rs
+++ b/backends/candle/src/compute_cap.rs
@@ -30,6 +30,7 @@ fn compute_cap_matching(runtime_compute_cap: usize, compile_compute_cap: usize)
         (86..=89, 80..=86) => true,
         (89, 89) => true,
         (90, 90) => true,
+        (120, 120) => true,
         (_, _) => false,
     }
 }

diff --git a/backends/candle/src/flash_attn.rs b/backends/candle/src/flash_attn.rs
index 8dbe58cf..f1b69c72 100644
--- a/backends/candle/src/flash_attn.rs
+++ b/backends/candle/src/flash_attn.rs
@@ -61,7 +61,7 @@ pub(crate) fn flash_attn_varlen(
         }
         #[cfg(not(feature = "flash-attn-v1"))]
         candle::bail!("Flash attention v1 is not installed. Use `flash-attn-v1` feature.")
-    } else if (80..90).contains(&runtime_compute_cap) || runtime_compute_cap == 90 {
+    } else if (80..90).contains(&runtime_compute_cap) || runtime_compute_cap == 90 || runtime_compute_cap == 120 {
         #[cfg(feature = "flash-attn")]
         {
             use candle_flash_attn::{flash_attn_varlen_alibi_windowed, flash_attn_varlen_windowed};

From 6c78e75bfb5ef4503f3b49e0885f3174644da489 Mon Sep 17 00:00:00 2001
From: Daniele Salvatore Albano
Date: Mon, 6 Oct 2025 14:48:25 +0200
Subject: [PATCH 2/9] Update docs

---
 README.md                          | 3 +++
 docs/source/en/custom_container.md | 1 +
 2 files changed, 4 insertions(+)

diff --git a/README.md b/README.md
index e9e98d0b..eeb55ed7 100644
--- a/README.md
+++ b/README.md
@@ -581,6 +581,9 @@ runtime_compute_cap=89
 # Example for H100
 runtime_compute_cap=90
 
+# Example for Nvidia GTX 5090
+runtime_compute_cap=120
+
 docker build .
-f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
 ```

diff --git a/docs/source/en/custom_container.md b/docs/source/en/custom_container.md
index c670026c..8b85262e 100644
--- a/docs/source/en/custom_container.md
+++ b/docs/source/en/custom_container.md
@@ -32,6 +32,7 @@ the examples of runtime compute capabilities for various GPU types:
 - A10 - `runtime_compute_cap=86`
 - Ada Lovelace (RTX 4000 series, ...) - `runtime_compute_cap=89`
 - H100 - `runtime_compute_cap=90`
+- Blackwell (RTX 5000 series, ...) - `runtime_compute_cap=120`
 
 Once you have determined the compute capability, set it as the `runtime_compute_cap` variable and build
 the container as shown in the example below:

From f4eeffd4a4e05fde33abf61801ecba959a66079f Mon Sep 17 00:00:00 2001
From: Daniele Salvatore Albano
Date: Mon, 6 Oct 2025 14:48:36 +0200
Subject: [PATCH 3/9] Update compute cap test

---
 backends/candle/src/compute_cap.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backends/candle/src/compute_cap.rs b/backends/candle/src/compute_cap.rs
index c4a7802a..56978f5d 100644
--- a/backends/candle/src/compute_cap.rs
+++ b/backends/candle/src/compute_cap.rs
@@ -55,6 +55,7 @@ mod tests {
         assert!(compute_cap_matching(86, 86));
         assert!(compute_cap_matching(89, 89));
         assert!(compute_cap_matching(90, 90));
+        assert!(compute_cap_matching(120, 120));
 
         assert!(compute_cap_matching(86, 80));
         assert!(compute_cap_matching(89, 80));

From d79baf5c07b7ea5ac716fb1efc43c9f44eeeb4e3 Mon Sep 17 00:00:00 2001
From: Daniele Salvatore Albano
Date: Mon, 6 Oct 2025 14:49:23 +0200
Subject: [PATCH 4/9] Ensure that the runtime compute cap 120 doc string is
 uniform with the rest of the related information

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index eeb55ed7..71c49156 100644
--- a/README.md
+++ b/README.md
@@ -581,7 +581,7 @@ runtime_compute_cap=89
 # Example for H100
 runtime_compute_cap=90
 
-# Example for Nvidia GTX 5090
+# Example for Blackwell (RTX 5000 series, ...)
 runtime_compute_cap=120
 
 docker build .
-f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap From 8f4b6c8170d285949a83613c04cf42de2c30688c Mon Sep 17 00:00:00 2001 From: Daniele Salvatore Albano Date: Mon, 6 Oct 2025 15:00:45 +0200 Subject: [PATCH 5/9] Add support for sm120 in Dockerfile-cuda-all --- Dockerfile-cuda-all | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/Dockerfile-cuda-all b/Dockerfile-cuda-all index 47298b71..d3908112 100644 --- a/Dockerfile-cuda-all +++ b/Dockerfile-cuda-all @@ -85,6 +85,15 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \ fi; +RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ + --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ + if [ $VERTEX = "true" ]; \ + then \ + CUDA_COMPUTE_CAP=120 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \ + else \ + CUDA_COMPUTE_CAP=120 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \ + fi; + COPY backends backends COPY core core COPY router router @@ -122,9 +131,18 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \ fi; +RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ + --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ + if [ $VERTEX = "true" ]; \ + then \ + CUDA_COMPUTE_CAP=120 cargo build --release --bin text-embeddings-router -F candle-cuda -F google && sccache -s; \ + else \ + CUDA_COMPUTE_CAP=120 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \ + fi; + RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-90 -FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 AS base +FROM nvidia/cuda:12.9.0-runtime-ubuntu22.04 AS base ARG DEFAULT_USE_FLASH_ATTENTION=True From a6f1bad07613f7a14da76b61fa0c2f5059823a33 Mon Sep 17 00:00:00 2001 From: Daniele Salvatore Albano Date: Mon, 13 Oct 2025 10:44:56 +0200 Subject: [PATCH 6/9] Revert changes to Dockerfile-cuda and Dockerfile-cuda-all --- Dockerfile-cuda | 5 +---- Dockerfile-cuda-all | 22 ++-------------------- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/Dockerfile-cuda b/Dockerfile-cuda index bba1c1b5..537ad59f 100644 --- a/Dockerfile-cuda +++ b/Dockerfile-cuda @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.9.0-devel-ubuntu22.04 AS base-builder +FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder ENV SCCACHE=0.10.0 ENV RUSTC_WRAPPER=/usr/local/bin/sccache @@ -58,9 +58,6 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ elif [ ${CUDA_COMPUTE_CAP} -eq 90 ]; \ then \ nvprune --generate-code code=sm_90 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \ - elif [ ${CUDA_COMPUTE_CAP} -eq 120 ]; \ - then \ - nvprune --generate-code code=sm_120 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \ else \ echo "cuda compute cap ${CUDA_COMPUTE_CAP} is not supported"; exit 1; \ fi; diff --git a/Dockerfile-cuda-all b/Dockerfile-cuda-all index d3908112..5dca432a 100644 --- a/Dockerfile-cuda-all +++ b/Dockerfile-cuda-all @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.9.0-devel-ubuntu22.04 AS base-builder +FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS 
base-builder
 
 ENV SCCACHE=0.10.0
 ENV RUSTC_WRAPPER=/usr/local/bin/sccache
@@ -85,15 +85,6 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
         CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \
     fi;
 
-RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
-    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-        CUDA_COMPUTE_CAP=120 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \
-    else \
-        CUDA_COMPUTE_CAP=120 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \
-    fi;
-
 COPY backends backends
 COPY core core
 COPY router router
@@ -131,18 +122,9 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
         CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \
     fi;
 
-RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
-    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-        CUDA_COMPUTE_CAP=120 cargo build --release --bin text-embeddings-router -F candle-cuda -F google && sccache -s; \
-    else \
-        CUDA_COMPUTE_CAP=120 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \
-    fi;
-
 RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-90
 
-FROM nvidia/cuda:12.9.0-runtime-ubuntu22.04 AS base
+FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 AS base
 
 ARG DEFAULT_USE_FLASH_ATTENTION=True

From 32c60396d803173530f74edb1e805132459bab85 Mon Sep 17 00:00:00 2001
From: Daniele Salvatore Albano
Date: Mon, 13 Oct 2025 10:45:46 +0200
Subject: [PATCH 7/9] Add a Dockerfile to build the CUDA Blackwell support

---
 Dockerfile-cuda-blackwell | 145 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 Dockerfile-cuda-blackwell

diff --git a/Dockerfile-cuda-blackwell b/Dockerfile-cuda-blackwell
new file mode 100644
index 00000000..8614df59
--- /dev/null
+++ b/Dockerfile-cuda-blackwell
@@ -0,0 +1,145 @@
+FROM nvidia/cuda:12.9.0-devel-ubuntu22.04 AS base-builder
+
+ENV SCCACHE=0.10.0
+ENV RUSTC_WRAPPER=/usr/local/bin/sccache
+ENV PATH="/root/.cargo/bin:${PATH}"
+# aligned with `cargo-chef` version in `lukemathwalker/cargo-chef:latest-rust-1.85-bookworm`
+ENV CARGO_CHEF=0.1.71
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    curl \
+    libssl-dev \
+    pkg-config \
+    && rm -rf /var/lib/apt/lists/*
+
+# Download and configure sccache
+RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
+    chmod +x /usr/local/bin/sccache
+
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
+RUN cargo install cargo-chef --version $CARGO_CHEF --locked
+
+FROM base-builder AS planner
+
+WORKDIR /usr/src
+
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM base-builder AS builder
+
+ARG CUDA_COMPUTE_CAP=80
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+# Limit parallelism
+ARG RAYON_NUM_THREADS
+ARG CARGO_BUILD_JOBS
+ARG CARGO_BUILD_INCREMENTAL
+
+# sccache specific variables
+ARG SCCACHE_GHA_ENABLED
+
+WORKDIR /usr/src
+
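+# Prune the static cuBLAS library to the architecture requested at build time:
+# nvprune keeps only the matching device code (sm_120 for Blackwell GPUs such
+# as the RTX 5090), which keeps the statically linked router binary small.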
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ + --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ + if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \ + then \ + nvprune --generate-code code=sm_${CUDA_COMPUTE_CAP} /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \ + elif [ ${CUDA_COMPUTE_CAP} -ge 80 -a ${CUDA_COMPUTE_CAP} -lt 90 ]; \ + then \ + nvprune --generate-code code=sm_80 --generate-code code=sm_${CUDA_COMPUTE_CAP} /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \ + elif [ ${CUDA_COMPUTE_CAP} -eq 90 ]; \ + then \ + nvprune --generate-code code=sm_90 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \ + elif [ ${CUDA_COMPUTE_CAP} -eq 120 ]; \ + then \ + nvprune --generate-code code=sm_120 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \ + else \ + echo "cuda compute cap ${CUDA_COMPUTE_CAP} is not supported"; exit 1; \ + fi; + +COPY --from=planner /usr/src/recipe.json recipe.json + +RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ + --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ + if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \ + then \ + cargo chef cook --release --features candle-cuda-turing --features static-linking --no-default-features --recipe-path recipe.json && sccache -s; \ + else \ + cargo chef cook --release --features candle-cuda --features static-linking --no-default-features --recipe-path recipe.json && sccache -s; \ + fi; + +COPY backends backends +COPY core core +COPY router router +COPY Cargo.toml ./ +COPY Cargo.lock ./ + +FROM builder AS http-builder + +RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ + --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ + if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \ + then \ + cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F static-linking -F http --no-default-features && sccache -s; \ + else \ + cargo build --release --bin text-embeddings-router -F candle-cuda -F static-linking -F http --no-default-features && sccache -s; \ + fi; + +FROM builder AS grpc-builder + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ + curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ + unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ + unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \ + rm -f $PROTOC_ZIP + +COPY proto proto + +RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ + --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ + if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \ + then \ + cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F static-linking -F grpc --no-default-features && sccache -s; \ + else \ + cargo build --release --bin text-embeddings-router -F candle-cuda -F static-linking -F grpc --no-default-features && sccache -s; \ + fi; + +FROM nvidia/cuda:12.9.0-base-ubuntu22.04 AS base + +ARG DEFAULT_USE_FLASH_ATTENTION=True + +ENV HUGGINGFACE_HUB_CACHE=/data \ + PORT=80 \ + USE_FLASH_ATTENTION=$DEFAULT_USE_FLASH_ATTENTION + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install 
-y --no-install-recommends \
+    ca-certificates \
+    libssl-dev \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+FROM base AS grpc
+
+COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
+
+ENTRYPOINT ["text-embeddings-router"]
+CMD ["--json-output"]
+
+FROM base
+
+COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
+
+ENTRYPOINT ["text-embeddings-router"]
+CMD ["--json-output"]

From 448531408388e3e23abc66c26b56e9a0a084eb2e Mon Sep 17 00:00:00 2001
From: Daniele Salvatore Albano
Date: Mon, 13 Oct 2025 10:51:47 +0200
Subject: [PATCH 8/9] Update README.md

---
 README.md | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 71c49156..13a0d2dc 100644
--- a/README.md
+++ b/README.md
@@ -557,6 +557,8 @@ You can build the CPU container with:
 docker build .
 ```
 
+### CUDA - Pre-Blackwell architectures
+
 To build the CUDA containers, you need to know the compute cap of the GPU you
 will be using at runtime.
 
@@ -581,10 +583,40 @@ runtime_compute_cap=89
 # Example for H100
 runtime_compute_cap=90
 
+docker build . -f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
+```
+
+### CUDA - Blackwell architecture
+
+To build the CUDA containers for the Blackwell architecture, CUDA 12.9 is required: use the dedicated `Dockerfile-cuda-blackwell`
+and set the compute cap to 120.
+This Dockerfile can still be used to build for previous architectures.
+
+Commands to build the container:
+
+```shell
+# Get submodule dependencies
+git submodule update --init
+
+# Example for Turing (T4, RTX 2000 series, ...)
+runtime_compute_cap=75
+
+# Example for A100
+runtime_compute_cap=80
+
+# Example for A10
+runtime_compute_cap=86
+
+# Example for Ada Lovelace (RTX 4000 series, ...)
+runtime_compute_cap=89
+
+# Example for H100
+runtime_compute_cap=90
+
 # Example for Blackwell (RTX 5000 series, ...)
 runtime_compute_cap=120
 
-docker build . -f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
+docker build . -f Dockerfile-cuda-blackwell --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
 ```
 
 ### Apple M1/M2 arm64 architectures

From c40661962348ce3b62b0f7534c663d167740c7be Mon Sep 17 00:00:00 2001
From: Daniele Salvatore Albano
Date: Mon, 13 Oct 2025 10:53:55 +0200
Subject: [PATCH 9/9] Update custom_container.md docs

---
 docs/source/en/custom_container.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/custom_container.md b/docs/source/en/custom_container.md
index 8b85262e..bbdbae27 100644
--- a/docs/source/en/custom_container.md
+++ b/docs/source/en/custom_container.md
@@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
 
 # Build a custom container for TEI
 
-You can build our own CPU or CUDA TEI container using Docker. To build a CPU container, run the following command in the
+You can build your own CPU or CUDA TEI container using Docker. To build a CPU container, run the following command in the
 directory containing your custom Dockerfile:
 
 ```shell
@@ -35,7 +35,8 @@ the examples of runtime compute capabilities for various GPU types:
 - Blackwell (RTX 5000 series, ...)
- `runtime_compute_cap=120`
 
 Once you have determined the compute capability, set it as the `runtime_compute_cap` variable and build
-the container as shown in the example below:
+the container using `Dockerfile-cuda` if the runtime compute cap is lower than 120; otherwise, use
+`Dockerfile-cuda-blackwell`, as shown in the example below:
 
 ```shell
 # Get submodule dependencies