4 changes: 1 addition & 3 deletions components/src/dynamo/trtllm/main.py
@@ -183,7 +183,6 @@ async def init(runtime: DistributedRuntime, config: Config):
"pipeline_parallel_size": config.pipeline_parallel_size,
"moe_expert_parallel_size": config.expert_parallel_size,
"backend": "pytorch",
"skip_tokenizer_init": True,
"build_config": build_config,
"kv_cache_config": kv_cache_config,
"gpus_per_node": gpus_per_node,
@@ -240,8 +239,7 @@ async def init(runtime: DistributedRuntime, config: Config):
# Populate default sampling params from the model
tokenizer = tokenizer_factory(arg_map["model"])
default_sampling_params = SamplingParams()
default_sampling_params._setup(tokenizer)
default_sampling_params.stop = None

model_input = ModelInput.Tokens

# Set model type based on disaggregation mode for unified frontend support
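A note on the `main.py` change: with `skip_tokenizer_init` dropped, the engine now initializes its own tokenizer, and default sampling parameters come straight from `SamplingParams()` rather than the private `_setup(tokenizer)` hook. A minimal sketch of that assumption (it presumes `tensorrt_llm` exports `SamplingParams` at the top level, as the LLM API does, and that `stop` defaults to `None`):

```python
# Sketch only, not the verified TRT-LLM API surface.
from tensorrt_llm import SamplingParams

params = SamplingParams()   # library defaults; no private _setup(tokenizer) call
assert params.stop is None  # so the explicit `stop = None` override is redundant
```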
18 changes: 9 additions & 9 deletions components/src/dynamo/trtllm/utils/trtllm_utils.py
@@ -41,11 +41,11 @@ def __init__(self) -> None:
self.kv_block_size: int = 32
self.migration_limit: int = 0
self.gpus_per_node: Optional[int] = None
self.max_batch_size: int = BuildConfig.max_batch_size
self.max_num_tokens: int = BuildConfig.max_num_tokens
self.max_seq_len: int = BuildConfig.max_seq_len
self.max_beam_width: int = BuildConfig.max_beam_width
self.free_gpu_memory_fraction: Optional[float] = None
self.max_batch_size: int = BuildConfig.model_fields["max_batch_size"].default
self.max_num_tokens: int = BuildConfig.model_fields["max_num_tokens"].default
self.max_seq_len: int = BuildConfig.model_fields["max_seq_len"].default
self.max_beam_width: int = BuildConfig.model_fields["max_beam_width"].default
self.free_gpu_memory_fraction: float = 0.9
self.extra_engine_args: str = ""
self.override_engine_args: str = ""
self.publish_events_and_metrics: bool = False
@@ -174,26 +174,26 @@ def cmd_line_args():
parser.add_argument(
"--max-batch-size",
type=int,
default=BuildConfig.max_batch_size,
default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.",
)
parser.add_argument(
"--max-num-tokens",
type=int,
default=BuildConfig.max_num_tokens,
default=BuildConfig.model_fields["max_num_tokens"].default,
help="Maximum number of batched input tokens after padding is removed in each batch.",
)
parser.add_argument(
"--max-seq-len",
type=int,
default=BuildConfig.max_seq_len,
default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.",
)
parser.add_argument(
"--max-beam-width",
type=int,
default=BuildConfig.max_beam_width,
default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.",
)
parser.add_argument(
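Why the `model_fields` indirection above: in TensorRT-LLM 1.2.0rc2, `BuildConfig` appears to be a Pydantic model rather than a plain dataclass, so class-level attribute access no longer yields a field's default. A minimal illustration with a stand-in model (assumes pydantic v2; the default value is illustrative):

```python
# Stand-in for tensorrt_llm's BuildConfig (assumption: it is a pydantic v2
# model in 1.2.0rc2; the 2048 default here is illustrative).
from pydantic import BaseModel

class BuildConfig(BaseModel):
    max_batch_size: int = 2048

# Class-level access raises AttributeError on pydantic v2 models:
#   BuildConfig.max_batch_size  ->  AttributeError
# so the default must be read from the field metadata instead:
print(BuildConfig.model_fields["max_batch_size"].default)  # 2048
```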
112 changes: 64 additions & 48 deletions container/Dockerfile.trtllm
@@ -2,13 +2,13 @@
# SPDX-License-Identifier: Apache-2.0

ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04"
ARG BASE_IMAGE_TAG="25.10-cuda13.0-devel-ubuntu24.04"

ARG PYTORCH_BASE_IMAGE="nvcr.io/nvidia/pytorch"
ARG PYTORCH_BASE_IMAGE_TAG="25.06-py3"
ARG PYTORCH_BASE_IMAGE_TAG="25.10-py3"
ARG ENABLE_KVBM=false
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.9.1-runtime-ubuntu24.04"
ARG RUNTIME_IMAGE_TAG="13.0.2-runtime-ubuntu24.04"

# TensorRT-LLM specific configuration
ARG HAS_TRTLLM_CONTEXT=0
@@ -72,6 +72,7 @@ RUN apt-get update && \
git \
git-lfs \
ca-certificates && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Copy uv
@@ -82,16 +83,12 @@ RUN mkdir -p /opt/dynamo/venv && \
uv venv /opt/dynamo/venv --python $PYTHON_VERSION

# Copy pytorch installation from NGC PyTorch
ARG TORCH_VER=2.8.0a0+5228986c39.nv25.6
ARG TORCHVISION_VER=0.22.0a0+95f10a4e
ARG SETUPTOOLS_VER=78.1.1
ARG PYTORCH_TRITON_VER=3.3.0+git96316ce52.nvinternal
ARG TORCH_VER=2.9.0a0+145a3a7bda.nv25.10
ARG TORCHVISION_VER=0.24.0a0+094e7af5
# ARG PYTORCH_TRITON_VER=3.4.0+gitc817b9b6
ARG JINJA2_VER=3.1.6
ARG NETWORKX_VER=3.5
ARG SYMPY_VER=1.14.0
ARG PACKAGING_VER=23.2
ARG FLASH_ATTN_VER=2.7.4.post1
ARG MPMATH_VER=1.3.0
ARG FLASH_ATTN_VER=2.7.4.post1+25.10

COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-${TORCH_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch-${TORCH_VER}.dist-info
@@ -107,8 +104,8 @@ COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sy
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn-${FLASH_ATTN_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn_2_cuda.cpython-*-*-linux-gnu.so ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/triton ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/triton
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info
# COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/triton ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/triton
# COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info

# Install TensorRT-LLM and related dependencies
ARG HAS_TRTLLM_CONTEXT
@@ -120,8 +117,7 @@ ARG GITHUB_TRTLLM_COMMIT
COPY --from=trtllm_wheel /*.whl /trtllm_wheel/
COPY --from=trtllm_wheel /*.txt /trtllm_wheel/

# NOTE: locking cuda-python version to <13 to avoid breaks with tensorrt-llm 1.0.0rc6.
RUN uv pip install "cuda-python>=12,<13"
RUN uv pip install --no-cache "cuda-python==13.0.2"

# Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel
# because there might be mismatched versions of TensorRT between the NGC PyTorch
@@ -141,7 +137,7 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
# Install from local wheel directory in build context
WHEEL_FILE="$(find /trtllm_wheel -name "*.whl" | head -n 1)"; \
if [ -n "$WHEEL_FILE" ]; then \
uv pip install "$WHEEL_FILE"; \
uv pip install --no-cache "$WHEEL_FILE"; \
else \
echo "No wheel file found in /trtllm_wheel directory."; \
exit 1; \
@@ -155,7 +151,9 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
bash /tmp/install_tensorrt.sh && \
# Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
uv pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}"; \
# TRTLLM 1.2.0rc2 has issues installing from PyPI with uv; installing from a direct wheel link works best
TENSORRTLLM_PIP_WHEEL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-1.2.0rc2-cp312-cp312-linux_${ARCH_ALT}.whl"; \
uv pip install --no-cache --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" ; \
fi
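Two things change in the install step above: `cuda-python` is pinned to 13.0.2 to match the CUDA 13 base images, and the pip-wheel path sideloads the 1.2.0rc2 wheel directly from pypi.nvidia.com to work around the uv resolution issue. A quick post-install sanity check (assumes it runs inside the built image):

```python
# Post-install sanity check (assumption: executed inside the built image).
from importlib.metadata import version

assert version("cuda-python").startswith("13."), version("cuda-python")
assert version("tensorrt-llm") == "1.2.0rc2", version("tensorrt-llm")
print("cuda-python and tensorrt-llm pins look consistent.")
```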

##################################################
@@ -190,6 +188,10 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# workaround for pickle lib issue
ENV OMPI_MCA_coll_ucc_enable=0
# Use UCX KVCACHE by default
ENV TRTLLM_USE_UCX_KVCACHE=1

ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
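On the two new environment defaults: `OMPI_MCA_coll_ucc_enable=0` disables OMPI's UCC collectives (the pickle workaround noted in the comment), and `TRTLLM_USE_UCX_KVCACHE=1` opts into UCX for KV-cache transfer; build.sh still exposes `TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL` as the alternative. The sketch below is illustrative only; it shows the kind of transport selection these flags imply, not TRT-LLM's actual logic:

```python
# Illustrative only: how a worker might pick a KV-cache transport from the
# new defaults (assumption: TRT-LLM consults TRTLLM_USE_UCX_KVCACHE; the
# fallback naming mirrors TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL in build.sh).
import os

if os.environ.get("TRTLLM_USE_UCX_KVCACHE") == "1":
    transport = "ucx"
elif os.environ.get("TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL") == "1":
    transport = "nixl"
else:
    transport = "default"
print(f"KV-cache transfer transport: {transport}")
```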
@@ -209,7 +211,8 @@ RUN apt-get update && \
# jq for polling various endpoints and health checks
jq \
# CUDA/ML libraries
libcudnn9-cuda-12 \
libcudnn9-cuda-13 \
libnvshmem3-cuda-13 \
# Network and communication libraries
libzmq3-dev \
# RDMA/UCX libraries required to find RDMA devices
@@ -228,6 +231,8 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ENV LD_LIBRARY_PATH="/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13/:${LD_LIBRARY_PATH}"

# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from PyTorch base image
COPY --from=pytorch_base /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc
COPY --from=pytorch_base /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++
@@ -238,6 +243,15 @@ COPY --from=pytorch_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=pytorch_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
COPY --from=pytorch_base /usr/local/cuda/lib64/libcupti* /usr/local/cuda/lib64/
COPY --from=pytorch_base /usr/local/lib/lib* /usr/local/lib/
COPY --from=pytorch_base /usr/local/cuda/bin/cuobjdump /usr/local/cuda/bin/cuobjdump
COPY --from=pytorch_base /usr/local/cuda/bin/nvdisasm /usr/local/cuda/bin/nvdisasm

ENV TRITON_CUPTI_PATH=/usr/local/cuda/include \
TRITON_CUDACRT_PATH=/usr/local/cuda/include \
TRITON_CUOBJDUMP_PATH=/usr/local/cuda/bin/cuobjdump \
TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
TRITON_CUDART_PATH=/usr/local/cuda/include

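With the NGC `pytorch_triton` copy commented out earlier in this Dockerfile, TensorRT-LLM installs its own Triton, and the `TRITON_*` variables point its JIT at the CUDA tools copied from the PyTorch base. A small check that the overrides resolve (assumes it runs inside the built image):

```python
# Verify the TRITON_* tool overrides point at executables that exist
# (assumption: run inside the built image).
import os

for var in ("TRITON_PTXAS_PATH", "TRITON_CUOBJDUMP_PATH", "TRITON_NVDISASM_PATH"):
    path = os.environ[var]
    assert os.access(path, os.X_OK), f"{var} -> {path} is not executable"
print("Triton tool overrides resolve.")
```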
# Copy nats and etcd from dynamo_base image
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
@@ -255,8 +269,6 @@ COPY --from=pytorch_base /opt/hpcx /opt/hpcx
# This is needed to make libucc.so visible so pytorch can use it.
ENV LD_LIBRARY_PATH="/opt/hpcx/ucc/lib:${LD_LIBRARY_PATH}"
# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container
# networkx, packaging, setuptools get overridden by trtllm installation, so not copying them
# pytorch-triton is copied after trtllm installation.
COPY --from=pytorch_base /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/

# Copy uv to system /bin
@@ -305,11 +317,12 @@ ENV LD_LIBRARY_PATH=${TENSORRT_LIB_DIR}:${LD_LIBRARY_PATH}
COPY --chown=dynamo: benchmarks/ /opt/dynamo/benchmarks/
COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN uv pip install \
--no-cache \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
&& if [ "${ENABLE_KVBM}" = "true" ]; then \
uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
uv pip install --no-cache /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \
&& cd /opt/dynamo/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache . \
@@ -321,8 +334,11 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
--mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
UV_GIT_LFS=1 uv pip install \
--no-cache \
--index-strategy unsafe-best-match \
--extra-index-url https://download.pytorch.org/whl/cu130 \
--requirement /tmp/requirements.txt \
--requirement /tmp/requirements.test.txt
--requirement /tmp/requirements.test.txt \
cupy-cuda13x

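`--index-strategy unsafe-best-match` matters here: uv's default first-match strategy stops at the first index that carries a package, while best-match lets the cu130 PyTorch index and PyPI compete so the CUDA 13 builds win. A smoke test for the added `cupy-cuda13x` (assumes the image runs with a visible GPU):

```python
# Smoke test for the cupy-cuda13x addition (assumption: run inside the image
# with a visible GPU).
import cupy

print(cupy.cuda.runtime.runtimeGetVersion())  # expect a 13.x runtime (>= 13000)
```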
# Copy tests, benchmarks, deploy and components for CI with correct ownership
COPY --chown=dynamo: tests /workspace/tests
@@ -346,7 +362,6 @@ RUN chmod 755 /opt/dynamo/.launch_screen && \
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc

USER dynamo

ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []

@@ -374,29 +389,30 @@ USER root
# Install utilities as root
RUN apt-get update -y && \
apt-get install -y --no-install-recommends \
# Install utilities
nvtop \
wget \
tmux \
vim \
git \
iproute2 \
rsync \
zip \
unzip \
htop \
# Build Dependencies
autoconf \
automake \
cmake \
libtool \
meson \
net-tools \
pybind11-dev \
# Rust build dependencies
clang \
libclang-dev \
protobuf-compiler && \
# Install utilities
nvtop \
wget \
tmux \
vim \
git \
iproute2 \
rsync \
zip \
unzip \
htop \
# Build Dependencies
autoconf \
automake \
cmake \
libtool \
meson \
net-tools \
pybind11-dev \
# Rust build dependencies
clang \
libclang-dev \
protobuf-compiler && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Set workspace directory variable
@@ -412,10 +428,10 @@ COPY --from=dynamo_base /usr/local/rustup /usr/local/rustup
COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo

# Install maturin, for maturin develop
RUN uv pip install maturin[patchelf]
RUN uv pip install --no-cache maturin[patchelf]

# Editable install of dynamo
COPY pyproject.toml README.md hatch_build.py /workspace/
RUN uv pip install --no-deps -e .
RUN uv pip install --no-cache --no-deps -e .

CMD []
7 changes: 3 additions & 4 deletions container/build.sh
@@ -59,7 +59,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")

# Base Images
TRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch
TRTLLM_BASE_IMAGE_TAG=25.06-py3
TRTLLM_BASE_IMAGE_TAG=25.10-py3

# Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
# we need to build the TensorRT-LLM wheel from source.
@@ -89,7 +89,7 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
# TensorRT-LLM commit to use for building the trtllm wheel if not provided.
# Important Note: This commit is not used in our CI pipeline. See the CI
# variables to learn how to run a pipeline with a specific commit.
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="0c9430e5a530ba958fc9dca561a3ad865ad9f492"
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="31116825b39f4e6a6a1e127001f5204b73d1dc32" # 1.2.0rc2
TRTLLM_COMMIT=""
TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
TRTLLM_GIT_URL=""
@@ -98,10 +98,9 @@ TRTLLM_GIT_URL=""
DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
# TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
# Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.1.0rc5"
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc2"
TENSORRTLLM_PIP_WHEEL=""


VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# FIXME: NCCL will hang with 25.03, so use 25.01 for now
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
4 changes: 2 additions & 2 deletions container/deps/trtllm/install_nixl.sh
@@ -23,11 +23,11 @@ set -ex

GITHUB_URL="https://github.com"

UCX_VERSION="v1.18.1"
UCX_VERSION="v1.19.1"
UCX_INSTALL_PATH="/usr/local/ucx/"
CUDA_PATH="/usr/local/cuda"

NIXL_COMMIT="16348080f5bdeb9fe6058a23be140cec020ef3f3"
NIXL_COMMIT="97c9b5b48e2ed3f1f2539c461c4971a7db8b1197"

UCX_REPO="https://github.com/openucx/ucx.git"
NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -49,7 +49,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
[project.optional-dependencies]
trtllm =[
"uvloop",
"tensorrt-llm==1.1.0rc5",
"tensorrt-llm==1.2.0rc2",
]

vllm = [
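Finally, note that the `tensorrt-llm` pin now lives in three places (build.sh, the direct wheel URL in Dockerfile.trtllm, and this `trtllm` extra) and must move in lockstep. A hedged consistency check (assumes Python 3.11+ for `tomllib` and execution from the repo root):

```python
# Consistency check across the pin locations (assumptions: Python 3.11+ for
# tomllib, run from the repo root).
import re
import tomllib

with open("pyproject.toml", "rb") as f:
    extras = tomllib.load(f)["project"]["optional-dependencies"]
pyproject_pin = next(d for d in extras["trtllm"] if d.startswith("tensorrt-llm"))

build_sh = open("container/build.sh").read()
m = re.search(r'DEFAULT_TENSORRTLLM_PIP_WHEEL="([^"]+)"', build_sh)
assert m and m.group(1) == pyproject_pin, (pyproject_pin, m and m.group(1))
print(f"Pins agree: {pyproject_pin}")
```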