4 changes: 1 addition & 3 deletions components/src/dynamo/trtllm/main.py
@@ -183,7 +183,6 @@ async def init(runtime: DistributedRuntime, config: Config):
"pipeline_parallel_size": config.pipeline_parallel_size,
"moe_expert_parallel_size": config.expert_parallel_size,
"backend": "pytorch",
"skip_tokenizer_init": True,
"build_config": build_config,
"kv_cache_config": kv_cache_config,
"gpus_per_node": gpus_per_node,
@@ -240,8 +239,7 @@ async def init(runtime: DistributedRuntime, config: Config):
# Populate default sampling params from the model
tokenizer = tokenizer_factory(arg_map["model"])
default_sampling_params = SamplingParams()
default_sampling_params._setup(tokenizer)
default_sampling_params.stop = None

model_input = ModelInput.Tokens

# Set model type based on disaggregation mode for unified frontend support
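A note on the `main.py` change: with `skip_tokenizer_init` dropped, the engine now initializes its own tokenizer, and default sampling parameters come straight from `SamplingParams()` rather than the private `_setup(tokenizer)` hook. A minimal sketch of that assumption (it presumes `tensorrt_llm` exports `SamplingParams` at the top level, as the LLM API does, and that `stop` defaults to `None`):

```python
# Sketch only, not the verified TRT-LLM API surface.
from tensorrt_llm import SamplingParams

params = SamplingParams()   # library defaults; no private _setup(tokenizer) call
assert params.stop is None  # so the explicit `stop = None` override is redundant
```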
18 changes: 9 additions & 9 deletions components/src/dynamo/trtllm/utils/trtllm_utils.py
@@ -41,11 +41,11 @@ def __init__(self) -> None:
self.kv_block_size: int = 32
self.migration_limit: int = 0
self.gpus_per_node: Optional[int] = None
self.max_batch_size: int = BuildConfig.max_batch_size
self.max_num_tokens: int = BuildConfig.max_num_tokens
self.max_seq_len: int = BuildConfig.max_seq_len
self.max_beam_width: int = BuildConfig.max_beam_width
self.free_gpu_memory_fraction: Optional[float] = None
self.max_batch_size: int = BuildConfig.model_fields["max_batch_size"].default
self.max_num_tokens: int = BuildConfig.model_fields["max_num_tokens"].default
self.max_seq_len: int = BuildConfig.model_fields["max_seq_len"].default
self.max_beam_width: int = BuildConfig.model_fields["max_beam_width"].default
self.free_gpu_memory_fraction: float = 0.9
self.extra_engine_args: str = ""
self.override_engine_args: str = ""
self.publish_events_and_metrics: bool = False
@@ -174,26 +174,26 @@ def cmd_line_args():
parser.add_argument(
"--max-batch-size",
type=int,
default=BuildConfig.max_batch_size,
default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.",
)
parser.add_argument(
"--max-num-tokens",
type=int,
default=BuildConfig.max_num_tokens,
default=BuildConfig.model_fields["max_num_tokens"].default,
help="Maximum number of batched input tokens after padding is removed in each batch.",
)
parser.add_argument(
"--max-seq-len",
type=int,
default=BuildConfig.max_seq_len,
default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.",
)
parser.add_argument(
"--max-beam-width",
type=int,
default=BuildConfig.max_beam_width,
default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.",
)
parser.add_argument(
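Why the `model_fields` indirection above: in TensorRT-LLM 1.2.0rc2, `BuildConfig` appears to be a Pydantic model rather than a plain dataclass, so class-level attribute access no longer yields a field's default. A minimal illustration with a stand-in model (assumes pydantic v2; the default value is illustrative):

```python
# Stand-in for tensorrt_llm's BuildConfig (assumption: it is a pydantic v2
# model in 1.2.0rc2; the 2048 default here is illustrative).
from pydantic import BaseModel

class BuildConfig(BaseModel):
    max_batch_size: int = 2048

# Class-level access raises AttributeError on pydantic v2 models:
#   BuildConfig.max_batch_size  ->  AttributeError
# so the default must be read from the field metadata instead:
print(BuildConfig.model_fields["max_batch_size"].default)  # 2048
```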
112 changes: 64 additions & 48 deletions container/Dockerfile.trtllm
@@ -2,13 +2,13 @@
# SPDX-License-Identifier: Apache-2.0

ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04"
ARG BASE_IMAGE_TAG="25.10-cuda13.0-devel-ubuntu24.04"

ARG PYTORCH_BASE_IMAGE="nvcr.io/nvidia/pytorch"
ARG PYTORCH_BASE_IMAGE_TAG="25.06-py3"
ARG PYTORCH_BASE_IMAGE_TAG="25.10-py3"
ARG ENABLE_KVBM=false
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.9.1-runtime-ubuntu24.04"
ARG RUNTIME_IMAGE_TAG="13.0.2-runtime-ubuntu24.04"

# TensorRT-LLM specific configuration
ARG HAS_TRTLLM_CONTEXT=0
@@ -72,6 +72,7 @@ RUN apt-get update && \
git \
git-lfs \
ca-certificates && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Copy uv
@@ -82,16 +83,12 @@ RUN mkdir -p /opt/dynamo/venv && \
uv venv /opt/dynamo/venv --python $PYTHON_VERSION

# Copy pytorch installation from NGC PyTorch
ARG TORCH_VER=2.8.0a0+5228986c39.nv25.6
ARG TORCHVISION_VER=0.22.0a0+95f10a4e
ARG SETUPTOOLS_VER=78.1.1
ARG PYTORCH_TRITON_VER=3.3.0+git96316ce52.nvinternal
ARG TORCH_VER=2.9.0a0+145a3a7bda.nv25.10
ARG TORCHVISION_VER=0.24.0a0+094e7af5
# ARG PYTORCH_TRITON_VER=3.4.0+gitc817b9b6
ARG JINJA2_VER=3.1.6
ARG NETWORKX_VER=3.5
ARG SYMPY_VER=1.14.0
ARG PACKAGING_VER=23.2
ARG FLASH_ATTN_VER=2.7.4.post1
ARG MPMATH_VER=1.3.0
ARG FLASH_ATTN_VER=2.7.4.post1+25.10

COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-${TORCH_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch-${TORCH_VER}.dist-info
@@ -107,8 +104,8 @@ COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sy
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn-${FLASH_ATTN_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn_2_cuda.cpython-*-*-linux-gnu.so ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/triton ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/triton
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info
# COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/triton ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/triton
# COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info

# Install TensorRT-LLM and related dependencies
ARG HAS_TRTLLM_CONTEXT
@@ -120,8 +117,7 @@ ARG GITHUB_TRTLLM_COMMIT
COPY --from=trtllm_wheel /*.whl /trtllm_wheel/
COPY --from=trtllm_wheel /*.txt /trtllm_wheel/

# NOTE: locking cuda-python version to <13 to avoid breaks with tensorrt-llm 1.0.0rc6.
RUN uv pip install "cuda-python>=12,<13"
RUN uv pip install --no-cache "cuda-python==13.0.2"

# Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel
# because there might be mismatched versions of TensorRT between the NGC PyTorch
@@ -141,7 +137,7 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
# Install from local wheel directory in build context
WHEEL_FILE="$(find /trtllm_wheel -name "*.whl" | head -n 1)"; \
if [ -n "$WHEEL_FILE" ]; then \
uv pip install "$WHEEL_FILE"; \
uv pip install --no-cache "$WHEEL_FILE"; \
else \
echo "No wheel file found in /trtllm_wheel directory."; \
exit 1; \
@@ -155,7 +151,9 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
bash /tmp/install_tensorrt.sh && \
# Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
uv pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}"; \
# TRTLLM 1.2.0rc2 has issues installing from PyPI with uv; installing from a direct wheel link works best
TENSORRTLLM_PIP_WHEEL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-1.2.0rc2-cp312-cp312-linux_${ARCH_ALT}.whl"; \
uv pip install --no-cache --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" ; \
fi
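Two things change in the install step above: `cuda-python` is pinned to 13.0.2 to match the CUDA 13 base images, and the pip-wheel path sideloads the 1.2.0rc2 wheel directly from pypi.nvidia.com to work around the uv resolution issue. A quick post-install sanity check (assumes it runs inside the built image):

```python
# Post-install sanity check (assumption: executed inside the built image).
from importlib.metadata import version

assert version("cuda-python").startswith("13."), version("cuda-python")
assert version("tensorrt-llm") == "1.2.0rc2", version("tensorrt-llm")
print("cuda-python and tensorrt-llm pins look consistent.")
```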

##################################################
@@ -190,6 +188,10 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# workaround for pickle lib issue
ENV OMPI_MCA_coll_ucc_enable=0
# Use UCX KVCACHE by default
ENV TRTLLM_USE_UCX_KVCACHE=1

ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
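On the two new environment defaults: `OMPI_MCA_coll_ucc_enable=0` disables OMPI's UCC collectives (the pickle workaround noted in the comment), and `TRTLLM_USE_UCX_KVCACHE=1` opts into UCX for KV-cache transfer; build.sh still exposes `TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL` as the alternative. The sketch below is illustrative only; it shows the kind of transport selection these flags imply, not TRT-LLM's actual logic:

```python
# Illustrative only: how a worker might pick a KV-cache transport from the
# new defaults (assumption: TRT-LLM consults TRTLLM_USE_UCX_KVCACHE; the
# fallback naming mirrors TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL in build.sh).
import os

if os.environ.get("TRTLLM_USE_UCX_KVCACHE") == "1":
    transport = "ucx"
elif os.environ.get("TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL") == "1":
    transport = "nixl"
else:
    transport = "default"
print(f"KV-cache transfer transport: {transport}")
```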
@@ -209,7 +211,8 @@ RUN apt-get update && \
# jq for polling various endpoints and health checks
jq \
# CUDA/ML libraries
libcudnn9-cuda-12 \
libcudnn9-cuda-13 \
libnvshmem3-cuda-13 \
# Network and communication libraries
libzmq3-dev \
# RDMA/UCX libraries required to find RDMA devices
@@ -228,6 +231,8 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ENV LD_LIBRARY_PATH="/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13/:${LD_LIBRARY_PATH}"

# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from PyTorch base image
COPY --from=pytorch_base /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc
COPY --from=pytorch_base /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++
@@ -238,6 +243,15 @@ COPY --from=pytorch_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=pytorch_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
COPY --from=pytorch_base /usr/local/cuda/lib64/libcupti* /usr/local/cuda/lib64/
COPY --from=pytorch_base /usr/local/lib/lib* /usr/local/lib/
COPY --from=pytorch_base /usr/local/cuda/bin/cuobjdump /usr/local/cuda/bin/cuobjdump
COPY --from=pytorch_base /usr/local/cuda/bin/nvdisasm /usr/local/cuda/bin/nvdisasm

ENV TRITON_CUPTI_PATH=/usr/local/cuda/include \
TRITON_CUDACRT_PATH=/usr/local/cuda/include \
TRITON_CUOBJDUMP_PATH=/usr/local/cuda/bin/cuobjdump \
TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
TRITON_CUDART_PATH=/usr/local/cuda/include

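With the NGC `pytorch_triton` copy commented out earlier in this Dockerfile, TensorRT-LLM installs its own Triton, and the `TRITON_*` variables point its JIT at the CUDA tools copied from the PyTorch base. A small check that the overrides resolve (assumes it runs inside the built image):

```python
# Verify the TRITON_* tool overrides point at executables that exist
# (assumption: run inside the built image).
import os

for var in ("TRITON_PTXAS_PATH", "TRITON_CUOBJDUMP_PATH", "TRITON_NVDISASM_PATH"):
    path = os.environ[var]
    assert os.access(path, os.X_OK), f"{var} -> {path} is not executable"
print("Triton tool overrides resolve.")
```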
# Copy nats and etcd from dynamo_base image
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
@@ -255,8 +269,6 @@ COPY --from=pytorch_base /opt/hpcx /opt/hpcx
# This is needed to make libucc.so visible so pytorch can use it.
ENV LD_LIBRARY_PATH="/opt/hpcx/ucc/lib:${LD_LIBRARY_PATH}"
# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container
# networkx, packaging, setuptools get overridden by trtllm installation, so not copying them
# pytorch-triton is copied after trtllm installation.
COPY --from=pytorch_base /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/

# Copy uv to system /bin
@@ -305,11 +317,12 @@ ENV LD_LIBRARY_PATH=${TENSORRT_LIB_DIR}:${LD_LIBRARY_PATH}
COPY --chown=dynamo: benchmarks/ /opt/dynamo/benchmarks/
COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN uv pip install \
--no-cache \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
&& if [ "${ENABLE_KVBM}" = "true" ]; then \
uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
uv pip install --no-cache /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \
&& cd /opt/dynamo/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache . \
@@ -321,8 +334,11 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
--mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
UV_GIT_LFS=1 uv pip install \
--no-cache \
--index-strategy unsafe-best-match \
--extra-index-url https://download.pytorch.org/whl/cu130 \
--requirement /tmp/requirements.txt \
--requirement /tmp/requirements.test.txt
--requirement /tmp/requirements.test.txt \
cupy-cuda13x

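`--index-strategy unsafe-best-match` matters here: uv's default first-match strategy stops at the first index that carries a package, while best-match lets the cu130 PyTorch index and PyPI compete so the CUDA 13 builds win. A smoke test for the added `cupy-cuda13x` (assumes the image runs with a visible GPU):

```python
# Smoke test for the cupy-cuda13x addition (assumption: run inside the image
# with a visible GPU).
import cupy

print(cupy.cuda.runtime.runtimeGetVersion())  # expect a 13.x runtime (>= 13000)
```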
# Copy tests, benchmarks, deploy and components for CI with correct ownership
COPY --chown=dynamo: tests /workspace/tests
@@ -346,7 +362,6 @@ RUN chmod 755 /opt/dynamo/.launch_screen && \
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc

USER dynamo

ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []

@@ -374,29 +389,30 @@ USER root
# Install utilities as root
RUN apt-get update -y && \
apt-get install -y --no-install-recommends \
# Install utilities
nvtop \
wget \
tmux \
vim \
git \
iproute2 \
rsync \
zip \
unzip \
htop \
# Build Dependencies
autoconf \
automake \
cmake \
libtool \
meson \
net-tools \
pybind11-dev \
# Rust build dependencies
clang \
libclang-dev \
protobuf-compiler && \
# Install utilities
nvtop \
wget \
tmux \
vim \
git \
iproute2 \
rsync \
zip \
unzip \
htop \
# Build Dependencies
autoconf \
automake \
cmake \
libtool \
meson \
net-tools \
pybind11-dev \
# Rust build dependencies
clang \
libclang-dev \
protobuf-compiler && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Set workspace directory variable
@@ -412,10 +428,10 @@ COPY --from=dynamo_base /usr/local/rustup /usr/local/rustup
COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo

# Install maturin, for maturin develop
RUN uv pip install maturin[patchelf]
RUN uv pip install --no-cache maturin[patchelf]

# Editable install of dynamo
COPY pyproject.toml README.md hatch_build.py /workspace/
RUN uv pip install --no-deps -e .
RUN uv pip install --no-cache --no-deps -e .

CMD []
7 changes: 3 additions & 4 deletions container/build.sh
@@ -59,7 +59,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")

# Base Images
TRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch
TRTLLM_BASE_IMAGE_TAG=25.06-py3
TRTLLM_BASE_IMAGE_TAG=25.10-py3

# Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
# we need to build the TensorRT-LLM wheel from source.
@@ -89,7 +89,7 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
# TensorRT-LLM commit to use for building the trtllm wheel if not provided.
# Important Note: This commit is not used in our CI pipeline. See the CI
# variables to learn how to run a pipeline with a specific commit.
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="0c9430e5a530ba958fc9dca561a3ad865ad9f492"
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="31116825b39f4e6a6a1e127001f5204b73d1dc32" # 1.2.0rc2
TRTLLM_COMMIT=""
TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
TRTLLM_GIT_URL=""
@@ -98,10 +98,9 @@ TRTLLM_GIT_URL=""
DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
# TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
# Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.1.0rc5"
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc2"
TENSORRTLLM_PIP_WHEEL=""


VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# FIXME: NCCL will hang with 25.03, so use 25.01 for now
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
4 changes: 2 additions & 2 deletions container/deps/trtllm/install_nixl.sh
@@ -23,11 +23,11 @@ set -ex

GITHUB_URL="https://github.com"

UCX_VERSION="v1.18.1"
UCX_VERSION="v1.19.1"
UCX_INSTALL_PATH="/usr/local/ucx/"
CUDA_PATH="/usr/local/cuda"

NIXL_COMMIT="16348080f5bdeb9fe6058a23be140cec020ef3f3"
NIXL_COMMIT="97c9b5b48e2ed3f1f2539c461c4971a7db8b1197"

UCX_REPO="https://github.com/openucx/ucx.git"
NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -49,7 +49,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
[project.optional-dependencies]
trtllm =[
"uvloop",
"tensorrt-llm==1.1.0rc5",
"tensorrt-llm==1.2.0rc2",
]

vllm = [
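Finally, note that the `tensorrt-llm` pin now lives in three places (build.sh, the direct wheel URL in Dockerfile.trtllm, and this `trtllm` extra) and must move in lockstep. A hedged consistency check (assumes Python 3.11+ for `tomllib` and execution from the repo root):

```python
# Consistency check across the pin locations (assumptions: Python 3.11+ for
# tomllib, run from the repo root).
import re
import tomllib

with open("pyproject.toml", "rb") as f:
    extras = tomllib.load(f)["project"]["optional-dependencies"]
pyproject_pin = next(d for d in extras["trtllm"] if d.startswith("tensorrt-llm"))

build_sh = open("container/build.sh").read()
m = re.search(r'DEFAULT_TENSORRTLLM_PIP_WHEEL="([^"]+)"', build_sh)
assert m and m.group(1) == pyproject_pin, (pyproject_pin, m and m.group(1))
print(f"Pins agree: {pyproject_pin}")
```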