TensorRT-LLM v0.17 Release #2725

Merged on Jan 30, 2025 (1 commit)
43 changes: 43 additions & 0 deletions .github/workflows/auto-assign.yml
@@ -0,0 +1,43 @@
name: auto-assign
on:
  issues:
    types:
      - labeled

jobs:
  assign_issue:
    # Only run on module label colors.
    if: ${{ github.event.label.color == '00611d' }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Get assignee
        uses: actions/github-script@v6
        id: get-assignee
        with:
          github-token: ${{secrets.GITHUB_TOKEN}}
          script: |
            const fs = require('fs');

            // Read configuration file
            const config = JSON.parse(fs.readFileSync('.github/workflows/module-owners.json', 'utf8'));

            // Find matching label in config
            for (const [configLabel, users] of Object.entries(config)) {
              if (configLabel == "${{ github.event.label.name}}") {
                // Select randomly
                const index = Math.floor(Math.random() * users.length)
                const assignee = users[index % users.length];
                return assignee
              }
            }
            // Return an empty string in case a valid assignee is not found.
            return ""
          result-encoding: string

      - name: Assign
        run: gh issue edit ${{ github.event.issue.number }} --add-label "triaged" --add-label "investigating" --add-assignee ${{ steps.get-assignee.outputs.result }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
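
For reference, the get-assignee step boils down to a label-keyed lookup in module-owners.json plus a uniform random pick. A minimal standalone sketch of that logic (not part of this PR; it assumes it is run from the repository root, and `'Generic Runtime'` is just one example key from the config below standing in for `github.event.label.name`):

```js
// Standalone sketch of the workflow's assignee selection (assumption:
// executed from the repo root, next to .github/workflows/module-owners.json).
const fs = require('fs');

const config = JSON.parse(
  fs.readFileSync('.github/workflows/module-owners.json', 'utf8'));

const label = 'Generic Runtime'; // hypothetical example label name

// Uniform random pick from the label's owner list; fall back to an empty
// string when the label is unknown, mirroring the workflow's fallback.
const users = config[label] || [];
const assignee = users.length
  ? users[Math.floor(Math.random() * users.length)]
  : '';
console.log(assignee);
```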
29 changes: 29 additions & 0 deletions .github/workflows/auto-close-inactive-issues.yml
@@ -0,0 +1,29 @@
# Ref: https://docs.github.com/en/actions/managing-issues-and-pull-requests/closing-inactive-issues
name: Close inactive issues
on:
  workflow_dispatch:
  schedule:
    - cron: "0 * * * *"

jobs:
  stale:
    runs-on: ubuntu-latest
    permissions:
      issues: write
      pull-requests: write

    steps:
      - uses: actions/stale@v9
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          stale-issue-message: 'Issue has not received an update in over 14 days. Adding stale label.'
          stale-pr-message: 'PR has not received an update in over 14 days. Adding stale label.'
          close-issue-message: 'This issue was closed because it has been 14 days without activity since it has been marked as stale.'
          close-pr-message: 'This PR was closed because it has been 14 days without activity since it has been marked as stale.'
          days-before-issue-stale: 14
          days-before-close: 14
          only-labels: 'waiting for feedback'
          labels-to-add-when-unstale: 'investigating'
          labels-to-remove-when-unstale: 'stale,waiting for feedback'
          stale-issue-label: 'stale'
          stale-pr-label: 'stale'
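
In addition to the hourly cron schedule, the workflow declares `workflow_dispatch`, so a maintainer can also trigger a run on demand (for example with `gh workflow run "Close inactive issues"` from the GitHub CLI).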
25 changes: 0 additions & 25 deletions .github/workflows/auto_close_inactive_issues.yml

This file was deleted.

14 changes: 14 additions & 0 deletions .github/workflows/module-owners.json
@@ -0,0 +1,14 @@
{
    "Generic Runtime": ["funatiq", "pcastonguay", "Shixiaowei02", "MartinMarciniszyn", "schetlur-nv", "dcampora"],
    "Triton Backend": ["Tabrizian", "pcastonguay", "schetlur-nv"],
    "LLM API/Workflow": ["Superjomn", "syuoni", "nv-guomingz", "litaotju", "QiJune"],
    "KV-Cache Management": ["thorjohnsen", "schetlur-nv"],
    "Low Precision": ["Tracin", "nv-guomingz", "Naveassaf"],
    "Speculative Decoding": ["yweng0828", "nekorobov", "lfr-0531"],
    "Customized Kernels": ["lowsfer", "PerkzZheng", "jdemouth-nvidia"],
    "Performance": ["kaiyux", "jiahanc", "hypdeb"],
    "Lora/P-tuning": ["byshiue", "Naveassaf"],
    "Disaggregated Serving": ["Shixiaowei02", "joyang-nv", "chuangz0", "schetlur-nv"],
    "Documentation": ["nv-guomingz"],
    "Windows": ["pamelap-nvidia"]
}
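
Note that the auto-assign workflow above compares these keys against the triggering label's exact name, and only fires for labels colored `00611d`, so each entry here is presumably meant to correspond to a module label of that color.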
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -7,8 +7,8 @@ repos:
   rev: v1.1.13
   hooks:
   - id: remove-crlf
-- repo: https://github.com/pre-commit/mirrors-yapf
-  rev: v0.32.0
+- repo: https://github.com/google/yapf
+  rev: v0.43.0
   hooks:
   - id: yapf
 - repo: https://github.com/pre-commit/pre-commit-hooks
@@ -47,4 +47,4 @@ repos:
   - --skip=".git,3rdparty"
   - --exclude-file=examples/whisper/tokenizer.py
   - --ignore-words-list=rouge,inout,atleast,strat,nd,subtile,thrid,improbe
-  exclude: 'tests/llm-test-defs/turtle/test_input_files'
+  exclude: 'tests/llm-test-defs/turtle/test_input_files|.*/test_star_attention_input.jsonl'
2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 425 files
40 changes: 30 additions & 10 deletions README.md
@@ -5,11 +5,11 @@ TensorRT-LLM
<h4> A TensorRT Toolbox for Optimized Large Language Model Inference</h4>

[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
-[![python](https://img.shields.io/badge/python-3.12.3-green)](https://www.python.org/downloads/release/python-3123/)
-[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
-[![cuda](https://img.shields.io/badge/cuda-12.6.3-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt](https://img.shields.io/badge/TRT-10.7.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.16.0-green)](./tensorrt_llm/version.py)
+[![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/)
+[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
+[![cuda](https://img.shields.io/badge/cuda-12.8.0-green)](https://developer.nvidia.com/cuda-downloads)
+[![trt](https://img.shields.io/badge/TRT-10.8.0-green)](https://developer.nvidia.com/tensorrt)
+[![version](https://img.shields.io/badge/release-0.17.0-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)
@@ -18,12 +18,31 @@ TensorRT-LLM
<div align="left">

## Latest News
-* [2024/11/19] Llama 3.2 Full-Stack Optimizations Unlock High Performance on NVIDIA GPUs
-[➡️ link](https://developer.nvidia.com/blog/llama-3-2-full-stack-optimizations-unlock-high-performance-on-nvidia-gpus/?ncid=so-link-721194)
+* [2024/12/10] ⚡ Llama 3.3 70B from AI at Meta is accelerated by TensorRT-LLM. 🌟 State-of-the-art model on par with Llama 3.1 405B for reasoning, math, instruction following and tool use. Explore the preview
+[➡️ link](https://build.nvidia.com/meta/llama-3_3-70b-instruct)
<div align="center">
<img src="https://developer-blogs.nvidia.com/wp-content/uploads/2024/11/three-llamas-holding-number-10-signs-1.jpg" width="50%">
<img src="https://media.licdn.com/dms/image/v2/D4E10AQEMmE0xCJ6QvQ/image-shrink_800/image-shrink_800/0/1733853602424?e=1734912000&v=beta&t=mjW-P7skGnxGSgMx07IWczyYYC_05RYypB2mEmbgqZQ" width="50%">
<div align="left">

+* [2024/12/03] 🌟 Boost your AI #inference throughput by up to 3.6x. We now support speculative decoding and tripling token throughput with our NVIDIA TensorRT-LLM. Perfect for your generative AI apps. ⚡ Learn how in this technical deep dive
+[➡️ link](https://nvda.ws/3ZCZTzD)

+* [2024/12/02] Working on deploying ONNX models for performance-critical applications? Try our NVIDIA Nsight Deep Learning Designer ⚡ A user-friendly GUI and tight integration with NVIDIA TensorRT that offers:
+✅ Intuitive visualization of ONNX model graphs
+✅ Quick tweaking of model architecture and parameters
+✅ Detailed performance profiling with either ORT or TensorRT
+✅ Easy building of TensorRT engines
+[➡️ link](https://developer.nvidia.com/nsight-dl-designer?ncid=so-link-485689&linkId=100000315016072)

+* [2024/11/26] 📣 Introducing TensorRT-LLM for Jetson AGX Orin, making it even easier to deploy on Jetson AGX Orin with initial support in JetPack 6.1 via the v0.12.0-jetson branch of the TensorRT-LLM repo. ✅ Pre-compiled TensorRT-LLM wheels & containers for easy integration ✅ Comprehensive guides & docs to get you started
+[➡️ link](https://forums.developer.nvidia.com/t/tensorrt-llm-for-jetson/313227?linkId=100000312718869)

+* [2024/11/21] NVIDIA TensorRT-LLM Multiblock Attention Boosts Throughput by More Than 3x for Long Sequence Lengths on NVIDIA HGX H200
+[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-llm-multiblock-attention-boosts-throughput-by-more-than-3x-for-long-sequence-lengths-on-nvidia-hgx-h200/)

+* [2024/11/19] Llama 3.2 Full-Stack Optimizations Unlock High Performance on NVIDIA GPUs
+[➡️ link](https://developer.nvidia.com/blog/llama-3-2-full-stack-optimizations-unlock-high-performance-on-nvidia-gpus/?ncid=so-link-721194)

* [2024/11/09] 🚀🚀🚀 3x Faster AllReduce with NVSwitch and TensorRT-LLM MultiShot
[➡️ link](https://developer.nvidia.com/blog/3x-faster-allreduce-with-nvswitch-and-tensorrt-llm-multishot/)

@@ -47,6 +47,9 @@ TensorRT-LLM
* [2024/10/07] 🚀🚀🚀Optimizing Microsoft Bing Visual Search with NVIDIA Accelerated Libraries
[➡️ link](https://developer.nvidia.com/blog/optimizing-microsoft-bing-visual-search-with-nvidia-accelerated-libraries/)

+<details close>
+<summary>Previous News</summary>

* [2024/09/29] 🌟 AI at Meta PyTorch + TensorRT v2.4 🌟 ⚡TensorRT 10.1 ⚡PyTorch 2.4 ⚡CUDA 12.4 ⚡Python 3.12
[➡️ link](https://github.com/pytorch/TensorRT/releases/tag/v2.4.0)

@@ -62,8 +62,6 @@ TensorRT-LLM
* [2024/09/04] 🏎️🏎️🏎️ Best Practices for Tuning TensorRT-LLM for Optimal Serving with BentoML
[➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml)

-<details close>
-<summary>Previous News</summary>

* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)
2 changes: 2 additions & 0 deletions benchmarks/cpp/README.md
@@ -368,13 +368,15 @@ cd cpp/build
`disaggServerBenchmark` only supports `decoder-only` models.
Here is the basic usage:
```
+export TRTLLM_USE_MPI_KVCACHE=1
mpirun -n ${proc} benchmarks/disaggServerBenchmark --context_engine_dirs ${context_engine_0},${context_engine_1}...,${context_engine_{m-1}} \
--generation_engine_dirs ${generation_engine_0},${generation_engine_1}...,${generation_engine_{n-1}} --dataset ${dataset_path}
```
This command launches m context engines and n generation engines. `proc` must equal the total number of processes required by all engines, plus 1: `disaggServerBenchmark` runs in orchestrator mode, which needs one extra process for the orchestrator. For example, with two context engines (one TP2_PP1, one TP1_PP1) and two generation engines (one TP2_PP1, one TP1_PP1), the engines need 2 + 1 + 2 + 1 = 6 processes, so `proc` should be set to 6 + 1 = 7.

For example:
```
+export TRTLLM_USE_MPI_KVCACHE=1
mpirun -n 7 benchmarks/disaggServerBenchmark --context_engine_dirs ${llama_7b_tp2_pp1_dir},${llama_7b_tp1_pp1_dir} --generation_engine_dirs ${llama_7b_tp1_pp1_dir},${llama_7b_tp2_pp1_dir} --dataset ${dataset_path}

# need 6 gpus and 7 processes to launch the benchmark.