
Commit cdf9485

revert accidental change

Signed-off-by: Varun Thumbe <[email protected]>

Restrict the number of cases for unfused quantization; some fp8->fp8 cases are handled by cuBLAS.
Signed-off-by: Varun Thumbe <[email protected]>

[pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
Signed-off-by: Varun Thumbe <[email protected]>

fix merge conflict
Signed-off-by: Varun Thumbe <[email protected]>

bug: missed a } in the code
Signed-off-by: Varun Thumbe <[email protected]>

[pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
Signed-off-by: Varun Thumbe <[email protected]>

Add cuBLASMp-backed GEMM-like API to TE common (#1824)

* Pick up cuBLASMp during build
* Saving...
* Change lib order to fix link error
* Saving...
* Context creation, incomplete...
* Test fixture
* Saving...
* A sanity AgGemm test, failing...
* Saving...
* Fix axes
* Take care of uneven distribution
* Use MPI to get position of local matrices
* Refactor
* Refactor & fixes
* Saving...
* Gemm-RS
* Gemm-AR, not working...
* Fixes
* Setting all-reduce epilogue for gemm-ar
* Use supported shapes for GEMM-AR
* Tweak tolerance
* First shot at fp8
* Use TensorHolder in tests
* More test configs
* Support comm_sm_count
* Parametrize dtypes for A, B and D separately
* Tweak scaling
* Amax ptr
* Flags parity with cublas_gemm, saving...
* Cleanup
* Bias tests
* Fix bias test
* Aux, saving...
* aux_ld
* A fix
* Use test::Tensor
* Set scale inv
* Remove unsupported test configs
* Tweak tests
* Replace libcal with NCCL
* Add NVTX markers to API functions
* Tweak GemmAr tests
* More test config
* Fix merge fallout
* Remove MPI dependency, comment API, add algo parameter
* Fix nvshmem dependency
* Fix nvshmem build
* Exclude CommGemm tests from L0_cppunittest
* Add cpp_distributed sh file for CI
* Adapt to TensorAllocator
* Skip GemmAr test on unsupported HW
* Oversubscribe is needed on some clusters
* Fix incomplete libcal removal
* Move CI tests to L1
* Rename context to include NVTE prefix
* Remove leftover code
* NVTE_WITH_CUBLASMP off by default
* More detailed NVTE_CHECK diag
* Comment API
* Include stdbool header for legacy C compilers
* Remove now unused argument
* Abstract away cuBLASMp algo behind our own enum
* More detailed shape diag messages
* Update transformer_engine/common/include/transformer_engine/comm_gemm.h
* Add license

---------
Signed-off-by: Vladimir Cherepanov <[email protected]>
Co-authored-by: Vladimir Cherepanov <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Przemyslaw Tredak <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

FP8 AllGather in FP8 GroupedGEMM + Fix Stream Usage Issue. (#2086)

* FP8 AllGather in FP8 GroupedGEMM
  1. Support current-scaling FP8 quantization with a given amax.
  2. Support FP8 AG in fwd and BF16 RS in bwd.
  3. The workflow is AR-max -> FP8 Quant -> FP8 AG -> FP8 GroupedGEMM (a sketch of this workflow appears after this commit message).
* Slightly refactor
* Adding documents of new args.
* Adding unit-tests.
* Adding license.
* Move unit-tests to L1.
* Move quantizer store/reset into FP8 only.
* Adding all layout support for Blackwell+
* Adopt the feedback from code-review.
* Fixed the wrong stream used by d2d in groupedGEMM FFI.

---------
Signed-off-by: Ming Huang <[email protected]>
Co-authored-by: Phuong Nguyen <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

[JAX] Delay MeshResource validation until first usage (#2124)

Signed-off-by: Jeremy Berchtold <[email protected]>
Co-authored-by: Phuong Nguyen <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

[JAX] Decouple Recipe and ScalingMode (#1728)

* Decouple recipe and scaling mode
* Expose global QuantizeConfig instance as a getter
* Format and lint
* Merge branch 'main' into dev/jberchtold/jax-scaling-mode-and-recipe-decoupling
* Rename UsageType to TensorSource
* Update test_layer.py

---------
Signed-off-by: Jeremy Berchtold <[email protected]>
Signed-off-by: jberchtold-nvidia <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

[JAX] `dot_1_output` sharding constraint + use AXIS_IS_UNSHARDED (#2128)

* add dot_1_output sharding constraint + use AXIS_IS_UNSHARDED

---------
Signed-off-by: Phuong Nguyen <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

[JAX] Add amax input to DBiasQuantizePrimitive and FFI (#2118)

* add amax input to DBiasQuantizePrimitive and FFI
* make sure amax is init with zero
* fix sharding rule

---------
Signed-off-by: Phuong Nguyen <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Varun Thumbe <[email protected]>

Further relax constraints to cuDNN 9.13 for disabling fused attn for kv caching (#2121)

Signed-off-by: Kshitij Lakhani <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

Temporarily remove comm_gemm tests (#2133)

Signed-off-by: Vladimir Cherepanov <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

[PyTorch] Disable determinism for sm100 (#2130)

* disable determinism for sm100+ and cudnn<9.14
* fix remaining CI failures
* revert some changes
* revert more changes
* remove sm100 from determinism table

---------
Signed-off-by: Charlene Yang <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Varun Thumbe <[email protected]>

[PyTorch] ONNX export of FP8 Current Scaling (#2068)

* Compute amax in normalization forward in current scaling in untuned kernels
* fix
* fix
* fix
* code drop
* fix
* fix
* apply Tim's suggestions

---------
Signed-off-by: Jan Bielak <[email protected]>
Signed-off-by: Pawel Gadzinski <[email protected]>
Co-authored-by: Jan Bielak <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Varun Thumbe <[email protected]>

[PyTorch][MOE] Tentative Fix For Replacing from_blob with empty for experts receiving zero tokens (#2134)

use torch.empty for empty shape instead of from_blob

Signed-off-by: zhongboz <[email protected]>
Co-authored-by: Kirthi Shankar Sivamani <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

build: pull cached wheels (#2127)

* build: pull cached wheels
* Update setup.py

---------
Signed-off-by: oliver könig <[email protected]>
Co-authored-by: Kirthi Shankar Sivamani <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

feat: Add support for multiple quantization modes in the UB communicators (#2043)

Signed-off-by: Varun Thumbe <[email protected]>

[Common] Add checks to CUDA kernel launch and CUDA API calls (#2074)

* add checks to cuda kernel launch and cuda API calls
* Remove exceptions from destructors
* fix weird dispatch in ln/rmsnorm

---------
Signed-off-by: Xin Yao <[email protected]>
Signed-off-by: Tim Moon <[email protected]>
Co-authored-by: Tim Moon <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

[PyTorch] Support bf16+fp8 cudagraph (#2098)

* support bf16+fp8 model
* update
* update

---------
Signed-off-by: Robin Zhang <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Tim Moon <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

Dropout with 8-bit RNG (#2014)

* Add dropout kernel with 8-bit RNG
* Fix license
* Avoid ambiguous types
* Do not enforce dropout prob is representable in 8 bits
* Expand error message
* Fix small statistical bug from using less-equal instead of less-than (a numeric illustration follows this commit message); refactor kernel implementations and add comments; interpret masks as bytes rather than 16-bit uints
* Fix linter warning
* Remove unnecessary helper function in PyTorch extensions

---------
Signed-off-by: Tim Moon <[email protected]>
Co-authored-by: Vasudevan Rengasamy <[email protected]>
Co-authored-by: Tim Moon <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Varun Thumbe <[email protected]>

Create GPU reload buffers on main stream (#2131)

* Create GPU reload buffers on main stream
* Fixed typo
* Fixed typo

---------
Signed-off-by: Selvaraj Anandaraj <[email protected]>
Co-authored-by: Selvaraj Anandaraj <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Paweł Gadziński <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>

mxfp8 unfused quant support, refined unit test, remove unnecessary quantization code
Signed-off-by: Varun Thumbe <[email protected]>

[pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
Signed-off-by: Varun Thumbe <[email protected]>

missed a quant code removal
Signed-off-by: Varun Thumbe <[email protected]>

[pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
Signed-off-by: Varun Thumbe <[email protected]>

minor bug fix
Signed-off-by: Varun Thumbe <[email protected]>

[pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
minor code cleanup
Signed-off-by: Varun Thumbe <[email protected]>

[pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci

minor cosmetics
Signed-off-by: Varun Thumbe <[email protected]>

[pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci

Address review comment
Signed-off-by: Varun Thumbe <[email protected]>

[pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci

minor comment update
Signed-off-by: Varun Thumbe <[email protected]>

Fix CI failures for UB overlap changes (#2149)

Signed-off-by: djns99 <[email protected]>

minor bug: quantizer should not be None for unfused quantization
Signed-off-by: Varun Thumbe <[email protected]>

[JAX] Fix failing fused attn tests for dropout=0.1 and bias for sm100 (#2135)

* Fix failing tests for dropout=0.1 and bias for fused attn on Blackwell
* Fix the skip message
* Assert in fused attn bwd pass for sm100; add check for sm100
* Add support to get all devs in the process for JAX
* Code clean up
* Make get_all_device_compute_capability more pythonic, thereby avoiding unnecessary type conversion
* Represent attn bias using enum instead of string

---------
Signed-off-by: Kshitij Lakhani <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

fix linting error
Signed-off-by: Varun Thumbe <[email protected]>

[PyTorch][CUDA Graph] Fix FP8 Weight Quantization Cache under CUDA Graph (#2119)

* add noop to comp amax
* fix for fp8 blockwise recipe
* resolve comments

---------
Signed-off-by: zhongboz <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Tim Moon <[email protected]>

address review comments
Signed-off-by: Varun Thumbe <[email protected]>
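As referenced in the #2086 notes above, the forward workflow is AR-max -> FP8 Quant -> FP8 AG -> FP8 GroupedGEMM. Below is a minimal sketch of that flow using plain PyTorch collectives. It is illustrative only, not Transformer Engine's implementation; the fp8_allgather helper name, the e4m3 scaling constant, and the byte-level gather are assumptions of this sketch.

import torch
import torch.distributed as dist

def fp8_allgather(x, group=None):
    """Hypothetical helper mirroring the #2086 forward flow (illustrative only)."""
    # 1) AR-max: agree on a single global amax across ranks.
    amax = x.abs().amax().to(torch.float32)
    dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=group)
    # 2) FP8 Quant: current scaling from the agreed amax (448 ~ e4m3 max normal).
    scale = 448.0 / torch.clamp(amax, min=1e-12)
    x_fp8 = (x.to(torch.float32) * scale).to(torch.float8_e4m3fn)
    # 3) FP8 AG: gather the FP8 payload as raw bytes, halving the all-gather
    #    traffic relative to a bf16 gather.
    world = dist.get_world_size(group)
    out = torch.empty((world * x.shape[0], *x.shape[1:]),
                      dtype=torch.uint8, device=x.device)
    dist.all_gather_into_tensor(out, x_fp8.view(torch.uint8), group=group)
    # 4) The gathered FP8 tensor plus scale_inv would feed the FP8 GroupedGEMM.
    return out.view(torch.float8_e4m3fn), scale.reciprocal()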
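The less-equal versus less-than fix noted in the Dropout with 8-bit RNG entry (#2014) is easy to check numerically: with a byte threshold t, r < t is true for exactly t of the 256 possible RNG bytes, while r <= t is true for t + 1 of them, biasing the probability by 1/256. A small self-contained check (the threshold convention here is an assumption; the kernel's actual code may differ):

import numpy as np

p_drop = 0.25
t = round(p_drop * 256)             # byte threshold; p_drop need not be exact in 8 bits
r = np.arange(256, dtype=np.uint8)  # all possible 8-bit RNG outputs, uniform

print((r < t).mean())   # 0.25   -> correct probability
print((r <= t).mean())  # 0.2539 -> off by 1/256, the "small statistical bug"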
1 parent 63d631c · commit cdf9485

File tree

109 files changed: +3461 additions, -683 deletions


docs/api/pytorch.rst

Lines changed: 4 additions & 1 deletion
@@ -49,7 +49,7 @@ pyTorch
 
 .. autoapifunction:: transformer_engine.pytorch.moe_permute
 
-.. autoapifunction:: transformer_engine.pytorch.moe_permute_with_probs
+.. autoapifunction:: transformer_engine.pytorch.moe_permute_with_probs
 
 .. autoapifunction:: transformer_engine.pytorch.moe_unpermute
 
@@ -62,3 +62,6 @@ pyTorch
 .. autoapifunction:: transformer_engine.pytorch.initialize_ub
 
 .. autoapifunction:: transformer_engine.pytorch.destroy_ub
+
+.. autoapiclass:: transformer_engine.pytorch.UserBufferQuantizationMode
+   :members: FP8, NONE

docs/examples/onnx/onnx_export.ipynb

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 "\n",
 "<b>Note:</b>\n",
 "\n",
-"Currently, export to ONNX is supported only for high precision, FP8 delayed scaling and MXFP8.\n",
+"Currently, export to ONNX is supported only for high precision, FP8 delayed scaling, FP8 current scaling and MXFP8.\n",
 "\n",
 "</div>\n",
 "\n",

examples/pytorch/comm_gemm_overlap/te_layer_with_overlap.py

Lines changed: 7 additions & 1 deletion
@@ -263,7 +263,13 @@ def dist_print(msg, end="\n", group=nccl_world, src=0, debug=False, error=False)
     te.module.base.initialize_ub(
         [batched_size, hidden_size],
         tp_size,
-        use_fp8=opts.fp8,
+        quantization_modes=[
+            (
+                te.module.base.UserBufferQuantizationMode.FP8
+                if opts.fp8
+                else te.module.base.UserBufferQuantizationMode.NONE
+            )
+        ],
         dtype=torch.bfloat16,
         bootstrap_backend=opts.bootstrap_backend,
     )
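With #2043, initialize_ub takes a list of per-communicator quantization modes in place of the old use_fp8 flag, as the hunk above shows. A minimal calling sketch, assuming torch.distributed is already initialized; the sizes are placeholders, not values from this repo:

import torch
import transformer_engine.pytorch as te

# Illustrative sizes only; the example script above derives these from CLI options.
batched_size, hidden_size, tp_size = 4096, 8192, 8

te.module.base.initialize_ub(
    [batched_size, hidden_size],
    tp_size,
    quantization_modes=[te.module.base.UserBufferQuantizationMode.FP8],
    dtype=torch.bfloat16,
    bootstrap_backend="nccl",  # as in the example above; requires an initialized process group
)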

qa/L0_cppunittest/test.sh

Lines changed: 1 addition & 1 deletion
@@ -17,4 +17,4 @@ cd $TE_PATH/tests/cpp
 cmake -GNinja -Bbuild .
 cmake --build build
 export OMP_NUM_THREADS=$((NUM_PHYSICAL_CORES / NUM_PARALLEL_JOBS))
-ctest --test-dir build -j$NUM_PARALLEL_JOBS
+ctest --test-dir build -j$NUM_PARALLEL_JOBS -E '(AgGemm|GemmRs|GemmAr)'

qa/L1_cpp_distributed/test.sh

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -e
+
+# Find TE
+: ${TE_PATH:=/opt/transformerengine}
+TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}')
+export LD_LIBRARY_PATH=$TE_LIB_PATH:$LD_LIBRARY_PATH
+
+cd $TE_PATH/tests/cpp
+cmake -GNinja -S. -Bbuild
+cmake --build build
+mpirun --allow-run-as-root --np 4 --oversubscribe ./build/comm_gemm/test_comm_gemm

qa/L1_jax_distributed_unittest/test.sh

Lines changed: 1 addition & 0 deletions
@@ -9,3 +9,4 @@ set -xe
 mkdir -p "$XML_LOG_DIR"
 
 NVTE_JAX_UNITTEST_LEVEL="L1" python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest.xml $TE_PATH/tests/jax/test_distributed_*
+SCRIPT_NAME=test_multi_process_distributed_grouped_gemm.py bash $TE_PATH/tests/jax/multi_process_launch.sh

setup.py

Lines changed: 13 additions & 0 deletions
@@ -4,6 +4,7 @@
 
 """Installation script."""
 
+from importlib import metadata
 import os
 import time
 from pathlib import Path
@@ -66,6 +67,18 @@ def setup_common_extension() -> CMakeExtension:
     if bool(int(os.getenv("NVTE_BUILD_ACTIVATION_WITH_FAST_MATH", "0"))):
         cmake_flags.append("-DNVTE_BUILD_ACTIVATION_WITH_FAST_MATH=ON")
 
+    if bool(int(os.getenv("NVTE_WITH_CUBLASMP", "0"))):
+        cmake_flags.append("-DNVTE_WITH_CUBLASMP=ON")
+        cublasmp_dir = os.getenv("CUBLASMP_HOME") or metadata.distribution(
+            "nvidia-cublasmp-cu12"
+        ).locate_file("nvidia/cublasmp/cu12")
+        cmake_flags.append(f"-DCUBLASMP_DIR={cublasmp_dir}")
+        nvshmem_dir = os.getenv("NVSHMEM_HOME") or metadata.distribution(
+            "nvidia-nvshmem-cu12"
+        ).locate_file("nvidia/nvshmem")
+        cmake_flags.append(f"-DNVSHMEM_DIR={nvshmem_dir}")
+        print("CMAKE_FLAGS:", cmake_flags[-2:])
+
     # Add custom CMake arguments from environment variable
     nvte_cmake_extra_args = os.getenv("NVTE_CMAKE_EXTRA_ARGS")
     if nvte_cmake_extra_args:
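The new block resolves the cuBLASMp and NVSHMEM directories from an environment variable first, falling back to the installed wheel's location. A standalone sketch of that fallback chain; it assumes the nvidia-cublasmp-cu12 and nvidia-nvshmem-cu12 wheels are installed, and the _locate helper is hypothetical:

import os
from importlib import metadata

# Same env-var-then-wheel fallback as the setup.py hunk above.
def _locate(env_var, dist_name, subpath):
    # Raises importlib.metadata.PackageNotFoundError if neither is available.
    return os.getenv(env_var) or str(metadata.distribution(dist_name).locate_file(subpath))

print(_locate("CUBLASMP_HOME", "nvidia-cublasmp-cu12", "nvidia/cublasmp/cu12"))
print(_locate("NVSHMEM_HOME", "nvidia-nvshmem-cu12", "nvidia/nvshmem"))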

tests/cpp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@ find_library(TE_LIB NAMES transformer_engine PATHS "${TE_LIB_PATH}/.." ${TE_LIB_
 message(STATUS "Found transformer_engine library: ${TE_LIB}")
 include_directories(../../transformer_engine/common/include)
 include_directories(../../transformer_engine/common)
+include_directories(../../transformer_engine)
 include_directories(${CMAKE_SOURCE_DIR})
 
 find_package(CUDAToolkit REQUIRED)

tests/cpp/comm_gemm/CMakeLists.txt

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+add_executable(test_comm_gemm
+               test_comm_gemm.cu
+               ../test_common.cu)
+
+find_package(OpenMP REQUIRED)
+find_package(MPI REQUIRED)
+find_library(NCCL_LIB
+             NAMES nccl libnccl
+             PATH_SUFFIXES lib
+             REQUIRED)
+target_include_directories(test_comm_gemm PRIVATE ${MPI_CXX_INCLUDE_PATH} $ENV{CUBLASMP_HOME}/include)
+target_link_libraries(test_comm_gemm PUBLIC CUDA::cuda_driver CUDA::cudart GTest::gtest ${TE_LIB} CUDA::nvrtc CUDNN::cudnn MPI::MPI_CXX ${NCCL_LIB} OpenMP::OpenMP_CXX)
+
+include(GoogleTest)
+gtest_discover_tests(test_comm_gemm DISCOVERY_TIMEOUT 600)
