
Commit 715c3bb

feat: Add support for multiple quantization modes in the UB communicators (#2043)
1 parent f98e305 commit 715c3bb

File tree

14 files changed: +216 −85 lines

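At a glance, the user-facing change is that initialize_ub now takes a quantization_modes list of UserBufferQuantizationMode values instead of the old boolean use_fp8 flag, so one set of userbuffer communicators can be initialized for more than one quantization mode. Below is a minimal sketch of the new call pieced together from the diffs in this commit; the buffer shape, tensor-parallel size, and dtype are illustrative placeholders, and a real run needs an initialized torch.distributed process group across tensor-parallel GPUs.

import torch
import transformer_engine.pytorch as te
from transformer_engine.pytorch import UserBufferQuantizationMode

# Illustrative values only (not from the commit); derive these from the model
# and parallel configuration in practice.
sample_shape = [8192, 4096]  # [sequence * batch, hidden size]
tp_size = 2

te.module.base.initialize_ub(
    sample_shape,
    tp_size,
    # Previously: use_fp8=True. Now: one entry per quantization mode the
    # communicators should serve, e.g. FP8 buffers plus unquantized (NONE) buffers.
    quantization_modes=[
        UserBufferQuantizationMode.FP8,
        UserBufferQuantizationMode.NONE,
    ],
    dtype=torch.bfloat16,
)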

docs/api/pytorch.rst

Lines changed: 4 additions & 1 deletion
@@ -49,7 +49,7 @@ pyTorch
 
 .. autoapifunction:: transformer_engine.pytorch.moe_permute
 
-.. autoapifunction:: transformer_engine.pytorch.moe_permute_with_probs
+.. autoapifunction:: transformer_engine.pytorch.moe_permute_with_probs
 
 .. autoapifunction:: transformer_engine.pytorch.moe_unpermute
 
@@ -62,3 +62,6 @@ pyTorch
 .. autoapifunction:: transformer_engine.pytorch.initialize_ub
 
 .. autoapifunction:: transformer_engine.pytorch.destroy_ub
+
+.. autoapiclass:: transformer_engine.pytorch.UserBufferQuantizationMode
+   :members: FP8, NONE

examples/pytorch/comm_gemm_overlap/te_layer_with_overlap.py

Lines changed: 3 additions & 1 deletion
@@ -263,7 +263,9 @@ def dist_print(msg, end="\n", group=nccl_world, src=0, debug=False, error=False)
     te.module.base.initialize_ub(
         [batched_size, hidden_size],
         tp_size,
-        use_fp8=opts.fp8,
+        quantization_modes=[
+            UserBufferQuantizationMode.FP8 if opts.fp8 else UserBufferQuantizationMode.NONE
+        ],
         dtype=torch.bfloat16,
         bootstrap_backend=opts.bootstrap_backend,
     )

tests/pytorch/distributed/run_layer_with_overlap.py

Lines changed: 65 additions & 12 deletions
@@ -12,6 +12,8 @@
 import warnings
 import pprint
 import yaml
+from contextlib import nullcontext
+from functools import partial
 
 import torch
 import torch.distributed as dist
@@ -35,9 +37,10 @@ def __init__(self, module, num_layers, *args, **kwargs):
         self.num_layers = num_layers
         self.layers = torch.nn.ModuleList([module(*args, **kwargs) for _ in range(num_layers)])
 
-    def forward(self, x):
-        for layer in self.layers:
-            x = layer(x)
+    def forward(self, x, layer_contexts):
+        for layer, context in zip(self.layers, layer_contexts):
+            with context():
+                x = layer(x)
         return x
 
 
@@ -237,12 +240,46 @@ def _parse_args(argv=None, namespace=None):
         default=False,
         help="Print out additional debug information.",
     )
+    parser.add_argument(
+        "--first-last-layers-bf16",
+        action="store_true",
+        default=False,
+        help="Use bf16 for first and last N layers.",
+    )
+    parser.add_argument(
+        "--num-layers-at-start-in-bf16",
+        type=int,
+        default=0,
+        help="Number of layers at the start to run in bf16.",
+    )
+    parser.add_argument(
+        "--num-layers-at-end-in-bf16",
+        type=int,
+        default=0,
+        help="Number of layers at the end to run in bf16.",
+    )
     args = parser.parse_args(argv, namespace)
 
     if args.use_cuda_graphs and args.layer_type in [te.MultiheadAttention, te.TransformerLayer]:
         warnings.warn(f"{args.layer_type.__name__} does not support CUDA Graphs!")
         args.use_cuda_graphs = False
 
+    if not args.first_last_layers_bf16 and (
+        args.num_layers_at_start_in_bf16 > 0 or args.num_layers_at_end_in_bf16 > 0
+    ):
+        warnings.warn(
+            "num-layers-at-start-in-bf16 and num-layers-at-end-in-bf16 are only supported when"
+            " first-last-layers-bf16 is enabled!"
+        )
+        args.num_layers_at_start_in_bf16 = 0
+        args.num_layers_at_end_in_bf16 = 0
+
+    if args.num_layers_at_start_in_bf16 + args.num_layers_at_end_in_bf16 > args.num_layers:
+        raise ValueError(
+            "num-layers-at-start-in-bf16 + num-layers-at-end-in-bf16 must be less than or equal to"
+            " num-layers!"
+        )
+
     return args
 
 
@@ -381,10 +418,17 @@ def dist_print(msg, src=None, end="\n", debug=False, error=False):
         "qkv_dgrad": {"method": "ring_exchange"},
         "fc1_dgrad": {"method": "ring_exchange"},
     }
+
+    quantization_modes = [
+        UserBufferQuantizationMode.FP8 if opts.fp8 else UserBufferQuantizationMode.NONE
+    ]
+    if opts.first_last_layers_bf16 and opts.fp8:
+        quantization_modes.append(UserBufferQuantizationMode.NONE)
+
     te.module.base.initialize_ub(
         [opts.seq_length * opts.batch_size, opts.num_heads * opts.head_dim],
         opts.tp,
-        use_fp8=opts.fp8,
+        quantization_modes=quantization_modes,
         dtype=torch.bfloat16,
         bootstrap_backend=opts.bootstrap_backend,
         ub_cfgs=ub_cfgs if opts.ub_cfg is None else opts.ub_cfg,
@@ -423,6 +467,16 @@ def dist_print(msg, src=None, end="\n", debug=False, error=False):
     elif opts.quantization == "mxfp8":
         fp8_recipe = MXFP8BlockScaling()
 
+    layer_contexts = [
+        (
+            partial(te.fp8_autocast, enabled=opts.fp8, fp8_recipe=fp8_recipe, fp8_group=nccl_world)
+            if opts.num_layers_at_start_in_bf16 <= i
+            and i < (opts.num_layers - opts.num_layers_at_end_in_bf16)
+            else nullcontext
+        )
+        for i in range(opts.num_layers)
+    ]
+
     # Prepare random input tensors
     test_x = torch.randn(input_shape, dtype=torch.float32, device="cuda", requires_grad=True)
     test_x.retain_grad()
@@ -435,14 +489,13 @@ def dist_print(msg, src=None, end="\n", debug=False, error=False):
     # Execute fwd/bwd and collect tensors to test
     def run_fwd_bwd(model, x):
         with torch.amp.autocast("cuda", dtype=torch.bfloat16):
-            with te.fp8_autocast(enabled=opts.fp8, fp8_recipe=fp8_recipe, fp8_group=nccl_world):
-                y = model(x)
-                if isinstance(y, tuple):
-                    out, *_ = y
-                else:
-                    out = y
-                loss = out.sum()
-                loss.backward()
+            y = model(x, layer_contexts)
+            if isinstance(y, tuple):
+                out, *_ = y
+            else:
+                out = y
+            loss = out.sum()
+            loss.backward()
         return out
 
     torch_rng_state = torch.get_rng_state()
tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py

Lines changed: 7 additions & 1 deletion
@@ -506,7 +506,13 @@ def main() -> None:
             model_config.num_heads * model_config.head_dim,
         ],
         torch.distributed.get_world_size(group),
-        use_fp8=model_config.quantization is not None,
+        quantization_modes=[
+            (
+                UserBufferQuantizationMode.FP8
+                if model_config.quantization is not None
+                else UserBufferQuantizationMode.NONE
+            )
+        ],
         dtype=model_config.dtype,
         bootstrap_backend=bootstrap_backend,
         ub_cfgs=userbuffer_configs,

transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp

Lines changed: 1 addition & 1 deletion
@@ -511,7 +511,7 @@ void destroy_communicator_mpi(communicator *comm) {
 }
 
 int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *comm, bool alloc) {
-  if (comm->free_region > NVTE_MAX_REGIONS) return -1;
+  if (comm->free_region >= NVTE_MAX_REGIONS) return -1;
   int hndl = comm->free_region;
   comm->peer_ptr[hndl] = reinterpret_cast<void **>(malloc(sizeof(void *) * (comm->nvsize)));
   size_t aligned_size = bytes;

transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers.h

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@
 using ExtAllgatherOp = std::function<void(void *, size_t, void *, size_t, ExtComm)>;
 using ExtBarrierOp = std::function<void(ExtComm)>;
 
-#define NVTE_MAX_REGIONS 16
+#define NVTE_MAX_REGIONS 32
 #define NVTE_MAX_SMS 32
 #define NVTE_MAX_OPS 32
 #define NVTE_MAX_PEERS 8192
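Two small userbuffers changes back this up: NVTE_MAX_REGIONS doubles from 16 to 32 (presumably to leave room for registering buffer regions for more than one quantization mode, though the commit message does not state this explicitly), and the registration check becomes >= so that free_region == NVTE_MAX_REGIONS is rejected, since valid region handles run from 0 to NVTE_MAX_REGIONS - 1. A small Python sketch of the corrected boundary logic:

NVTE_MAX_REGIONS = 32  # mirrors the new value in userbuffers.h

def can_register_region(free_region: int) -> bool:
    # Valid region handles are 0 .. NVTE_MAX_REGIONS - 1, so registration must be
    # refused once free_region reaches NVTE_MAX_REGIONS (hence ">=" rather than ">"
    # in register_user_buffer_collective).
    return free_region < NVTE_MAX_REGIONS

assert can_register_region(31)
assert not can_register_region(32)  # the old ">" comparison would have wrongly allowed this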

transformer_engine/pytorch/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ def torch_version() -> tuple[int, ...]:
 from transformer_engine.pytorch.module import Fp8Padding, Fp8Unpadding
 from transformer_engine.pytorch.module import initialize_ub
 from transformer_engine.pytorch.module import destroy_ub
+from transformer_engine.pytorch.module import UserBufferQuantizationMode
 from transformer_engine.pytorch.attention import DotProductAttention
 from transformer_engine.pytorch.attention import MultiheadAttention
 from transformer_engine.pytorch.attention import InferenceParams

transformer_engine/pytorch/module/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -11,4 +11,4 @@
 from .rmsnorm import RMSNorm
 from .fp8_padding import Fp8Padding
 from .fp8_unpadding import Fp8Unpadding
-from .base import initialize_ub, destroy_ub
+from .base import initialize_ub, destroy_ub, UserBufferQuantizationMode
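With the re-export above, the enum is reachable both from the top-level package and from the module subpackage. A quick sanity-check sketch (assumes a working transformer_engine installation):

from transformer_engine.pytorch import UserBufferQuantizationMode
from transformer_engine.pytorch.module import UserBufferQuantizationMode as _Mode

# Both names resolve to the same class; the top-level import is a re-export.
assert UserBufferQuantizationMode is _Mode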

0 commit comments