
Commit c1b8230

Merge branch 'main' into xiny/quantized_input
2 parents: fcd52fc + de81b7d

File tree

18 files changed: +649 -245 lines changed


qa/L1_jax_distributed_unittest/test.sh

Lines changed: 1 addition & 0 deletions
@@ -9,3 +9,4 @@ set -xe
 mkdir -p "$XML_LOG_DIR"
 
 NVTE_JAX_UNITTEST_LEVEL="L1" python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest.xml $TE_PATH/tests/jax/test_distributed_*
+SCRIPT_NAME=test_multi_process_distributed_grouped_gemm.py bash $TE_PATH/tests/jax/multi_process_launch.sh

tests/jax/multi_process_launch.sh

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+#!/bin/bash
+
+SCRIPT_NAME="${SCRIPT_NAME:-test.py}"
+
+
+XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
+                --xla_gpu_enable_command_buffer=''"
+
+export XLA_FLAGS="${XLA_BASE_FLAGS}"
+
+NUM_RUNS=$(nvidia-smi --query-gpu=count --format=csv,noheader)
+for ((i=1; i<NUM_RUNS; i++))
+do
+    CUDA_VISIBLE_DEVICES=$i python $SCRIPT_NAME 127.0.0.1:12345 $i $NUM_PROC > /dev/null 2>&1 &
+done
+
+CUDA_VISIBLE_DEVICES=0 python $SCRIPT_NAME 127.0.0.1:12345 0 $NUM_PROC
+
+wait
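For context, the launcher starts one process per visible GPU and passes each one the coordinator address, its process index, and the total process count ($NUM_PROC) as positional arguments. Below is a minimal sketch of how a launched script might consume those arguments, assuming it joins JAX's multi-process runtime via jax.distributed.initialize; it is illustrative only and not the actual test_multi_process_distributed_grouped_gemm.py.

import sys
import jax

# Positional arguments supplied by multi_process_launch.sh (illustrative parsing).
coordinator_address = sys.argv[1]   # e.g. "127.0.0.1:12345"
process_id = int(sys.argv[2])       # this process's index, one process per GPU
num_processes = int(sys.argv[3])    # total number of processes ($NUM_PROC)

# Join the multi-process JAX runtime before running any distributed test logic.
jax.distributed.initialize(
    coordinator_address=coordinator_address,
    num_processes=num_processes,
    process_id=process_id,
)
print(f"process {process_id}/{num_processes}: {jax.local_device_count()} local device(s)")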

tests/jax/test_helper.py

Lines changed: 21 additions & 16 deletions
@@ -14,10 +14,11 @@
 from transformer_engine.common.recipe import Format as FP8Format
 from transformer_engine.jax import fp8_autocast, get_delayed_scaling
 from transformer_engine.jax.quantize import (
-    QuantizeConfig,
+    get_quantize_config,
     is_fp8_available,
     ScalingMode,
     update_collections,
+    TensorSource,
 )
 from transformer_engine.jax.sharding import MeshResource, global_mesh_resource
 
@@ -49,7 +50,7 @@ def test_update_collections(self):
 class TestFP8Functions(unittest.TestCase):
 
     def _check_default_state(self):
-        self.assertFalse(QuantizeConfig.is_fp8_enabled())
+        self.assertFalse(get_quantize_config().is_fp8_enabled())
 
     def _compare_delay_scaling(self, ref, test):
         self.assertTrue(ref.margin == test.margin)
@@ -58,17 +59,23 @@ def _compare_delay_scaling(self, ref, test):
         self.assertTrue(ref.amax_compute_algo == test.amax_compute_algo)
 
     def _compare_current_scaling(self, test):
-        self.assertEqual(QuantizeConfig.FP8_FORMAT, test.fp8_format)
-        self.assertEqual(QuantizeConfig.SCALING_MODE, ScalingMode.CURRENT_TENSOR_SCALING)
+        self.assertEqual(get_quantize_config().FP8_FORMAT, test.fp8_format)
+        for tensor_source in TensorSource:
+            self.assertEqual(
+                get_quantize_config().get_scaling_mode(tensor_source),
+                ScalingMode.CURRENT_TENSOR_SCALING,
+            )
 
     def _compare_mxfp8_scaling(self, test):
-        self.assertEqual(QuantizeConfig.MARGIN, test.margin)
-        self.assertEqual(QuantizeConfig.FP8_FORMAT, test.fp8_format)
-        self.assertEqual(QuantizeConfig.SCALING_MODE, ScalingMode.MXFP8_1D_SCALING)
+        self.assertEqual(get_quantize_config().MARGIN, test.margin)
+        self.assertEqual(get_quantize_config().FP8_FORMAT, test.fp8_format)
+        for tensor_source in TensorSource:
+            self.assertEqual(
+                get_quantize_config().get_scaling_mode(tensor_source), ScalingMode.MXFP8_1D_SCALING
+            )
 
     @unittest.skipIf(not is_fp8_supported, reason=reason)
     def test_fp8_autocast_delayed_scaling(self):
-        QuantizeConfig.finalize()  # Ensure the testing not affect by previous tests.
         self._check_default_state()
 
         with fp8_autocast(enabled=False, fp8_recipe=DelayedScaling(), mesh_resource=MeshResource()):
@@ -78,21 +85,20 @@ def test_fp8_autocast_delayed_scaling(self):
 
         ds = DelayedScaling(margin=5.0, fp8_format=FP8Format.E4M3, amax_history_len=1)
         with fp8_autocast(enabled=True, fp8_recipe=ds, mesh_resource=MeshResource()):
-            self.assertTrue(QuantizeConfig.is_fp8_enabled())
+            self.assertTrue(get_quantize_config().is_fp8_enabled())
             self._compare_delay_scaling(get_delayed_scaling(), ds)
 
         self._check_default_state()
 
         ds = DelayedScaling(margin=3.0, fp8_format=FP8Format.HYBRID, amax_history_len=1)
         with fp8_autocast(enabled=True, fp8_recipe=ds, mesh_resource=MeshResource()):
-            self.assertTrue(QuantizeConfig.is_fp8_enabled())
+            self.assertTrue(get_quantize_config().is_fp8_enabled())
             self._compare_delay_scaling(get_delayed_scaling(), ds)
 
         self._check_default_state()
 
     @unittest.skipIf(not is_fp8_supported, reason=reason)
     def test_fp8_autocast_current_scaling(self):
-        QuantizeConfig.finalize()  # Ensure the testing not affect by previous tests.
         self._check_default_state()
 
         with fp8_autocast(
@@ -104,21 +110,20 @@ def test_fp8_autocast_current_scaling(self):
 
         cs = Float8CurrentScaling(fp8_format=FP8Format.E4M3)
         with fp8_autocast(enabled=True, fp8_recipe=cs, mesh_resource=MeshResource()):
-            self.assertTrue(QuantizeConfig.is_fp8_enabled())
+            self.assertTrue(get_quantize_config().is_fp8_enabled())
             self._compare_current_scaling(cs)
 
         self._check_default_state()
 
         cs = Float8CurrentScaling(fp8_format=FP8Format.HYBRID)
         with fp8_autocast(enabled=True, fp8_recipe=cs, mesh_resource=MeshResource()):
-            self.assertTrue(QuantizeConfig.is_fp8_enabled())
+            self.assertTrue(get_quantize_config().is_fp8_enabled())
             self._compare_current_scaling(cs)
 
         self._check_default_state()
 
     @unittest.skipIf(not is_mxfp8_supported, reason=mxfp8_reason)
     def test_fp8_autocast_mxfp8_block_scaling(self):
-        QuantizeConfig.finalize()  # Ensure the testing not affect by previous tests.
         self._check_default_state()
 
         with fp8_autocast(
@@ -130,14 +135,14 @@ def test_fp8_autocast_mxfp8_block_scaling(self):
 
         bs = MXFP8BlockScaling(margin=5.0, fp8_format=FP8Format.E4M3)
         with fp8_autocast(enabled=True, fp8_recipe=bs, mesh_resource=MeshResource()):
-            self.assertTrue(QuantizeConfig.is_fp8_enabled())
+            self.assertTrue(get_quantize_config().is_fp8_enabled())
             self._compare_mxfp8_scaling(bs)
 
         self._check_default_state()
 
         bs = MXFP8BlockScaling(margin=3.0, fp8_format=FP8Format.HYBRID)
         with fp8_autocast(enabled=True, fp8_recipe=bs, mesh_resource=MeshResource()):
-            self.assertTrue(QuantizeConfig.is_fp8_enabled())
+            self.assertTrue(get_quantize_config().is_fp8_enabled())
             self._compare_mxfp8_scaling(bs)
 
         self._check_default_state()
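The diff above replaces the old QuantizeConfig class attributes (FP8_FORMAT, SCALING_MODE, ...) with the get_quantize_config() accessor, and scaling modes are now queried per tensor source. A minimal usage sketch, assuming DelayedScaling is importable from transformer_engine.common.recipe alongside Format, as in the test file:

from transformer_engine.common.recipe import DelayedScaling, Format as FP8Format
from transformer_engine.jax import fp8_autocast
from transformer_engine.jax.quantize import get_quantize_config, TensorSource
from transformer_engine.jax.sharding import MeshResource

ds = DelayedScaling(margin=5.0, fp8_format=FP8Format.E4M3, amax_history_len=1)
with fp8_autocast(enabled=True, fp8_recipe=ds, mesh_resource=MeshResource()):
    cfg = get_quantize_config()                   # read the active config instead of QuantizeConfig class attributes
    assert cfg.is_fp8_enabled()
    fmt = cfg.FP8_FORMAT                          # FP8 format of the recipe in effect
    mode = cfg.get_scaling_mode(TensorSource.X)   # scaling mode is now queried per tensor source

assert not get_quantize_config().is_fp8_enabled()  # default state restored outside the context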

tests/jax/test_layer.py

Lines changed: 21 additions & 24 deletions
@@ -23,12 +23,14 @@
 from transformer_engine.common import recipe
 from transformer_engine.jax.flax import TransformerLayer, TransformerLayerType
 from transformer_engine.jax.quantize import (
-    QuantizeConfig,
+    get_quantize_config,
     ScalingMode,
     is_fp8_available,
     update_collections,
+    TensorSource,
+    fp8_autocast,
 )
-from transformer_engine.jax.sharding import MeshResource, global_shard_guard
+from transformer_engine.jax.sharding import MeshResource
 
 
 @pytest.fixture(autouse=True, scope="function")
@@ -356,7 +358,7 @@ def test_backward(
 
         ref_params, test_params = self._sync_params(ref_params, test_params)
 
-        if QuantizeConfig.is_fp8_enabled():
+        if get_quantize_config().is_fp8_enabled():
             for _ in range(4):
                 _, updated_state = jax.value_and_grad(self._loss_fn, argnums=(3,), has_aux=False)(
                     inputs,
@@ -365,12 +367,15 @@ def test_backward(
                     test_others,
                     test_layer,
                 )
-                if QuantizeConfig.SCALING_MODE == ScalingMode.DELAYED_TENSOR_SCALING:
+                if (
+                    get_quantize_config().get_scaling_mode(TensorSource.X)
+                    == ScalingMode.DELAYED_TENSOR_SCALING
+                ):
                     _, updated_quantize_meta = flax.core.pop(
-                        updated_state[0], QuantizeConfig.COLLECTION_NAME
+                        updated_state[0], get_quantize_config().COLLECTION_NAME
                     )
                     test_others = update_collections(
-                        {QuantizeConfig.COLLECTION_NAME: updated_quantize_meta}, test_others
+                        {get_quantize_config().COLLECTION_NAME: updated_quantize_meta}, test_others
                     )
                     del updated_quantize_meta
                     del updated_state
@@ -500,41 +505,33 @@ class BaseTester:
 
     def test_forward(self, data_shape, dtype, attrs):
         """Test normal datatype forward"""
-        QuantizeConfig.finalize()  # Ensure FP8 disabled.
-        with global_shard_guard(
-            MeshResource()
-        ):  # Empty MeshResource is used as we are running on a single device
+        # Ensure FP8 disabled.
+        # Empty MeshResource is used as we are running on a single device
+        with fp8_autocast(enabled=False, mesh_resource=MeshResource()):
             self.runner(attrs).test_forward(data_shape, dtype)
 
     def test_backward(self, data_shape, dtype, attrs):
         """Test normal datatype backward"""
-        QuantizeConfig.finalize()  # Ensure FP8 disabled.
-        with global_shard_guard(
-            MeshResource()
-        ):  # Empty MeshResource is used as we are running on a single device
+        # Ensure FP8 disabled.
+        # Empty MeshResource is used as we are running on a single device
+        with fp8_autocast(enabled=False, mesh_resource=MeshResource()):
            self.runner(attrs).test_backward(data_shape, dtype)
 
     @pytest.mark.skipif(not is_fp8_supported, reason=reason)
     @pytest.mark.parametrize("fp8_recipe", QUANTIZE_RECIPES)
     def test_forward_with_fp8(self, data_shape, dtype, attrs, fp8_recipe):
         """Test forward with fp8 enabled"""
-        QuantizeConfig.initialize(fp8_recipe=fp8_recipe)
-        with global_shard_guard(
-            MeshResource()
-        ):  # Empty MeshResource is used as we are running on a single device
+        # Empty MeshResource is used as we are running on a single device
+        with fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, mesh_resource=MeshResource()):
             self.runner(attrs).test_forward(data_shape, dtype, rtol=1e-4, atol=1e-3)
-        QuantizeConfig.finalize()
 
     @pytest.mark.skipif(not is_fp8_supported, reason=reason)
     @pytest.mark.parametrize("fp8_recipe", QUANTIZE_RECIPES)
     def test_backward_with_fp8(self, data_shape, dtype, attrs, fp8_recipe):
         """Test backward with fp8 enabled"""
-        QuantizeConfig.initialize(fp8_recipe=fp8_recipe)
-        with global_shard_guard(
-            MeshResource()
-        ):  # Empty MeshResource is used as we are running on a single device
+        # Empty MeshResource is used as we are running on a single device
+        with fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, mesh_resource=MeshResource()):
             self.runner(attrs).test_backward(data_shape, dtype, rtol=1e-4, atol=1e-3)
-        QuantizeConfig.finalize()
 
 
 class TestEncoderLayer(BaseTester):
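In BaseTester, the explicit QuantizeConfig.initialize()/finalize() calls and the global_shard_guard(MeshResource()) wrapper are folded into a single fp8_autocast context manager, which also restores the default (FP8-disabled) state on exit. A minimal sketch of the new pattern, using a hypothetical run_with_fp8 helper purely for illustration:

from transformer_engine.jax.quantize import fp8_autocast, get_quantize_config
from transformer_engine.jax.sharding import MeshResource


def run_with_fp8(test_body, fp8_recipe):
    """Hypothetical helper mirroring the new BaseTester pattern."""
    # Empty MeshResource: single-device run, as in the tests above.
    with fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, mesh_resource=MeshResource()):
        test_body()
    # No explicit QuantizeConfig.finalize() needed: the context manager
    # restores the default state, so FP8 is disabled again here.
    assert not get_quantize_config().is_fp8_enabled()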
