
Commit e122173

[PyTorch] Cache RHT device tensors properly (#2395)
* Cache device tensors properly
  Signed-off-by: Kirthi Shankar Sivamani <[email protected]>
* Fix annotation and add test
  Signed-off-by: Kirthi Shankar Sivamani <[email protected]>
* skip nvfp4 test if not supported
  Signed-off-by: Kirthi Shankar Sivamani <[email protected]>
---------
Signed-off-by: Kirthi Shankar Sivamani <[email protected]>
1 parent d677a26 commit e122173
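
Context for the change: the RHT helpers in nvfp4_tensor.py are memoized with functools.lru_cache, but the cache key previously did not include a device while the tensors were allocated with device="cuda", i.e. on whichever GPU was current when the cache was first populated. Any NVFP4Quantizer created afterwards, on any device, got that same cached tensor back. A minimal sketch of the pattern and of the fix, using hypothetical helper names rather than Transformer Engine code and assuming at least two visible GPUs:

import functools

import torch

@functools.lru_cache(maxsize=None)
def get_matrix_buggy() -> torch.Tensor:
    # "cuda" resolves to whichever device is current on the *first* call;
    # lru_cache then hands that same tensor to every later caller.
    return torch.eye(16, dtype=torch.bfloat16, device="cuda")

@functools.lru_cache(maxsize=None)
def get_matrix_fixed(device: int) -> torch.Tensor:
    # Making the device part of the cache key keeps one entry per GPU.
    return torch.eye(16, dtype=torch.bfloat16, device=device)

with torch.cuda.device(torch.cuda.device_count() - 1):
    _ = get_matrix_buggy()         # cache is now pinned to the last GPU

print(get_matrix_buggy().device)   # still the last GPU, whatever is current now
print(get_matrix_fixed(0).device)  # cuda:0
print(get_matrix_fixed(1).device)  # cuda:1 (separate cache entry)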

File tree

2 files changed: 50 additions & 16 deletions


tests/pytorch/distributed/test_sanity.py

Lines changed: 33 additions & 1 deletion
@@ -7,7 +7,16 @@
 import pytest
 import torch
 import transformer_engine
-from transformer_engine.pytorch import DotProductAttention, TransformerLayer, Linear, GroupedLinear
+from transformer_engine.pytorch import (
+    DotProductAttention,
+    TransformerLayer,
+    Linear,
+    GroupedLinear,
+    NVFP4Quantizer,
+    autocast,
+    is_nvfp4_available,
+)
+from transformer_engine.common import recipe
 
 _current_file = pathlib.Path(__file__).resolve()
 sys.path.append(str(_current_file.parent.parent))
@@ -17,6 +26,8 @@
     "small": ModelConfig(2, 10, 2, 16),
 }
 
+nvfp4_available, reason_for_no_nvfp4 = is_nvfp4_available(return_reason=True)
+
 
 @pytest.mark.parametrize("model", ["small"])
 @pytest.mark.parametrize(
@@ -138,3 +149,24 @@ def test_current_device(model, module):
     assert (
         tensor_device_grad == tensor_device
     ), "The gradient tensor should be the same as the input tensors!"
+
+
+@pytest.mark.skipif(not nvfp4_available, reason=reason_for_no_nvfp4)
+def test_nvfp4_rht_cache():
+    """Ensure correct RHT cache for NVFP4."""
+
+    num_devices = torch.cuda.device_count()
+    assert num_devices > 1, "This test requires more than one GPU!"
+
+    # Populate cache on last device.
+    with torch.cuda.device(num_devices - 1):
+        _ = NVFP4Quantizer()
+
+    hidden_size = 128
+    dtype = torch.bfloat16
+
+    model = Linear(hidden_size, hidden_size, params_dtype=dtype)
+    inp = torch.randn(hidden_size, hidden_size, device=torch.cuda.current_device(), dtype=dtype)
+    fp4_recipe = recipe.NVFP4BlockScaling()
+    with autocast(recipe=fp4_recipe):
+        _ = model(inp)

transformer_engine/pytorch/tensor/nvfp4_tensor.py

Lines changed: 17 additions & 15 deletions
@@ -28,9 +28,9 @@
 aten = torch.ops.aten
 
 
-def get_no_random_sign_vector() -> torch.Tensor:
+def get_no_random_sign_vector(device: int) -> torch.Tensor:
     """Non-random sign vector for Hadamard transform."""
-    return torch.tensor([1], dtype=torch.float32, device="cuda")
+    return torch.tensor([1], dtype=torch.float32, device=device)
 
 
 def get_sign_from_vector(vector: torch.Tensor) -> int:
@@ -45,7 +45,7 @@ def get_sign_from_vector(vector: torch.Tensor) -> int:
     return mask.item()
 
 
-def get_wgrad_sign_vector() -> torch.Tensor:
+def get_wgrad_sign_vector(device: int) -> torch.Tensor:
     """Hard-coded random signs for Hadamard transform.
 
     https://xkcd.com/221/
@@ -54,11 +54,11 @@ def get_wgrad_sign_vector() -> torch.Tensor:
     return torch.tensor(
         [1, 1, 1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1],
         dtype=torch.float32,
-        device="cuda",
+        device=device,
     )
 
 
-def get_hadamard_matrix(hadamard_dimension: int) -> torch.Tensor:
+def get_hadamard_matrix(hadamard_dimension: int, device: int) -> torch.Tensor:
     """Construct a 16x16 Hadamard matrix."""
     assert hadamard_dimension == 16, "Only hadamard dimension 16 is supported."
     hadamard_scale = 1 / math.sqrt(hadamard_dimension)
@@ -83,30 +83,30 @@ def get_hadamard_matrix(hadamard_dimension: int) -> torch.Tensor:
                 [1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, -1, 1],
             ],
             dtype=torch.float32,
-            device="cuda",
+            device=device,
         )
         * hadamard_scale
     )
 
 
 @functools.lru_cache(maxsize=None)
-def get_rht_matrix(with_random_sign_mask: bool) -> torch.Tensor:
+def get_rht_matrix(with_random_sign_mask: bool, device: int) -> torch.Tensor:
     """Construct matrix used in random Hadamard transform."""
     hadamard_dimension = 16
     if with_random_sign_mask:
-        signs = get_wgrad_sign_vector()
+        signs = get_wgrad_sign_vector(device=device)
     else:
-        signs = get_no_random_sign_vector()
-    sign_matrix = signs * torch.eye(hadamard_dimension, dtype=torch.float32, device="cuda")
-    rht_matrix = sign_matrix @ get_hadamard_matrix(hadamard_dimension)
+        signs = get_no_random_sign_vector(device=device)
+    sign_matrix = signs * torch.eye(hadamard_dimension, dtype=torch.float32, device=device)
+    rht_matrix = sign_matrix @ get_hadamard_matrix(hadamard_dimension, device=device)
     return rht_matrix.to(dtype=torch.bfloat16)
 
 
 @functools.lru_cache(maxsize=None)
-def get_random_sign_mask_for_rht(with_random_sign_mask: bool) -> int:
+def get_random_sign_mask_for_rht(with_random_sign_mask: bool, device: int) -> int:
     """Sign mask for random Hadamard transform."""
     if with_random_sign_mask:
-        return get_sign_from_vector(get_wgrad_sign_vector())
+        return get_sign_from_vector(get_wgrad_sign_vector(device=device))
     return 0
 
 
@@ -152,8 +152,10 @@ def __init__(
         self.amax_reduction_group = amax_reduction_group
         self.with_2d_quantization = with_2d_quantization
         self.stochastic_rounding = stochastic_rounding
-        self.rht_matrix_random_sign_mask_t = get_random_sign_mask_for_rht(with_random_sign_mask)
-        self.rht_matrix = get_rht_matrix(with_random_sign_mask)
+        self.rht_matrix_random_sign_mask_t = get_random_sign_mask_for_rht(
+            with_random_sign_mask, torch.cuda.current_device()
+        )
+        self.rht_matrix = get_rht_matrix(with_random_sign_mask, torch.cuda.current_device())
 
     def update_quantized(
         self,
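
With device now part of the lru_cache key, each GPU gets its own cached RHT matrix and sign mask, and NVFP4Quantizer looks them up for torch.cuda.current_device(). A minimal sketch of the resulting behaviour, assuming at least two GPUs and that the module imports on your build (signature taken from the diff above):

import torch
from transformer_engine.pytorch.tensor.nvfp4_tensor import get_rht_matrix

m0 = get_rht_matrix(True, 0)           # built and cached on cuda:0
m1 = get_rht_matrix(True, 1)           # separate cache entry, built on cuda:1
assert m0.device.index == 0 and m1.device.index == 1
assert get_rht_matrix(True, 0) is m0   # cache hit for the same (flag, device) pair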

0 commit comments
