Map hash_zch_identities to corresponding unique indices in TBE (#5077)

Joey Yang · meta-codesync[bot] · commit d1d26ace1904 · 2025-11-03T21:58:21.000-08:00
Summary: X-link: https://github.com/facebookresearch/FBGEMM/pull/2082 Pull Request resolved: #5077 This change selects the `hash_zch_identities` that corresponds with unique indices during TBE prefetch. This is specifically required for MPZCH tables, which need both the slot index and the corresponding identities for correct lookup behavior. Without the identities, the inference side cannot correctly verify if it's using the correct slot, leading to potential lookup errors. Reviewed By: chouxi Differential Revision: D85999577 fbshipit-source-id: 3c8a4add1dd112e9a746b334e7046bb442ea977b
diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
@@ -200,6 +200,13 @@ class RESParams:
     )  # table sizes for the global rows the TBE holds
 
 
+@dataclass(frozen=True)
+class PrefetchedInfo:  # immutable bundle returned by _get_prefetched_info
+    linear_unique_indices: torch.Tensor  # deduplicated cache indices
+    linear_unique_indices_length: torch.Tensor  # element [0] = unique count
+    hash_zch_identities: Optional[torch.Tensor]  # None when no identities given
+
+
 def construct_split_state(
     embedding_specs: list[tuple[int, int, EmbeddingLocation, ComputeDevice]],
     rowwise: bool,
@@ -2100,6 +2107,12 @@ def forward(  # noqa: C901
                 requires this information for allocating the weight gradient
                 tensor in the backward pass.
 
+            hash_zch_identities (Optional[Tensor]): The original raw IDs before
+                remapping to ZCH (Zero-Collision Hash) table slots. This tensor is
+                populated when using Multi-Probe Zero Collision Hash (MPZCH) modules
+                and is required for Raw Embedding Streaming (RES) to maintain
+                consistency between training and inference.
+
         Returns:
             A 2D-tensor containing looked up data. Shape `(B, total_D)` where `B` =
             batch size and `total_D` = the sum of all embedding dimensions in the
@@ -2217,7 +2230,6 @@ def forward(  # noqa: C901
             # In forward, we don't enable multi-pass prefetch as we want the process
             # to be as fast as possible and memory usage doesn't matter (will be recycled
             # by dense fwd/bwd)
-            # TODO: Properly pass in the hash_zch_identities
             self._prefetch(
                 indices,
                 offsets,
@@ -4140,6 +4152,60 @@ def raw_embedding_stream(self) -> None:
                 False,  # blocking_tensor_copy
             )
 
+    @staticmethod
+    @torch.jit.ignore
+    def _get_prefetched_info(  # dedupe cache indices; align identities to them
+        linear_cache_indices_merged: torch.Tensor,
+        total_cache_hash_size: int,
+        hash_zch_identities: Optional[torch.Tensor],  # None => indices only
+    ) -> PrefetchedInfo:
+        # Counts and inverse indices are only requested when identities exist
+        compute_inverse_indices = hash_zch_identities is not None
+        (
+            linear_unique_indices,
+            linear_unique_indices_length,
+            linear_unique_indices_cnt,
+            linear_unique_inverse_indices,
+        ) = torch.ops.fbgemm.get_unique_indices_with_inverse(
+            linear_cache_indices_merged,
+            total_cache_hash_size,
+            compute_count=compute_inverse_indices,
+            compute_inverse_indices=compute_inverse_indices,
+        )
+        # Deduplicated, sorted result; keep only the first `length[0]` entries
+        linear_unique_indices = linear_unique_indices.narrow(
+            0, 0, linear_unique_indices_length[0]
+        )
+
+        if hash_zch_identities is None:  # nothing to map; return indices only
+            return PrefetchedInfo(
+                linear_unique_indices,
+                linear_unique_indices_length,
+                None,
+            )
+
+        # Cumulative sum of the per-unique-index counts; presumably entry i is
+        # where unique index i's group starts in sorted order - TODO confirm
+        count_cum_sum = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            linear_unique_indices_cnt
+        )
+        count_cum_sum = count_cum_sum.narrow(0, 0, linear_unique_indices_length[0])
+
+        # Pick one inverse index per unique element (its first occurrence)
+        linear_unique_inverse_indices = linear_unique_inverse_indices.index_select(
+            dim=0, index=count_cum_sum
+        )
+
+        # Gather one identity row per unique index, then copy the result to CPU
+        hash_zch_identities_cpu = hash_zch_identities.index_select(
+            dim=0, index=linear_unique_inverse_indices
+        ).to(device=torch.device("cpu"))
+
+        return PrefetchedInfo(
+            linear_unique_indices,
+            linear_unique_indices_length,
+            hash_zch_identities_cpu,
+        )
+
     @torch.jit.ignore
     def _store_prefetched_tensors(
         self,
@@ -4150,35 +4216,26 @@ def _store_prefetched_tensors(
         NOTE: this needs to be a method with jit.ignore as the identities tensor is conditional.
         This function stores the prefetched tensors for the raw embedding streaming.
         """
-        if self.enable_raw_embedding_streaming:
-            with record_function(
-                "## uvm_save_prefetched_rows {} {} ##".format(self.timestep, self.uuid)
-            ):
+        if not self.enable_raw_embedding_streaming:
+            return
+
+        with record_function(
+            "## uvm_save_prefetched_rows {} {} ##".format(self.timestep, self.uuid)
+        ):
+            # Deduplicate the cache indices and select the matching hash_zch_identities
+            prefetched_info = self._get_prefetched_info(
+                linear_cache_indices_merged,
+                self.total_cache_hash_size,
+                hash_zch_identities,
+            )
+
+            self.prefetched_info.append(
                 (
-                    linear_unique_indices,
-                    linear_unique_indices_length,
-                    _,
-                ) = torch.ops.fbgemm.get_unique_indices(
-                    linear_cache_indices_merged,
-                    self.total_cache_hash_size,
-                    compute_count=False,
-                )
-                linear_unique_indices = linear_unique_indices.narrow(
-                    0, 0, linear_unique_indices_length[0]
-                )
-                self.prefetched_info.append(
-                    (
-                        linear_unique_indices,
-                        linear_unique_indices_length,
-                        (
-                            hash_zch_identities.index_select(
-                                dim=0, index=linear_unique_indices
-                            ).to(device=torch.device("cpu"))
-                            if hash_zch_identities is not None
-                            else None
-                        ),
-                    )
+                    prefetched_info.linear_unique_indices,
+                    prefetched_info.linear_unique_indices_length,
+                    prefetched_info.hash_zch_identities,
                 )
+            )
 
     @torch.jit.ignore
     def __report_input_params_factory(
diff --git a/fbgemm_gpu/test/tbe/training/store_prefetched_tensors_test.py b/fbgemm_gpu/test/tbe/training/store_prefetched_tensors_test.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import unittest
+
+import torch
+
+from fbgemm_gpu.split_table_batched_embeddings_ops_training import (
+    SplitTableBatchedEmbeddingBagsCodegen,
+)
+
+from ..common import open_source
+
+if open_source:
+    # pyre-ignore[21]
+    from test_utils import gpu_unavailable
+else:
+    from fbgemm_gpu.test.test_utils import gpu_unavailable
+
+
+class StorePrefetchedTensorsTest(unittest.TestCase):  # _get_prefetched_info tests
+    @unittest.skipIf(*gpu_unavailable)
+    def test_get_prefetched_info(self) -> None:  # all cache indices distinct
+        hash_zch_identities = torch.tensor(
+            [
+                [3350213393928437575],  # for index 54
+                [6548733451892409412],  # for index 27
+                [4126118985661274454],  # for index 43
+                [2565973416302224539],  # for index 90
+            ],
+            device=torch.cuda.current_device(),
+            dtype=torch.int64,
+        )
+        total_cache_hash_size = 100
+        linear_cache_indices_merged = torch.tensor(
+            [54, 27, 43, 90],
+            device=torch.cuda.current_device(),
+            dtype=torch.int64,
+        )
+
+        prefetched_info = SplitTableBatchedEmbeddingBagsCodegen._get_prefetched_info(
+            linear_cache_indices_merged,
+            total_cache_hash_size,
+            hash_zch_identities,
+        )
+
+        self.assertEqual(
+            [27, 43, 54, 90],  # the input indices in ascending order
+            prefetched_info.linear_unique_indices.tolist(),
+        )
+        self.assertEqual(
+            prefetched_info.linear_unique_indices_length[0].item(),
+            4,
+        )
+        assert prefetched_info.hash_zch_identities is not None  # presumably for pyre
+        self.assertEqual(
+            prefetched_info.hash_zch_identities.shape[0],
+            4,
+        )
+        self.assertEqual(
+            [
+                [6548733451892409412],  # for index 27
+                [4126118985661274454],  # for index 43
+                [3350213393928437575],  # for index 54
+                [2565973416302224539],  # for index 90
+            ],
+            prefetched_info.hash_zch_identities.tolist(),
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    def test_get_prefetched_info_with_duplicate_hash_zch_identities(self) -> None:
+        """
+        Test that duplicate cache indices are correctly deduplicated.
+        When the same cache index appears multiple times with the same identity,
+        only the first occurrence should be kept in the output.
+        """
+        hash_zch_identities = torch.tensor(
+            [
+                [3350213393928437575],  # for index 54 (first occurrence)
+                [6548733451892409412],  # for index 27
+                [3350213393928437575],  # for index 54 (duplicate - same identity)
+                [4126118985661274454],  # for index 43
+                [6548733451892409412],  # for index 27 (duplicate - same identity)
+                [3350213393928437575],  # for index 54 (duplicate - same identity)
+                [2565973416302224539],  # for index 90
+            ],
+            device=torch.cuda.current_device(),
+            dtype=torch.int64,
+        )
+        total_cache_hash_size = 100
+        linear_cache_indices_merged = torch.tensor(
+            [54, 27, 54, 43, 27, 54, 90],  # Duplicates: 54 appears 3x, 27 appears 2x
+            device=torch.cuda.current_device(),
+            dtype=torch.int64,
+        )
+
+        prefetched_info = SplitTableBatchedEmbeddingBagsCodegen._get_prefetched_info(
+            linear_cache_indices_merged,
+            total_cache_hash_size,
+            hash_zch_identities,
+        )
+
+        self.assertEqual(
+            [27, 43, 54, 90],  # duplicates collapsed, ascending order
+            prefetched_info.linear_unique_indices.tolist(),
+        )
+        self.assertEqual(
+            prefetched_info.linear_unique_indices_length[0].item(),
+            4,
+        )
+        assert prefetched_info.hash_zch_identities is not None  # presumably for pyre
+        self.assertEqual(
+            prefetched_info.hash_zch_identities.shape[0],
+            4,
+        )
+        self.assertEqual(
+            [
+                [6548733451892409412],  # for index 27
+                [4126118985661274454],  # for index 43
+                [3350213393928437575],  # for index 54
+                [2565973416302224539],  # for index 90
+            ],
+            prefetched_info.hash_zch_identities.tolist(),
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()