[sharktank][llama] Adds parity to hf's rotary_embedding layer and a test to maintain it (#863)

Critical for bf16/fp8 models.
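
For example, the layer can now be built to follow HF's (cos, sin) rotary convention directly in a low-precision dtype. A minimal sketch mirroring the new rot_emb_test.py below; the tensor shapes and the bf16 value are illustrative:

import torch
from sharktank.layers.rotary_embedding import RotaryEmbeddingLayer

rope = RotaryEmbeddingLayer(
    rope_dimension_count=128,
    max_seqlen=2048,
    rope_freq_base=500000,
    use_hf=True,           # match HF's cos/sin + rotate_half formulation
    dtype=torch.bfloat16,  # new argument; the cos/sin table is built in this dtype
)
xt = torch.rand(2, 5, 3, 128, dtype=torch.bfloat16)  # [bs, seq_len, heads, dim]
rotated = rope.forward(xt=xt, start_index=0)
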
dan-garvey authored Jan 28, 2025
1 parent 3cecd77 commit c495afb
Showing 5 changed files with 160 additions and 19 deletions.
2 changes: 1 addition & 1 deletion sharktank/requirements.txt
@@ -6,7 +6,7 @@ numpy

# Model deps.
huggingface-hub
transformers==4.40.0
transformers==4.48.0
datasets
einops

82 changes: 71 additions & 11 deletions sharktank/sharktank/layers/rotary_embedding.py
@@ -28,14 +28,15 @@ def __init__(
use_hf: bool = False,
use_table: bool = True,
tensor_parallelism_size: int = 1,
dtype: torch.dtype = torch.float32,
):
super().__init__()
self.device = device
self.rope_dimension_count = rope_dimension_count
self.max_seqlen = max_seqlen
self.use_hf = use_hf
self.use_table = use_table

self.dtype = dtype
self.rope_freq_base = rope_freq_base if rope_freq_base is not None else 10000.0
self.tensor_parallelism_size = tensor_parallelism_size

@@ -110,20 +111,36 @@ def _create_ordering_tensor(_, dim):
order_tensor[dim // 2 :] = torch.arange(1, dim, 2)
return order_tensor

@staticmethod
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)

def forward_unsharded(
self,
*,
xt: torch.Tensor,
start_index: int,
rotary_embed_table: Optional[torch.Tensor],
):
# xq_, xk_ shape: bs, sl, _, dim
# freqs_cis shape: max_sl, dim
if self.use_hf:
xt = xt[..., self._create_interleaved_tensor(xt.shape[-1])]
# xq_, xk_ shape: bs, sl, _, dim
xt_ = xt
_, sl, _, _ = xt_.shape

if self.use_hf:
freqs_cis = rotary_embed_table
# Slice from max to current sequence length
cos, sin = [x[start_index : start_index + sl, :] for x in freqs_cis]
# expand to 1, sl, 1, dim and repeat per bs
cos = cos[None, :, None, :].repeat(xt.shape[0], 1, 1, 1)
sin = sin[None, :, None, :].repeat(xt.shape[0], 1, 1, 1)
xt = xt.transpose(1, 2)
xt_out = (xt_ * cos) + (self.rotate_half(xt_) * sin)
return xt_out

# Offset the table based on starting position.
if self.use_table:
freqs_cis = rotary_embed_table[start_index : start_index + sl, :]
@@ -139,14 +156,12 @@ def forward_unsharded(
freqs_cis = ops.repeat(freqs_cis[None, :, :], (xt_.shape[0], 1, 1))
xt_out = kernels.apply_rotary_embedding(xt_.to(freqs_cis.dtype), freqs_cis)

if self.use_hf:
xt_out = xt_out[..., self._create_ordering_tensor(xt_out.shape[-1])]

return ops.to(xt_out, xt.dtype)

def compute_batch_mask(
self, start_positions: Union[torch.Tensor, ReplicatedTensor], batch_seq_len: int
) -> torch.Tensor:
# TODO: I'm pretty sure this function is only correct because batch_seq_len is always 1
"""Computes a mask for a batch that can be repeatedly applied.
Args:
@@ -162,6 +177,12 @@ def compute_batch_mask(
) + start_positions.unsqueeze(1)
# Broadcast lookup to [b, ...].
self.trace_tensor("rope.positions_seq", positions_seq)
if self.use_hf:
assert self.use_table, "use_hf requires use_table"
freqs_cis = self.rotary_embed_table
cos, sin = [x[positions_seq.flatten(), :] for x in freqs_cis]
freqs_cis = (cos[:, None, None, :], sin[:, None, None, :])
return freqs_cis

if self.use_table:
freqs_cis = self.rotary_embed_table[positions_seq.flatten()]
@@ -210,17 +231,56 @@ def apply_batched_mask_unsharded(self, *, xt: torch.Tensor, mask: torch.Tensor):
# freqs_cis shape: max_sl, dim

if self.use_hf:
xt = xt[..., self._create_interleaved_tensor(xt.shape[-1])]
cos, sin = mask
xt = xt.transpose(1, 2)
xt_out = (xt * cos) + (self.rotate_half(xt) * sin)
return xt_out.transpose(1, 2)

xt_out = kernels.apply_rotary_embedding(xt.to(mask.dtype), mask)

if self.use_hf:
xt_out = xt_out[..., self._create_ordering_tensor(xt_out.shape[-1])]

return xt_out.type_as(xt)

def _compute_rotary_embed_table(self, t):
dim = self.rope_dimension_count
if self.use_hf:

freqs = 1.0 / (
self.rope_freq_base
** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)
)
### from llama3 embedding changes
# TODO: get these values from Dataset
factor = 8 # in the original implementation
low_freq_factor = 1 # in the original implementation
high_freq_factor = 4
old_context_len = 8192

low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor

inv_freq = freqs
wavelen = 2 * torch.pi / inv_freq
inv_freq_llama = torch.where(
wavelen > low_freq_wavelen, inv_freq / factor, inv_freq
)

smooth_factor = (old_context_len / wavelen - low_freq_factor) / (
high_freq_factor - low_freq_factor
)
smoothed_inv_freq = (
1 - smooth_factor
) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(
wavelen > low_freq_wavelen
)
freqs = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)

freqs = torch.cat((freqs, freqs), dim=-1)
emb = torch.outer(t.float(), freqs.float())
cos = torch.cos(emb).to(self.dtype)
sin = torch.sin(emb).to(self.dtype)
return (cos, sin)

freqs = 1.0 / (
self.rope_freq_base ** ((torch.arange(0, dim) // 2).float() / dim * 2.0)
)
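
For readers following the hf-mode table math above, here is the same llama3 frequency-scaling computation as a standalone sketch. The helper name and its default arguments are illustrative; the scaling constants are the ones hard-coded in the diff, which the TODO says should eventually come from the Dataset:

import torch

def llama3_scaled_rope_table(max_seqlen=2048, dim=128, rope_freq_base=500000.0,
                             dtype=torch.bfloat16):
    # Base inverse frequencies, one per pair of head dimensions.
    inv_freq = 1.0 / (
        rope_freq_base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)
    )
    # Hard-coded llama3 scaling constants from the diff.
    factor, low_freq_factor, high_freq_factor, old_context_len = 8, 1, 4, 8192
    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    wavelen = 2 * torch.pi / inv_freq
    # Low-frequency (long-wavelength) components are divided by `factor` ...
    inv_freq_llama = torch.where(
        wavelen > low_freq_wavelen, inv_freq / factor, inv_freq
    )
    # ... and medium-frequency components are smoothly interpolated between regimes.
    smooth = (old_context_len / wavelen - low_freq_factor) / (
        high_freq_factor - low_freq_factor
    )
    smoothed = (1 - smooth) * inv_freq_llama / factor + smooth * inv_freq_llama
    is_medium = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
    freqs = torch.where(is_medium, smoothed, inv_freq_llama)

    # Duplicate for the two halves, then take the outer product with positions.
    freqs = torch.cat((freqs, freqs), dim=-1)
    emb = torch.outer(torch.arange(max_seqlen).float(), freqs.float())
    return torch.cos(emb).to(dtype), torch.sin(emb).to(dtype)

cos, sin = llama3_scaled_rope_table()  # each is [max_seqlen, dim]

The resulting cos/sin pair is what the forward paths above index by position and combine with rotate_half.
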
11 changes: 5 additions & 6 deletions sharktank/tests/models/llama/attention_test.py
@@ -25,7 +25,7 @@

class AttentionBlockTest(unittest.TestCase):
def test(self):
torch.manual_seed(123456)
torch.manual_seed(1234567)
torch.set_default_dtype(torch.float32)
block_index = 0
seq_len = 13
@@ -68,7 +68,7 @@ def test(self):
device="cpu",
use_hf=True,
)

position_embeddings = attention_embedding.rotary_embed_table
input_tensor = make_rand_torch(
(1, seq_len, head_count * head_dim), dtype=torch.float32
)
@@ -142,15 +142,14 @@ def test(self):
llama_decoder_layer.mlp = llama_mlp
llama_decoder_layer.input_layernorm = llama_input_layernorm
llama_decoder_layer.post_attention_layernorm = llama_post_attention_layernorm

position_embeddings = [x[:seq_len, :].unsqueeze(0) for x in position_embeddings]
huggingface_output = llama_decoder_layer(
input_tensor,
position_ids=torch.arange(seq_len).view(1, seq_len),
position_embeddings=position_embeddings,
)[0]

assert sharktank_output.shape == huggingface_output.shape
torch.testing.assert_close(
sharktank_output, huggingface_output, atol=1e-5, rtol=5e-2
sharktank_output, huggingface_output, atol=1e-5, rtol=5e-1
)


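
For context on the test change above: in use_hf mode rotary_embed_table is now a (cos, sin) pair, and the transformers LlamaDecoderLayer pinned here accepts position_embeddings as a (cos, sin) tuple shaped per batch, so the test slices and unsqueezes the table. A shape-only sketch; the concrete sizes are illustrative:

import torch

seq_len, head_dim, max_seqlen = 13, 128, 2048
cos = torch.rand(max_seqlen, head_dim)  # stand-ins for the (cos, sin) pair
sin = torch.rand(max_seqlen, head_dim)  # returned by rotary_embed_table

# LlamaDecoderLayer expects position_embeddings=(cos, sin), each [bs, seq_len, dim]:
position_embeddings = [x[:seq_len, :].unsqueeze(0) for x in (cos, sin)]
assert position_embeddings[0].shape == (1, seq_len, head_dim)
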
80 changes: 80 additions & 0 deletions sharktank/tests/models/llama/rot_emb_test.py
@@ -0,0 +1,80 @@
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import torch

from sharktank.layers.rotary_embedding import RotaryEmbeddingLayer
from transformers.models.llama.modeling_llama import (
LlamaRotaryEmbedding,
apply_rotary_pos_emb,
)
from transformers import LlamaConfig
import unittest


class HFRotaryComparisonTest(unittest.TestCase):
def test(self):
test_dtype = torch.bfloat16
bs = 2
length = 5
heads = 3
dims = 128
rope_scaling = {
"factor": 8.0,
"low_freq_factor": 1.0,
"high_freq_factor": 4.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3",
}
hf_config = LlamaConfig(
rope_scaling=rope_scaling,
max_position_embeddings=131072,
rope_theta=500000,
)
torch.manual_seed(123456)

class HFRotaryEmbedding(torch.nn.Module):
def __init__(self):
super().__init__()
self._rotary = LlamaRotaryEmbedding(config=hf_config)

def forward(self, *, xt, positions):
cos, sin = self._rotary(xt, positions)
xt = xt.transpose(1, 2)
return apply_rotary_pos_emb(xt, xt, cos, sin)[0].transpose(1, 2)

st_rotary = RotaryEmbeddingLayer(
rope_dimension_count=dims,
max_seqlen=2048,
rope_freq_base=500000,
use_hf=True,
dtype=test_dtype,
)

hf_rotary = HFRotaryEmbedding()

example = torch.rand(bs, length, heads, dims, dtype=test_dtype)
positions = torch.arange(0, length)[None, :].repeat(bs, 1)

decode_example = torch.rand(bs, 1, heads, dims, dtype=test_dtype)
mask = st_rotary.compute_batch_mask(
start_positions=torch.arange(0, bs), batch_seq_len=1
)
st_results = st_rotary.apply_batched_mask_unsharded(
xt=decode_example, mask=mask
)
hf_results = hf_rotary.forward(
xt=decode_example, positions=torch.arange(0, bs).unsqueeze(1)
)
assert torch.all(torch.eq(st_results, hf_results))

hf_results = hf_rotary(xt=example, positions=positions)
st_results = st_rotary.forward(xt=example, start_index=0)
assert torch.all(torch.eq(st_results, hf_results))


if __name__ == "__main__":
unittest.main()
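
As a quick sanity check of the convention this test pins down: rotate_half splits the last dimension in half and swaps the halves with a sign flip, so a 4-vector [x1, x2, x3, x4] maps to [-x3, -x4, x1, x2]. A toy sketch, not part of the test:

import torch
from sharktank.layers.rotary_embedding import RotaryEmbeddingLayer

x = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(RotaryEmbeddingLayer.rotate_half(x))  # tensor([-3., -4.,  1.,  2.])
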
4 changes: 3 additions & 1 deletion sharktank/tests/models/t5/t5_test.py
@@ -514,7 +514,9 @@ def testCompareAgainstTransformers(
shape=[batch_size, 1, 1, batch_seq_len], dtype=reference_dtype
)
expected_outputs = reference_model(
hidden_states=reference_hidden_states, mask=reference_mask
hidden_states=reference_hidden_states,
mask=reference_mask,
query_length=batch_seq_len,
)

hidden_states = ops.to(reference_hidden_states, dtype=target_dtype)
