9 changes: 7 additions & 2 deletions models/common/rmsnorm.py
```diff
@@ -82,7 +82,7 @@ def __init__(
                 torch_weight,
                 device=device,
                 dtype=weight_dtype,
-                layout=ttnn.ROW_MAJOR_LAYOUT,
+                layout=ttnn.TILE_LAYOUT,
                 memory_config=weight_memory_config,
                 cache_file_name=cache_name,
                 mesh_mapper=ttnn.ReplicateTensorToMesh(device) if is_mesh_device else None,
```
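Context for the layout switch: in ttnn, `TILE_LAYOUT` stores a tensor as 32x32 tiles, padding the last two dimensions up to tile boundaries, whereas `ROW_MAJOR_LAYOUT` keeps the logical shape. A minimal sketch of the padding arithmetic (plain Python; the weight shape is hypothetical):

```python
import math

TILE = 32  # ttnn tiles cover the last two dimensions in 32x32 blocks

def tile_padded_shape(shape):
    # TILE_LAYOUT pads the last two dims up to multiples of 32;
    # ROW_MAJOR_LAYOUT would keep the logical shape unchanged.
    *batch, h, w = shape
    return (*batch, math.ceil(h / TILE) * TILE, math.ceil(w / TILE) * TILE)

# A [1, 1, 1, 4096] RMSNorm weight occupies [1, 1, 32, 4096] once tiled.
print(tile_padded_shape((1, 1, 1, 4096)))  # (1, 1, 32, 4096)
```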
```diff
@@ -93,7 +93,7 @@ def __init__(
                 torch_weight,
                 device=device,
                 dtype=weight_dtype,
-                layout=ttnn.ROW_MAJOR_LAYOUT,
+                layout=ttnn.TILE_LAYOUT,
                 memory_config=weight_memory_config,
                 cache_file_name=cache_name,
                 mesh_mapper=ttnn.ShardTensor2dMesh(device, dims=(None, 2), mesh_shape=list(device.shape))
```
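For the mesh-sharded path, `dims=(None, 2)` asks `ShardTensor2dMesh` to replicate along one mesh axis and shard tensor dimension 2 along the other. A rough torch-only illustration of that placement (the 2x4 mesh shape and the weight shape are made up for the example):

```python
import torch

mesh_rows, mesh_cols = 2, 4            # hypothetical 2x4 device mesh
weight = torch.randn(1, 1, 128, 32)    # hypothetical stacked weight

# dims=(None, 2): replicate along mesh rows, shard dim 2 along mesh columns.
col_shards = torch.chunk(weight, mesh_cols, dim=2)
per_device = [[shard for shard in col_shards] for _ in range(mesh_rows)]

print(per_device[0][0].shape)  # torch.Size([1, 1, 32, 32]) on each device
```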
```diff
@@ -125,6 +125,11 @@ def forward(self, x: ttnn.Tensor, mode, in_sharded=False, out_sharded=False) ->
         else:
             assert not out_sharded, "Non-sharded version of RMSNorm cannot output a sharded tensor"

+        if x.shape[-1] % weight.shape[-1] == 0:
+            # Flatten the weight to [1, 1, 1, -1] only when x's last dimension is a
+            # whole multiple of the weight's, so the reshaped weight broadcasts cleanly
+            # and padding errors in RMSNorm are avoided when dimensions are not aligned.
+            weight = ttnn.reshape(weight, [1, 1, 1, -1])
+
         x = norm(
             x,
             epsilon=self.eps,
```
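A torch-only sketch of the guard added in `forward` (shapes are hypothetical and `maybe_flatten_weight` is not part of the PR): the weight is flattened to `[1, 1, 1, -1]` only when x's last dimension is a whole multiple of the weight's, so the broadcast stays aligned.

```python
import torch

def maybe_flatten_weight(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # Mirrors the new guard: flatten only when the last dims divide evenly;
    # otherwise keep the weight as-is to avoid padding misalignment.
    if x.shape[-1] % weight.shape[-1] == 0:
        return weight.reshape(1, 1, 1, -1)
    return weight

x = torch.randn(1, 1, 32, 4096)
w = torch.randn(1, 1, 128, 32)   # 4096 % 32 == 0, so the weight is flattened
print(maybe_flatten_weight(x, w).shape)  # torch.Size([1, 1, 1, 4096])
```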
277 changes: 0 additions & 277 deletions models/experimental/gemma3_1b/tests/test_attention.py

This file was deleted.
