
Commit f8f3264

Add ability to run Gemma 2 models without post attention norm and post feedforward norm
1 parent afd1393 · commit f8f3264

File tree: 5 files changed (+47, −8 lines)

src/transformers/models/gemma2/configuration_gemma2.py

Lines changed: 9 additions & 0 deletions

@@ -88,6 +88,11 @@ class Gemma2Config(PretrainedConfig):
             scaling factor when applying tanh softcapping on the logits.
         attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
             scaling factor when applying tanh softcapping on the attention scores.
+        use_post_attention_norm (`bool`, *optional*, defaults to `True`):
+            whether to use a post attention layer normalization layer.
+        use_post_feedforward_norm (`bool`, *optional*, defaults to `True`):
+            whether to use a post feedforward layer normalization layer.
+

     ```python
     >>> from transformers import Gemma2Model, Gemma2Config
@@ -142,6 +147,8 @@ def __init__(
         layer_types=None,
         final_logit_softcapping=30.0,
         attn_logit_softcapping=50.0,
+        use_post_attention_norm=True,
+        use_post_feedforward_norm=True,
         **kwargs,
     ):
         super().__init__(
@@ -170,6 +177,8 @@ def __init__(
         self.sliding_window = sliding_window
         self.final_logit_softcapping = final_logit_softcapping
         self.attn_logit_softcapping = attn_logit_softcapping
+        self.use_post_attention_norm = use_post_attention_norm
+        self.use_post_feedforward_norm = use_post_feedforward_norm
         self.layer_types = layer_types

         if self.layer_types is None:
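For context, a minimal usage sketch (not part of the commit) showing a config built with both new flags turned off; the small sizes and `vocab_size` below are placeholder values, not a real Gemma 2 checkpoint:

```python
from transformers import Gemma2Config

# Placeholder sizes for illustration only; the two use_post_*_norm flags are
# the only arguments specific to this commit.
config = Gemma2Config(
    vocab_size=1000,
    hidden_size=256,
    intermediate_size=512,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=2,
    head_dim=64,
    use_post_attention_norm=False,    # skip the post-attention RMSNorm
    use_post_feedforward_norm=False,  # skip the post-feedforward RMSNorm
)
print(config.use_post_attention_norm, config.use_post_feedforward_norm)  # False False
```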

src/transformers/models/gemma2/modeling_gemma2.py

Lines changed: 8 additions & 4 deletions

@@ -248,10 +248,12 @@ def __init__(self, config: Gemma2Config, layer_idx: int):
         self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
         self.mlp = Gemma2MLP(config)
         self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        if self.config.use_post_attention_norm:
+            self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

         self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        if self.config.use_post_feedforward_norm:
+            self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

     @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
     def forward(
@@ -282,13 +284,15 @@ def forward(
             cache_position=cache_position,
             **kwargs,
         )
-        hidden_states = self.post_attention_layernorm(hidden_states)
+        if self.config.use_post_attention_norm:
+            hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = residual + hidden_states

         residual = hidden_states
         hidden_states = self.pre_feedforward_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
-        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        if self.config.use_post_feedforward_norm:
+            hidden_states = self.post_feedforward_layernorm(hidden_states)
         hidden_states = residual + hidden_states

         outputs = (hidden_states,)
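To make the control flow above easier to follow, here is a standalone sketch (illustrative only, not the Transformers implementation) of the residual sub-layer pattern being made optional; `nn.LayerNorm` and `nn.Linear` stand in for `Gemma2RMSNorm` and the attention/MLP blocks:

```python
import torch
from torch import nn


class SketchSubLayer(nn.Module):
    """Pre-norm -> block -> (optional) post-norm -> residual add."""

    def __init__(self, hidden_size: int, use_post_norm: bool = True):
        super().__init__()
        self.pre_norm = nn.LayerNorm(hidden_size)         # stand-in for Gemma2RMSNorm
        self.block = nn.Linear(hidden_size, hidden_size)  # stand-in for attention or MLP
        self.use_post_norm = use_post_norm
        if use_post_norm:  # mirrors the guarded module creation in __init__ above
            self.post_norm = nn.LayerNorm(hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        h = self.block(self.pre_norm(x))
        if self.use_post_norm:  # mirrors config.use_post_attention_norm / use_post_feedforward_norm
            h = self.post_norm(h)
        return residual + h


# With use_post_norm=False, no post-norm module is registered and the block
# output is added to the residual stream unnormalized.
layer = SketchSubLayer(hidden_size=16, use_post_norm=False)
out = layer(torch.randn(2, 4, 16))
```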

src/transformers/models/gemma2/modular_gemma2.py

Lines changed: 17 additions & 4 deletions

@@ -112,6 +112,11 @@ class Gemma2Config(PretrainedConfig):
             scaling factor when applying tanh softcapping on the logits.
         attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
             scaling factor when applying tanh softcapping on the attention scores.
+        use_post_attention_norm (`bool`, *optional*, defaults to `True`):
+            whether to use a post attention layer normalization layer.
+        use_post_feedforward_norm (`bool`, *optional*, defaults to `True`):
+            whether to use a post feedforward layer normalization layer.
+

     ```python
     >>> from transformers import Gemma2Model, Gemma2Config
@@ -166,6 +171,8 @@ def __init__(
         layer_types=None,
         final_logit_softcapping=30.0,
         attn_logit_softcapping=50.0,
+        use_post_attention_norm=True,
+        use_post_feedforward_norm=True,
         **kwargs,
     ):
         super().__init__(
@@ -194,6 +201,8 @@ def __init__(
         self.sliding_window = sliding_window
         self.final_logit_softcapping = final_logit_softcapping
         self.attn_logit_softcapping = attn_logit_softcapping
+        self.use_post_attention_norm = use_post_attention_norm
+        self.use_post_feedforward_norm = use_post_feedforward_norm
         self.layer_types = layer_types

         if self.layer_types is None:
@@ -313,10 +322,12 @@ def __init__(self, config: Gemma2Config, layer_idx: int):
         self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
         self.mlp = Gemma2MLP(config)
         self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        if self.config.use_post_attention_norm:
+            self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

         self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        if self.config.use_post_feedforward_norm:
+            self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

     @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
     def forward(
@@ -347,13 +358,15 @@ def forward(
             cache_position=cache_position,
             **kwargs,
         )
-        hidden_states = self.post_attention_layernorm(hidden_states)
+        if self.config.use_post_attention_norm:
+            hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = residual + hidden_states

         residual = hidden_states
         hidden_states = self.pre_feedforward_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
-        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        if self.config.use_post_feedforward_norm:
+            hidden_states = self.post_feedforward_layernorm(hidden_states)
         hidden_states = residual + hidden_states

         outputs = (hidden_states,)
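As an illustrative sanity check (not part of the commit), instantiating a randomly initialized model from such a config should show that the guarded modules are simply never registered; the tiny sizes below are placeholders:

```python
from transformers import Gemma2Config, Gemma2Model

config = Gemma2Config(
    vocab_size=1000,
    hidden_size=256,
    intermediate_size=512,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=2,
    head_dim=64,
    use_post_attention_norm=False,
    use_post_feedforward_norm=False,
)
model = Gemma2Model(config)

layer = model.layers[0]
print(hasattr(layer, "post_attention_layernorm"))    # expected: False
print(hasattr(layer, "post_feedforward_layernorm"))  # expected: False
```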

src/transformers/models/t5gemma/configuration_t5gemma.py

Lines changed: 9 additions & 0 deletions

@@ -90,6 +90,11 @@ class T5GemmaModuleConfig(PretrainedConfig):
             scaling factor when applying tanh softcapping on the logits.
         attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
             scaling factor when applying tanh softcapping on the attention scores.
+        use_post_attention_norm (`bool`, *optional*, defaults to `True`):
+            whether to use a post attention layer normalization layer.
+        use_post_feedforward_norm (`bool`, *optional*, defaults to `True`):
+            whether to use a post feedforward layer normalization layer.
+

     ```python
     >>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
@@ -144,6 +149,8 @@ def __init__(
         layer_types=None,
         final_logit_softcapping=30.0,
         attn_logit_softcapping=50.0,
+        use_post_attention_norm=True,
+        use_post_feedforward_norm=True,
         **kwargs,
     ):
         super().__init__(
@@ -172,6 +179,8 @@ def __init__(
         self.sliding_window = sliding_window
         self.final_logit_softcapping = final_logit_softcapping
         self.attn_logit_softcapping = attn_logit_softcapping
+        self.use_post_attention_norm = use_post_attention_norm
+        self.use_post_feedforward_norm = use_post_feedforward_norm
         self.layer_types = layer_types

         if self.layer_types is None:

src/transformers/models/t5gemma/modular_t5gemma.py

Lines changed: 4 additions & 0 deletions

@@ -166,13 +166,17 @@ def __init__(
         encoder.is_decoder = False
         encoder.dropout_rate = dropout_rate
         encoder.attention_dropout = attention_dropout
+        encoder.use_post_attention_norm = True
+        encoder.use_post_feedforward_norm = True
         self.encoder = encoder

         decoder.is_decoder = True
         decoder.use_cache = True
         decoder.dropout_rate = dropout_rate
         decoder.attention_dropout = attention_dropout
         decoder.cross_attention_hidden_size = encoder.hidden_size
+        decoder.use_post_attention_norm = True
+        decoder.use_post_feedforward_norm = True
         self.decoder = decoder

         for special_token_key in ["bos_token_id", "pad_token_id", "eos_token_id"]:
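The two overrides above mean T5Gemma keeps both post norms enabled on its encoder and decoder regardless of what the module configs request. An illustrative sketch, assuming `T5GemmaConfig` accepts `T5GemmaModuleConfig` instances for `encoder` and `decoder` (as the attribute assignments above suggest); the small sizes are placeholders:

```python
from transformers.models.t5gemma.configuration_t5gemma import (
    T5GemmaConfig,
    T5GemmaModuleConfig,
)


def tiny_module_config():
    # Placeholder sizes; both flags deliberately disabled to show the override.
    return T5GemmaModuleConfig(
        vocab_size=1000,
        hidden_size=256,
        intermediate_size=512,
        num_hidden_layers=2,
        num_attention_heads=4,
        num_key_value_heads=2,
        head_dim=64,
        use_post_attention_norm=False,
        use_post_feedforward_norm=False,
    )


config = T5GemmaConfig(encoder=tiny_module_config(), decoder=tiny_module_config())
print(config.encoder.use_post_attention_norm)    # expected: True (forced in __init__)
print(config.decoder.use_post_feedforward_norm)  # expected: True (forced in __init__)
```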
