284284 {"target_modules" : ["conv1d" ], "r" : 2 , "use_effective_conv2d" : False },
285285 ),
286286 ("Conv2d 1x1 LOHA" , "Conv2d1x1" , LoHaConfig , {"target_modules" : ["conv2d" ]}),
287+ ########
288+ # ABBA #
289+ ########
290+ # ABBA tests are included in main TEST_CASES for basic functionality
291+ # Note: ABBA uses SVD-based initialization, so parameters are non-zero from start
292+ ("Vanilla MLP 1 ABBA" , "MLP" , LoHaConfig , {"target_modules" : "lin0" , "init_weights" : "abba" }),
293+ ("Vanilla MLP 2 ABBA" , "MLP" , LoHaConfig , {"target_modules" : ["lin0" ], "init_weights" : "abba" }),
294+ (
295+ "Vanilla MLP 3 ABBA" ,
296+ "MLP" ,
297+ LoHaConfig ,
298+ {
299+ "target_modules" : ["lin0" ],
300+ "alpha" : 4 ,
301+ "module_dropout" : 0.1 ,
302+ "init_weights" : "abba" ,
303+ },
304+ ),
305+ ("Vanilla MLP 4 ABBA" , "MLP" , LoHaConfig , {"target_modules" : "lin0" , "rank_dropout" : 0.5 , "init_weights" : "abba" }),
306+ (
307+ "Vanilla MLP 5 ABBA with Khatri-Rao" ,
308+ "MLP" ,
309+ LoHaConfig ,
310+ {"target_modules" : ["lin0" ], "init_weights" : "abba" , "use_khatri_rao" : True },
311+ ),
312+ ("Conv2d 1 ABBA" , "Conv2d" , LoHaConfig , {"target_modules" : ["conv2d" ], "init_weights" : "abba" }),
313+ ("Conv1d ABBA 1" , "Conv1d" , LoHaConfig , {"target_modules" : ["conv1d" ], "init_weights" : "abba" }),
314+ ("Conv1d ABBA 2" , "Conv1d" , LoHaConfig , {"target_modules" : ["conv1d" ], "r" : 2 , "init_weights" : "abba" }),
315+ (
316+ "Conv1d ABBA 3" ,
317+ "Conv1dBigger" ,
318+ LoHaConfig ,
319+ {"target_modules" : ["conv1d" ], "r" : 2 , "init_weights" : "abba" },
320+ ),
321+ ("Conv2d 1x1 ABBA" , "Conv2d1x1" , LoHaConfig , {"target_modules" : ["conv2d" ], "init_weights" : "abba" }),
287322 # LoKr
288323 ("Vanilla MLP 1 LOKR" , "MLP" , LoKrConfig , {"target_modules" : "lin0" }),
289324 ("Vanilla MLP 2 LOKR" , "MLP" , LoKrConfig , {"target_modules" : ["lin0" ]}),
@@ -2044,6 +2079,13 @@ def test_parameters_after_loading_model(self, test_name, model_id, config_cls, c
20442079 lr = 1e-3 # we get exploding gradients with MHA when learning rate is too high
20452080 elif issubclass (config_cls , VBLoRAConfig ) or issubclass (config_cls , RandLoraConfig ):
20462081 lr = 0.01 # otherwise we get nan
2082+ elif config_kwargs .get ("init_weights" ) == "abba" :
2083+ # ABBA starts closer to pretrained, use gentler updates than standard (0.5)
2084+ # Conv layers with ABBA need much lower LR due to Hadamard product amplification
2085+ if model_id in ["Conv1d" , "Conv1dBigger" , "Conv2d" , "Conv2d1x1" ]:
2086+ lr = 0.01 # Very low LR to prevent exploding gradients with Hadamard products
2087+ else :
2088+ lr = 0.1
20472089 optimizer = torch .optim .SGD (model .parameters (), lr = lr )
20482090
20492091 # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry
@@ -2093,7 +2135,9 @@ def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs):
20932135 torch .nn .init .zeros_ (model .vblora_vector_bank ["default" ])
20942136 model .eval ()
20952137 outputs_before = model (** X )
2096- assert torch .allclose (outputs_base , outputs_before )
2138+ # ABBA uses SVD initialization, so outputs won't match base model initially - skip that assertion
2139+ if config_kwargs .get ("init_weights" ) != "abba" :
2140+ assert torch .allclose (outputs_base , outputs_before )
20972141
20982142 if issubclass (config_cls , VBLoRAConfig ):
20992143 # initialize `vblora_vector_bank` so it can be trained
@@ -2131,7 +2175,12 @@ def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs):
21312175 else :
21322176 rtol , atol = 1e-5 , 1e-8
21332177 assert not torch .allclose (outputs_before , outputs_after , rtol = rtol , atol = atol )
2134- assert torch .allclose (outputs_before , outputs_disabled )
2178+ # For ABBA: outputs_before != outputs_disabled because ABBA uses non-zero init
2179+ # But outputs_disabled should equal base model for both ABBA and others
2180+ if config_kwargs .get ("init_weights" ) == "abba" :
2181+ assert torch .allclose (outputs_base , outputs_disabled )
2182+ else :
2183+ assert torch .allclose (outputs_before , outputs_disabled )
21352184 assert torch .allclose (outputs_after , outputs_enabled_after_disable )
21362185
21372186 @pytest .mark .parametrize ("test_name, model_id, config_cls, config_kwargs" , TEST_CASES )
@@ -2147,6 +2196,8 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co
21472196 # same as test_disable_adapters, but with merging
21482197 X = self .prepare_inputs_for_testing ()
21492198 model = self .transformers_class .from_pretrained (model_id ).to (self .torch_device )
2199+ model .eval ()
2200+ outputs_base = model (** X ) # Save base model outputs for ABBA comparison
21502201 config = config_cls (
21512202 base_model_name_or_path = model_id ,
21522203 ** config_kwargs ,
@@ -2169,6 +2220,9 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co
21692220 else :
21702221 # Adam optimizer since SGD isn't great for small models with IA3 + Conv1D
21712222 lr = 0.01
2223+ # ABBA Conv layers need lower learning rate to prevent gradient explosion
2224+ if config_kwargs .get ("init_weights" ) == "abba" and model_id in ["Conv1d" , "Conv1dBigger" , "Conv2d" , "Conv2d1x1" ]:
2225+ lr = 0.001 # Very low LR for ABBA Conv with Adam
21722226 optimizer = torch .optim .Adam (model .parameters (), lr = lr )
21732227
21742228 # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry
@@ -2214,7 +2268,12 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co
22142268 assert torch .allclose (outputs_after , outputs_unmerged , atol = atol , rtol = rtol )
22152269
22162270 # check that disabling adapters gives the same results as before training
2217- assert torch .allclose (outputs_before , outputs_disabled , atol = atol , rtol = rtol )
2271+ # For ABBA: outputs_before != outputs_disabled because ABBA uses non-zero init
2272+ # But outputs_disabled should equal base model for both ABBA and others
2273+ if config_kwargs .get ("init_weights" ) == "abba" :
2274+ assert torch .allclose (outputs_base , outputs_disabled , atol = atol , rtol = rtol )
2275+ else :
2276+ assert torch .allclose (outputs_before , outputs_disabled , atol = atol , rtol = rtol )
22182277
22192278 # check that enabling + disabling adapters does not change the results
22202279 assert torch .allclose (outputs_after , outputs_enabled_after_disable , atol = atol , rtol = rtol )