From 85d72a803ca5cb6bac75b097c8520506442627b6 Mon Sep 17 00:00:00 2001
From: Octavian <octavian@firstbatch.xyz>
Date: Thu, 27 Nov 2025 01:30:43 -0800
Subject: [PATCH 1/3] add warning when quant config exists but quantization
 fails

---
 src/dnet/core/models/base.py | 10 ++++++----
 src/dnet/shard/runtime.py    | 12 +++++++-----
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/dnet/core/models/base.py b/src/dnet/core/models/base.py
index 5597cbd0..b1e4ad83 100644
--- a/src/dnet/core/models/base.py
+++ b/src/dnet/core/models/base.py
@@ -226,7 +226,7 @@ def _abskey_to_local_path(self, key: str) -> Optional[str]:
 
     def apply_quantization_from_config(
         self, model_config: Any, model_metadata: Any
-    ) -> bool:
+    ) -> Tuple[bool, bool]:
         """Quantize using a simple MLX-style predicate with optional per-path overrides.
 
         - If config["quantization"][path] exists, use that for this path.
@@ -408,15 +408,17 @@ def _predicate(path: str, module: nn.Module):
                     )
             except Exception:
                 self._converted_to_quantized = False
-                return False
+                if g_bits != 0 and g_group != 0:
+                    return (True, False)
+                return (False, False)
             self._converted_to_quantized = True
-            return True
+            return (True, True)
         except Exception:
             try:
                 self._converted_to_quantized = False
             except Exception:
                 pass
-            return False
+            return (False, False)
 
     @staticmethod
     def _shrink_linear_like(mod) -> None:
diff --git a/src/dnet/shard/runtime.py b/src/dnet/shard/runtime.py
index ff3ea285..c3a4f5e7 100644
--- a/src/dnet/shard/runtime.py
+++ b/src/dnet/shard/runtime.py
@@ -201,12 +201,14 @@ def load_model_core(self, req: ShardLoadModelRequest) -> None:
             is_api_layer=False,
         )
         try:
-            applied = bool(
-                self.model.apply_quantization_from_config(
-                    self.model_metadata.model_config,
-                    model_metadata=self.model_metadata,
-                )
+            is_quant, applied = self.model.apply_quantization_from_config(
+                self.model_metadata.model_config,
+                model_metadata=self.model_metadata,
             )
+            if is_quant and not applied:
+                logger.warning(
+                    "Failed to quantize what appears to be a quantized model."
+                )
             logger.info(
                 "[QUANT] runtime=%s applied=%s model=%s",
                 self.shard_id,

From dfa11034a3c9713b0751cadcab98910c184ff327 Mon Sep 17 00:00:00 2001
From: Octavian <octavian@firstbatch.xyz>
Date: Thu, 27 Nov 2025 02:03:42 -0800
Subject: [PATCH 2/3] modify FakeModel object to also return (bool, bool)

---
 tests/fakes/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fakes/models.py b/tests/fakes/models.py
index 650f844d..2563be17 100644
--- a/tests/fakes/models.py
+++ b/tests/fakes/models.py
@@ -62,7 +62,7 @@ def __init__(
         self.loaded = {}
 
     def apply_quantization_from_config(self, cfg, model_metadata=None):
-        return self._quant_applies
+        return (self._quant_applies, True)
 
     def eval(self):
         self.eval_called = True

From a3570b7abfb7ba1d639e3500a6c24c0b27e2fbd7 Mon Sep 17 00:00:00 2001
From: Octavian <octavian@firstbatch.xyz>
Date: Sun, 30 Nov 2025 12:08:31 -0800
Subject: [PATCH 3/3] raise exception and log error

---
 src/dnet/shard/runtime.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/dnet/shard/runtime.py b/src/dnet/shard/runtime.py
index c3a4f5e7..652030c9 100644
--- a/src/dnet/shard/runtime.py
+++ b/src/dnet/shard/runtime.py
@@ -206,9 +206,7 @@ def load_model_core(self, req: ShardLoadModelRequest) -> None:
                 model_metadata=self.model_metadata,
             )
             if is_quant and not applied:
-                logger.warning(
-                    "Failed to quantize what appears to be a quantized model."
-                )
+                raise RuntimeError("apply_quantization_from_config failed.")
             logger.info(
                 "[QUANT] runtime=%s applied=%s model=%s",
                 self.shard_id,
@@ -216,7 +214,10 @@ def load_model_core(self, req: ShardLoadModelRequest) -> None:
                 self.model_metadata.model_type,
             )
         except RuntimeError as e:
-            logger.warning("[QUANT] apply failed: %s", e)
+            logger.error(
+                f"[QUANT] Failed to quantize what appears to be a quantized model: {e}"
+            )
+            raise
 
         self.model.eval()
         self.cache = make_cache(