From 85d72a803ca5cb6bac75b097c8520506442627b6 Mon Sep 17 00:00:00 2001 From: Octavian Date: Thu, 27 Nov 2025 01:30:43 -0800 Subject: [PATCH 1/3] add warning when quant config exists but quantization fails --- src/dnet/core/models/base.py | 10 ++++++---- src/dnet/shard/runtime.py | 12 +++++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/dnet/core/models/base.py b/src/dnet/core/models/base.py index 5597cbd0..b1e4ad83 100644 --- a/src/dnet/core/models/base.py +++ b/src/dnet/core/models/base.py @@ -226,7 +226,7 @@ def _abskey_to_local_path(self, key: str) -> Optional[str]: def apply_quantization_from_config( self, model_config: Any, model_metadata: Any - ) -> bool: + ) -> Tuple[bool, bool]: """Quantize using a simple MLX-style predicate with optional per-path overrides. - If config["quantization"][path] exists, use that for this path. @@ -408,15 +408,17 @@ def _predicate(path: str, module: nn.Module): ) except Exception: self._converted_to_quantized = False - return False + if g_bits != 0 and g_group != 0: + return (True, False) + return (False, False) self._converted_to_quantized = True - return True + return (True, True) except Exception: try: self._converted_to_quantized = False except Exception: pass - return False + return (False, False) @staticmethod def _shrink_linear_like(mod) -> None: diff --git a/src/dnet/shard/runtime.py b/src/dnet/shard/runtime.py index ff3ea285..c3a4f5e7 100644 --- a/src/dnet/shard/runtime.py +++ b/src/dnet/shard/runtime.py @@ -201,12 +201,14 @@ def load_model_core(self, req: ShardLoadModelRequest) -> None: is_api_layer=False, ) try: - applied = bool( - self.model.apply_quantization_from_config( - self.model_metadata.model_config, - model_metadata=self.model_metadata, - ) + is_quant, applied = self.model.apply_quantization_from_config( + self.model_metadata.model_config, + model_metadata=self.model_metadata, ) + if is_quant and not applied: + logger.warning( + "Failed to quantize what appears to be a quantized model." + ) logger.info( "[QUANT] runtime=%s applied=%s model=%s", self.shard_id, From dfa11034a3c9713b0751cadcab98910c184ff327 Mon Sep 17 00:00:00 2001 From: Octavian Date: Thu, 27 Nov 2025 02:03:42 -0800 Subject: [PATCH 2/3] modify FakeModel object to also return (bool, bool) --- tests/fakes/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fakes/models.py b/tests/fakes/models.py index 650f844d..2563be17 100644 --- a/tests/fakes/models.py +++ b/tests/fakes/models.py @@ -62,7 +62,7 @@ def __init__( self.loaded = {} def apply_quantization_from_config(self, cfg, model_metadata=None): - return self._quant_applies + return (self._quant_applies, True) def eval(self): self.eval_called = True From a3570b7abfb7ba1d639e3500a6c24c0b27e2fbd7 Mon Sep 17 00:00:00 2001 From: Octavian Date: Sun, 30 Nov 2025 12:08:31 -0800 Subject: [PATCH 3/3] raise exception and log error --- src/dnet/shard/runtime.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/dnet/shard/runtime.py b/src/dnet/shard/runtime.py index c3a4f5e7..652030c9 100644 --- a/src/dnet/shard/runtime.py +++ b/src/dnet/shard/runtime.py @@ -206,9 +206,7 @@ def load_model_core(self, req: ShardLoadModelRequest) -> None: model_metadata=self.model_metadata, ) if is_quant and not applied: - logger.warning( - "Failed to quantize what appears to be a quantized model." - ) + raise RuntimeError("apply_quantization_from_config failed.") logger.info( "[QUANT] runtime=%s applied=%s model=%s", self.shard_id, @@ -216,7 +214,10 @@ def load_model_core(self, req: ShardLoadModelRequest) -> None: self.model_metadata.model_type, ) except RuntimeError as e: - logger.warning("[QUANT] apply failed: %s", e) + logger.error( + f"[QUANT] Failed to quantize what appears to be a quantized model: {e}" + ) + raise self.model.eval() self.cache = make_cache(