Add HELION_ASSERT_CACHE_HIT to debug/explain cache miss (#1006)

oulgen · web-flow · commit 4571892b0d30 · 2025-10-21T15:41:09.000-07:00
diff --git a/docs/api/settings.md b/docs/api/settings.md
@@ -257,6 +257,7 @@ Built-in values for ``HELION_AUTOTUNER`` include ``"PatternSearch"``, ``"Differe
 | ``HELION_AUTOTUNE_CONFIG_OVERRIDES`` | ``autotune_config_overrides`` | Supply JSON forcing particular autotuner config key/value pairs. |
 | ``HELION_CACHE_DIR`` | ``LocalAutotuneCache`` | Override the on-disk directory used for cached autotuning artifacts. |
 | ``HELION_SKIP_CACHE`` | ``LocalAutotuneCache`` | When set to ``1``, ignore cached autotuning entries and rerun searches. |
+| ``HELION_ASSERT_CACHE_HIT`` | ``AutotuneCacheBase`` | When set to ``1``, require a cache hit; on a cache miss, print detailed diagnostics to stderr and raise ``CacheAssertionError`` instead of autotuning. |
 | ``HELION_PRINT_OUTPUT_CODE`` | ``print_output_code`` | Print generated Triton code to stderr for inspection. |
 | ``HELION_OUTPUT_ORIGIN_LINES`` | ``output_origin_lines`` | Include ``# src[...]`` comments in generated Triton code; set to ``0`` to disable. |
 | ``HELION_IGNORE_WARNINGS`` | ``ignore_warnings`` | Comma-separated warning names defined in ``helion.exc`` to suppress. |
diff --git a/helion/autotuner/base_cache.py b/helion/autotuner/base_cache.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
 import abc
+from collections.abc import Sequence
 import dataclasses
 import functools
 import hashlib
 import logging
 import os
+import sys
 from typing import TYPE_CHECKING
 from typing import Any
 from typing import Callable
@@ -14,6 +16,7 @@
 from torch._inductor.codecache import build_code_hash
 from torch._inductor.codecache import torch_key
 
+from .. import exc
 from .._utils import counters
 from .base_search import BaseAutotuner
 
@@ -67,7 +70,8 @@ def torch_key_wrapper() -> str:
 def triton_key_wrapper() -> str:
     from torch._inductor.runtime.triton_compat import triton_key
 
-    return triton_key()
+    full_key = triton_key()
+    return hashlib.sha256(full_key.encode("utf-8")).hexdigest()
 
 
 class CacheKeyBase:
@@ -157,6 +161,16 @@ def _get_cache_info_message(self) -> str:
         """Return a message describing where the cache is and how to clear it."""
         return ""
 
+    @abc.abstractmethod
+    def _get_cache_key(self) -> CacheKeyBase:
+        """Return the cache key for this cache instance."""
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def _list_cache_entries(self) -> Sequence[tuple[str, CacheKeyBase]]:
+        """Return a sequence of (description, key) tuples for all cache entries."""
+        raise NotImplementedError
+
     def autotune(self, *, skip_cache: bool = False) -> Config:
         if skip_cache or os.environ.get("HELION_SKIP_CACHE", "") not in {
             "",
@@ -178,6 +192,43 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
         counters["autotune"]["cache_miss"] += 1
         log.debug("cache miss")
 
+        if os.environ.get("HELION_ASSERT_CACHE_HIT") == "1":
+            current_key = self._get_cache_key()
+            print("\n" + "=" * 80, file=sys.stderr)
+            print("HELION_ASSERT_CACHE_HIT: Cache miss detected!", file=sys.stderr)
+            print("=" * 80, file=sys.stderr)
+            print(f"\nKernel: {self.kernel.kernel.name}", file=sys.stderr)
+            print(f"\nCurrent cache key:\n{current_key}", file=sys.stderr)
+
+            cache_entries = self._list_cache_entries()
+            if cache_entries:
+                print(
+                    f"\n{len(cache_entries)} other cache entries exist (but don't match):",
+                    file=sys.stderr,
+                )
+                for i, (desc, cached_key) in enumerate(cache_entries, 1):
+                    print(f"\n[Entry {i}] {desc}", file=sys.stderr)
+                    print("  Key differences:", file=sys.stderr)
+                    has_diff = False
+                    for field_name in vars(current_key):
+                        current_val = str(getattr(current_key, field_name))
+                        cached_val = str(getattr(cached_key, field_name, "<missing>"))
+                        if current_val != cached_val:
+                            has_diff = True
+                            print(f"    {field_name}:", file=sys.stderr)
+                            print(f"      Current:  {current_val}", file=sys.stderr)
+                            print(f"      Cached:   {cached_val}", file=sys.stderr)
+                    if not has_diff:
+                        print(
+                            "    (no differences found, likely a hash collision)",
+                            file=sys.stderr,
+                        )
+            else:
+                print("\nNo existing cache entries found.", file=sys.stderr)
+
+            print("=" * 80 + "\n", file=sys.stderr)
+            raise exc.CacheAssertionError(self.kernel.kernel.name)
+
         self.autotuner.log("Starting autotuning process, this may take a while...")
 
         config = self.autotuner.autotune()
diff --git a/helion/autotuner/local_cache.py b/helion/autotuner/local_cache.py
@@ -2,11 +2,13 @@
 
 import hashlib
 import inspect
+import json
 import logging
 import os
 from pathlib import Path
 import textwrap
 from typing import TYPE_CHECKING
+import uuid
 
 import torch
 from torch._inductor.runtime.cache_dir_utils import (
@@ -19,6 +21,8 @@
 from .base_cache import StrictAutotuneCacheKey
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     from .base_search import BaseSearch
 
 log: logging.Logger = logging.getLogger(__name__)
@@ -86,18 +90,71 @@ def _get_local_cache_path(self) -> Path:
     def get(self) -> Config | None:
         path = self._get_local_cache_path()
         try:
-            return Config.load(path)
+            data = json.loads(path.read_text())
+            return Config.from_json(data["config"])
         except Exception:
             return None
 
     def put(self, config: Config) -> None:
         path = self._get_local_cache_path()
-        config.save(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Save both config and key for better debugging
+        # Store key as dict for safer reconstruction (avoids eval)
+        key_dict = {
+            "type": type(self.key).__name__,
+            "fields": {k: str(v) for k, v in vars(self.key).items()},
+        }
+
+        data = {
+            "config": config.to_json(),
+            "key": key_dict,
+        }
+
+        # Atomic write
+        tmp = path.parent / f"tmp.{uuid.uuid4()!s}"
+        tmp.write_text(json.dumps(data, indent=2))
+        os.rename(str(tmp), str(path))
 
     def _get_cache_info_message(self) -> str:
         cache_dir = self._get_local_cache_path().parent
         return f"Cache directory: {cache_dir}. To run autotuning again, delete the cache directory or set HELION_SKIP_CACHE=1."
 
+    def _get_cache_key(self) -> LooseAutotuneCacheKey:
+        return self.key
+
+    def _list_cache_entries(self) -> Sequence[tuple[str, LooseAutotuneCacheKey]]:
+        """List all cache entries in the cache directory."""
+        cache_dir = self._get_local_cache_path().parent
+        if not cache_dir.exists():
+            return []
+
+        current_key_hash = self.key.stable_hash()
+        entries: list[tuple[str, LooseAutotuneCacheKey]] = []
+        for cache_file in cache_dir.glob("*.best_config"):
+            try:
+                data = json.loads(cache_file.read_text())
+                file_hash = cache_file.stem
+
+                if file_hash == current_key_hash:
+                    continue
+
+                key_data = data["key"]
+
+                # Create a simple namespace object that has the same attributes
+                # for comparison purposes (we don't need the full key object)
+                class CachedKey:
+                    def __init__(self, fields: dict[str, str]) -> None:
+                        for name, value in fields.items():
+                            setattr(self, name, value)
+
+                cached_key = CachedKey(key_data["fields"])
+                entries.append((cache_file.name, cached_key))  # type: ignore[arg-type]
+            except Exception:
+                pass
+
+        return entries
+
 
 class StrictLocalAutotuneCache(LocalAutotuneCache):
     """
diff --git a/helion/exc.py b/helion/exc.py
@@ -52,6 +52,10 @@ class AutotuneError(BaseError):
     message = "{0}"
 
 
+class CacheAssertionError(BaseError):
+    message = "Expected cache hit for kernel '{0}', but got cache miss. See stderr for diagnostic information."
+
+
 class ClosureMutation(BaseError):
     message = "Closure mutation (of {0}) is not allowed in a function arg."
 
diff --git a/test/test_cache.py b/test/test_cache.py
@@ -1,12 +1,15 @@
 from __future__ import annotations
 
+import os
 import unittest
+from unittest.mock import patch
 
 import torch
 from torch.testing._internal.common_utils import instantiate_parametrized_tests
 from torch.testing._internal.common_utils import parametrize
 
 import helion
+from helion import exc
 from helion._testing import DEVICE
 from helion._testing import EXAMPLES_DIR
 from helion._testing import RefEagerTestDisabled
@@ -147,6 +150,37 @@ def add_one(x: torch.Tensor):
         self.assertEqual(counters["autotune"]["cache_hit"], 1)
         self.assertEqual(counters["autotune"]["cache_put"], 2)
 
+    def test_assert_cache_hit(self):
+        counters["autotune"].clear()
+        self.addCleanup(counters["autotune"].clear)
+
+        kernel, args_a, result_a, args_b, result_b = KERNELS["add"]()
+        kernel.reset()
+        kernel.settings.autotuner_fn = StrictLocalAutotuneCache[BasicSearch]
+        kernel.settings.autotune_effort = "full"
+
+        result = kernel(*args_a)
+        torch.testing.assert_close(result, result_a)
+        self.assertEqual(counters["autotune"]["cache_miss"], 1)
+        self.assertEqual(counters["autotune"]["cache_hit"], 0)
+
+        kernel.reset()
+        with patch.dict(os.environ, {"HELION_ASSERT_CACHE_HIT": "1"}):
+            result = kernel(*args_a)
+            torch.testing.assert_close(result, result_a)
+            self.assertEqual(counters["autotune"]["cache_miss"], 1)
+            self.assertEqual(counters["autotune"]["cache_hit"], 1)
+
+        kernel.reset()
+        with patch.dict(os.environ, {"HELION_ASSERT_CACHE_HIT": "1"}):
+            with self.assertRaises(exc.CacheAssertionError) as cm:
+                kernel(*args_b)
+
+            self.assertIn("add", str(cm.exception))
+            # cache_miss is incremented before the error is raised, but cache_put is not (autotuning never ran)
+            self.assertEqual(counters["autotune"]["cache_miss"], 2)
+            self.assertEqual(counters["autotune"]["cache_put"], 1)
+
 
 instantiate_parametrized_tests(TestCache)