pytorch
diff --git a/‎docs/api/settings.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/api/settings.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎helion/_compiler/type_propagation.py‎
Lines changed: 7 additions & 1 deletion b/‎helion/_compiler/type_propagation.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎helion/autotuner/base_cache.py‎
Lines changed: 52 additions & 1 deletion b/‎helion/autotuner/base_cache.py‎
Lines changed: 52 additions & 1 deletion
diff --git a/‎helion/autotuner/local_cache.py‎
Lines changed: 59 additions & 2 deletions b/‎helion/autotuner/local_cache.py‎
Lines changed: 59 additions & 2 deletions
diff --git a/‎helion/exc.py‎
Lines changed: 13 additions & 0 deletions b/‎helion/exc.py‎
Lines changed: 13 additions & 0 deletions
@@ -257,6 +257,7 @@ Built-in values for ``HELION_AUTOTUNER`` include ``"PatternSearch"``, ``"Differe
 | ``HELION_AUTOTUNE_CONFIG_OVERRIDES`` | ``autotune_config_overrides`` | Supply JSON forcing particular autotuner config key/value pairs. |
 | ``HELION_CACHE_DIR`` | ``LocalAutotuneCache`` | Override the on-disk directory used for cached autotuning artifacts. |
 | ``HELION_SKIP_CACHE`` | ``LocalAutotuneCache`` | When set to ``1``, ignore cached autotuning entries and rerun searches. |
+| ``HELION_ASSERT_CACHE_HIT`` | ``AutotuneCacheBase`` | When set to ``1``, require a cache hit; on a cache miss, print detailed diagnostics to stderr and raise ``CacheAssertionError``. |
 | ``HELION_PRINT_OUTPUT_CODE`` | ``print_output_code`` | Print generated Triton code to stderr for inspection. |
 | ``HELION_OUTPUT_ORIGIN_LINES`` | ``output_origin_lines`` | Include ``# src[...]`` comments in generated Triton code; set to ``0`` to disable. |
 | ``HELION_IGNORE_WARNINGS`` | ``ignore_warnings`` | Comma-separated warning names defined in ``helion.exc`` to suppress. |
 
@@ -30,6 +30,7 @@
 from ..language.stack_tensor import StackTensor
 from ..language.tile_proxy import Tile
 from ..language.tile_proxy import _CheckForIndexCalls
+from ..runtime.kernel import Kernel
 from .ast_extension import ExtendedAST
 from .ast_extension import LoopType
 from .ast_extension import create
@@ -105,6 +106,8 @@ def _get(self, name: str) -> TypeInfo:
                 return TypeInfo.from_example(value, origin)
 
             origin = self.function.global_scope_origin(name)
+            if isinstance(value, Kernel):
+                return TypeInfo.from_example(value, origin)
             if not isinstance(
                 value,
                 (types.ModuleType, types.FunctionType, types.BuiltinFunctionType),
@@ -1973,9 +1976,12 @@ def visit_Compare(self, node: ast.Compare) -> TypeInfo:
 
     def visit_Call(self, node: ast.Call) -> TypeInfo:
         # TODO(jansel): test handling if *args and **kwargs
-        # TODO(jansel): check for calling a Kernel here
         func = self.visit(node.func)
 
+        # Check for calling a Helion kernel from within another Helion kernel
+        if isinstance(func, CallableType) and isinstance(func.value, Kernel):
+            raise exc.NestedKernelCallsNotSupported
+
         if (
             isinstance(func, CallableType)
             and self.origin().is_device()
 
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
 import abc
+from collections.abc import Sequence
 import dataclasses
 import functools
 import hashlib
 import logging
 import os
+import sys
 from typing import TYPE_CHECKING
 from typing import Any
 from typing import Callable
@@ -14,6 +16,7 @@
 from torch._inductor.codecache import build_code_hash
 from torch._inductor.codecache import torch_key
 
+from .. import exc
 from .._utils import counters
 from .base_search import BaseAutotuner
 
@@ -67,7 +70,8 @@ def torch_key_wrapper() -> str:
 def triton_key_wrapper() -> str:
+    """Return the SHA-256 hex digest of Inductor's ``triton_key()`` string."""
     from torch._inductor.runtime.triton_compat import triton_key
 
-    return triton_key()
+    # The digest is a fixed-length 64-character hex string regardless of how
+    # large the raw key is.
+    full_key = triton_key()
+    return hashlib.sha256(full_key.encode("utf-8")).hexdigest()
 
 
 class CacheKeyBase:
@@ -157,6 +161,16 @@ def _get_cache_info_message(self) -> str:
         """Return a message describing where the cache is and how to clear it."""
         return ""
 
+    @abc.abstractmethod
+    def _get_cache_key(self) -> CacheKeyBase:
+        """Return the cache key for this cache instance.
+
+        ``autotune()`` prints this key and compares its ``vars()`` fields
+        against the stored entries when ``HELION_ASSERT_CACHE_HIT=1`` and the
+        cache lookup misses.
+        """
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def _list_cache_entries(self) -> Sequence[tuple[str, CacheKeyBase]]:
+        """Return a sequence of (description, key) tuples for all cache entries.
+
+        On a cache miss with ``HELION_ASSERT_CACHE_HIT=1``, ``autotune()``
+        prints each description and looks up the current key's field names on
+        each returned key with ``getattr``; a field that is absent is reported
+        as ``<missing>``, so returned keys only need matching attribute names.
+        """
+        raise NotImplementedError
+
     def autotune(self, *, skip_cache: bool = False) -> Config:
         if skip_cache or os.environ.get("HELION_SKIP_CACHE", "") not in {
             "",
@@ -178,6 +192,43 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
         counters["autotune"]["cache_miss"] += 1
         log.debug("cache miss")
 
+        if os.environ.get("HELION_ASSERT_CACHE_HIT") == "1":
+            current_key = self._get_cache_key()
+            print("\n" + "=" * 80, file=sys.stderr)
+            print("HELION_ASSERT_CACHE_HIT: Cache miss detected!", file=sys.stderr)
+            print("=" * 80, file=sys.stderr)
+            print(f"\nKernel: {self.kernel.kernel.name}", file=sys.stderr)
+            print(f"\nCurrent cache key:\n{current_key}", file=sys.stderr)
+
+            cache_entries = self._list_cache_entries()
+            if cache_entries:
+                print(
+                    f"\n{len(cache_entries)} other cache entries exist (but don't match):",
+                    file=sys.stderr,
+                )
+                for i, (desc, cached_key) in enumerate(cache_entries, 1):
+                    print(f"\n[Entry {i}] {desc}", file=sys.stderr)
+                    print("  Key differences:", file=sys.stderr)
+                    has_diff = False
+                    for field_name in vars(current_key):
+                        current_val = str(getattr(current_key, field_name))
+                        cached_val = str(getattr(cached_key, field_name, "<missing>"))
+                        if current_val != cached_val:
+                            has_diff = True
+                            print(f"    {field_name}:", file=sys.stderr)
+                            print(f"      Current:  {current_val}", file=sys.stderr)
+                            print(f"      Cached:   {cached_val}", file=sys.stderr)
+                    if not has_diff:
+                        print(
+                            "    (no differences found, likely a hash collision)",
+                            file=sys.stderr,
+                        )
+            else:
+                print("\nNo existing cache entries found.", file=sys.stderr)
+
+            print("=" * 80 + "\n", file=sys.stderr)
+            raise exc.CacheAssertionError(self.kernel.kernel.name)
+
         self.autotuner.log("Starting autotuning process, this may take a while...")
 
         config = self.autotuner.autotune()
 
@@ -2,11 +2,13 @@
 
 import hashlib
 import inspect
+import json
 import logging
 import os
 from pathlib import Path
 import textwrap
 from typing import TYPE_CHECKING
+import uuid
 
 import torch
 from torch._inductor.runtime.cache_dir_utils import (
@@ -19,6 +21,8 @@
 from .base_cache import StrictAutotuneCacheKey
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     from .base_search import BaseSearch
 
 log: logging.Logger = logging.getLogger(__name__)
@@ -86,18 +90,71 @@ def _get_local_cache_path(self) -> Path:
     def get(self) -> Config | None:
+        """Load the config stored at the local cache path, if any.
+
+        The cache file is JSON with the serialized config under ``"config"``.
+
+        Returns:
+            The decoded ``Config``, or ``None`` when reading or decoding fails
+            for any reason (missing file, malformed JSON, no ``"config"``
+            entry, ...).
+        """
         path = self._get_local_cache_path()
         try:
-            return Config.load(path)
+            data = json.loads(path.read_text())
+            return Config.from_json(data["config"])
         except Exception:
             return None
 
     def put(self, config: Config) -> None:
+        """Write ``config`` and a description of the cache key to the cache file.
+
+        The file is JSON with two entries: ``"config"`` (``config.to_json()``)
+        and ``"key"`` (the key's type name plus its ``vars()`` fields rendered
+        as strings, kept for debugging).
+
+        Raises:
+            OSError: If the cache directory or file cannot be written. Any
+                partially written temp file is removed before re-raising.
+        """
         path = self._get_local_cache_path()
-        config.save(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Save both config and key for better debugging
+        # Store key as dict for safer reconstruction (avoids eval)
+        key_dict = {
+            "type": type(self.key).__name__,
+            "fields": {k: str(v) for k, v in vars(self.key).items()},
+        }
+
+        data = {
+            "config": config.to_json(),
+            "key": key_dict,
+        }
+
+        # Atomic write: fill a uniquely named temp file in the same directory,
+        # then move it over the final path. os.replace (unlike os.rename) also
+        # overwrites an already existing entry on Windows.
+        tmp = path.parent / f"tmp.{uuid.uuid4()!s}"
+        try:
+            tmp.write_text(json.dumps(data, indent=2))
+            os.replace(tmp, path)
+        except BaseException:
+            # Do not leave a stray temp file behind if the write or move fails.
+            tmp.unlink(missing_ok=True)
+            raise
 
     def _get_cache_info_message(self) -> str:
+        """Return a message naming the cache directory and how to rerun autotuning."""
         cache_dir = self._get_local_cache_path().parent
         return f"Cache directory: {cache_dir}. To run autotuning again, delete the cache directory or set HELION_SKIP_CACHE=1."
 
+    def _get_cache_key(self) -> LooseAutotuneCacheKey:
+        """Return this cache instance's key (``self.key``)."""
+        return self.key
+
+    def _list_cache_entries(self) -> Sequence[tuple[str, LooseAutotuneCacheKey]]:
+        """List the other cache entries stored in this cache's directory.
+
+        Returns:
+            ``(file name, key)`` pairs, one per readable ``*.best_config``
+            file in the cache directory, sorted by file name. The file whose
+            stem equals this key's ``stable_hash()`` is excluded. Each key is
+            a lightweight stand-in that exposes the stored key fields as
+            string attributes; it is not a full key object.
+
+        Entries that cannot be read or parsed are skipped and logged at debug
+        level, since this listing is best-effort.
+        """
+        cache_dir = self._get_local_cache_path().parent
+        if not cache_dir.exists():
+            return []
+
+        # Simple attribute bag that has the same attributes as the stored key
+        # for comparison purposes. Defined once rather than per loop iteration.
+        class CachedKey:
+            def __init__(self, fields: dict[str, str]) -> None:
+                for name, value in fields.items():
+                    setattr(self, name, value)
+
+        current_key_hash = self.key.stable_hash()
+        entries: list[tuple[str, LooseAutotuneCacheKey]] = []
+        # glob() order is unspecified; sort so the listing is stable between runs.
+        for cache_file in sorted(cache_dir.glob("*.best_config")):
+            # Skip our own entry before paying for a read and a JSON parse.
+            if cache_file.stem == current_key_hash:
+                continue
+            try:
+                data = json.loads(cache_file.read_text())
+                cached_key = CachedKey(data["key"]["fields"])
+            except Exception:
+                # An unreadable or malformed entry must not break the listing.
+                log.debug(
+                    "Skipping unreadable cache entry %s", cache_file, exc_info=True
+                )
+                continue
+            entries.append((cache_file.name, cached_key))  # type: ignore[arg-type]
+
+        return entries
+
 
 class StrictLocalAutotuneCache(LocalAutotuneCache):
     """
 
@@ -52,6 +52,10 @@ class AutotuneError(BaseError):
     message = "{0}"
 
 
+class CacheAssertionError(BaseError):
+    # Raised on an autotune cache miss when HELION_ASSERT_CACHE_HIT=1.
+    # {0} is filled with the kernel name.
+    message = "Expected cache hit for kernel '{0}', but got cache miss. See stderr for diagnostic information."
+
+
 class ClosureMutation(BaseError):
     message = "Closure mutation (of {0}) is not allowed in a function arg."
 
@@ -448,3 +452,12 @@ class NoDeviceLoopsInKernel(BaseError):
         "Kernel contains no device loops. Add an hl.tile(...) or hl.grid(...) loop "
         "around your device computations."
     )
+
+
+class NestedKernelCallsNotSupported(BaseError):
+    # Raised during type propagation when a call inside a Helion kernel
+    # targets another Helion Kernel object. The message takes no format
+    # arguments.
+    message = (
+        "Calling a Helion kernel from within another Helion kernel is not supported. "
+        "Helion kernels can only be called from outside of @helion.kernel functions. "
+        "If you need to share code between kernels, consider extracting the shared logic "
+        "into a regular Python function that can be called from within both kernels."
+    )