Add HELION_AUTOTUNE_IGNORE_ERRORS

jansel · jansel · commit 694c4ea51dcc · 2025-10-15T22:50:17.000-07:00
stack-info: PR: #961, branch: jansel/stack/201
diff --git a/docs/api/settings.md b/docs/api/settings.md
@@ -148,6 +148,10 @@ with helion.set_default_settings(
 
    Lower values result in faster autotuning but may find less optimal configurations.
 
+.. autoattribute:: Settings.autotune_ignore_errors
+
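+   Continue autotuning even when candidate configurations raise recoverable runtime errors (for example, GPU out-of-memory). When enabled, failing configurations are skipped without raising, and the associated warnings are suppressed. Default is ``False``. Set ``HELION_AUTOTUNE_IGNORE_ERRORS=1`` to enable it via the environment.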
+
 .. autoattribute:: Settings.autotune_accuracy_check
 
    Validate each candidate configuration against a baseline output before accepting it. Default is ``True``. Controlled by ``HELION_AUTOTUNE_ACCURACY_CHECK``.
@@ -248,6 +252,7 @@ Built-in values for ``HELION_AUTOTUNER`` include ``"PatternSearch"``, ``"Differe
 | ``HELION_AUTOTUNE_EFFORT`` | ``autotune_effort`` | Select autotuning preset (``"none"``, ``"quick"``, ``"full"``). |
 | ``HELION_REBENCHMARK_THRESHOLD`` | ``autotune_rebenchmark_threshold`` | Re-run configs whose performance is within a multiplier of the current best. |
 | ``HELION_AUTOTUNE_PROGRESS_BAR`` | ``autotune_progress_bar`` | Enable or disable the progress bar UI during autotuning. |
+| ``HELION_AUTOTUNE_IGNORE_ERRORS`` | ``autotune_ignore_errors`` | When set to ``1``, continue autotuning even when recoverable runtime errors occur. |
 | ``HELION_CACHE_DIR`` | ``LocalAutotuneCache`` | Override the on-disk directory used for cached autotuning artifacts. |
 | ``HELION_SKIP_CACHE`` | ``LocalAutotuneCache`` | When set to ``1``, ignore cached autotuning entries and rerun searches. |
 | ``HELION_PRINT_OUTPUT_CODE`` | ``print_output_code`` | Print generated Triton code to stderr for inspection. |
diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py
@@ -230,9 +230,10 @@ def _validate_against_baseline(
                 )
         except AssertionError as e:
             self.counters["accuracy_mismatch"] += 1
-            self.log.warning(
-                f"Skipping config with accuracy mismatch: {config!r}\n{e!s}\nUse HELION_AUTOTUNE_ACCURACY_CHECK=0 to disable this check.\n"
-            )
+            if not self.settings.autotune_ignore_errors:
+                self.log.warning(
+                    f"Skipping config with accuracy mismatch: {config!r}\n{e!s}\nUse HELION_AUTOTUNE_ACCURACY_CHECK=0 to disable this check.\n"
+                )
             return False
         return True
 
@@ -299,13 +300,17 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
             return res
         except Exception as e:
             action = classify_triton_exception(e)
-            if action == "raise":
+            if self.settings.autotune_ignore_errors:
+                pass
+            elif action == "raise":
                 raise exc.TritonError(
-                    f"{type(e).__qualname__}: {e}",
-                    self.kernel.format_kernel_decorator(config, self.settings),
-                    self.kernel.to_triton_code(config),
+                    error=f"{type(e).__qualname__}: {e}",
+                    decorator=self.kernel.format_kernel_decorator(
+                        config, self.settings
+                    ),
+                    code=self.kernel.to_triton_code(config),
                 ) from e
-            if action == "warn":
+            elif action == "warn":
                 self.log.warning(format_triton_compile_failure(config, e, self.kernel))
             else:
                 self.log.debug(f"Benchmarking failed: {type(e).__name__}: {e}")
@@ -1005,14 +1010,16 @@ def _mark_complete(self) -> bool:
         process.join(10)
         msg = f"Timeout after {self.elapsed:.0f}s compiling {self.config}"
         if process.is_alive():
-            self.search.log.warning(
-                msg,
-                "(SIGKILL required)",
-            )
+            if not self.search.settings.autotune_ignore_errors:
+                self.search.log.warning(
+                    msg,
+                    "(SIGKILL required)",
+                )
             process.kill()
             process.join()
         else:
-            self.search.log.warning(msg)
+            if not self.search.settings.autotune_ignore_errors:
+                self.search.log.warning(msg)
 
         self.ok = False
         self.failure_reason = "timeout"
@@ -1071,15 +1078,17 @@ def _handle_remote_error(self, *, raise_on_raise: bool) -> None:
             return
         exc_obj = error.to_exception()
         classification = error.classification or classify_triton_exception(exc_obj)
+        if ignore_errors := self.search.settings.autotune_ignore_errors:
+            classification = "debug"
         if classification == "raise":
             if raise_on_raise:
                 self._remote_error_handled = True
                 raise exc.TritonError(
-                    f"{type(exc_obj).__qualname__}: {exc_obj}",
-                    self.search.kernel.format_kernel_decorator(
+                    error=f"{type(exc_obj).__qualname__}: {exc_obj}",
+                    decorator=self.search.kernel.format_kernel_decorator(
                         self.config, self.search.settings
                     ),
-                    self.search.kernel.to_triton_code(self.config),
+                    code=self.search.kernel.to_triton_code(self.config),
                 ) from exc_obj
             return
 
@@ -1092,7 +1101,7 @@ def _handle_remote_error(self, *, raise_on_raise: bool) -> None:
             )
         if classification == "warn":
             self.search.log.warning(message)
-        else:
+        elif not ignore_errors:
             self.search.log.debug(message)
         self._remote_error_handled = True
 
diff --git a/helion/exc.py b/helion/exc.py
@@ -335,7 +335,14 @@ class TorchOpTracingError(_WrapException):
 
 
 class TritonError(BaseError):
-    message = "Error running generated Triton program:\n{1}\n{0}\n\nGenerated Triton code:\n{2}"
+    message = """\
+Error from Triton code:
+{code}
+
+Error running generated Triton program:
+{error}
+{decorator}
+Set autotune_ignore_errors=True or HELION_AUTOTUNE_IGNORE_ERRORS=1 to ignore Triton errors in autotuning."""
 
 
 class BaseWarning(_FixedMessage):
diff --git a/helion/runtime/settings.py b/helion/runtime/settings.py
@@ -155,6 +155,10 @@ def _get_autotune_precompile_jobs() -> int | None:
     return jobs
 
 
+def _get_autotune_ignore_errors() -> bool:
+    return os.environ.get("HELION_AUTOTUNE_IGNORE_ERRORS", "0") == "1"
+
+
 @dataclasses.dataclass
 class _Settings:
     # see __slots__ below for the doc strings that show up in help(Settings)
@@ -192,6 +196,9 @@ class _Settings:
     autotune_max_generations: int | None = dataclasses.field(
         default_factory=_get_autotune_max_generations
     )
+    autotune_ignore_errors: bool = dataclasses.field(
+        default_factory=_get_autotune_ignore_errors
+    )
     print_output_code: bool = os.environ.get("HELION_PRINT_OUTPUT_CODE", "0") == "1"
     force_autotune: bool = os.environ.get("HELION_FORCE_AUTOTUNE", "0") == "1"
     autotune_config_overrides: dict[str, object] = dataclasses.field(
@@ -230,6 +237,10 @@ class Settings(_Settings):
         "autotune_rebenchmark_threshold": "If a config is within threshold*best_perf, re-benchmark it to avoid outliers. Defaults to effort profile value. Set HELION_REBENCHMARK_THRESHOLD to override.",
         "autotune_progress_bar": "If True, show progress bar during autotuning. Default is True. Set HELION_AUTOTUNE_PROGRESS_BAR=0 to disable.",
         "autotune_max_generations": "Override the maximum number of generations for Pattern Search and Differential Evolution Search autotuning algorithms with HELION_AUTOTUNE_MAX_GENERATIONS=N or @helion.kernel(autotune_max_generations=N).",
+        "autotune_ignore_errors": (
+            "If True, skip logging and raising autotune errors. "
+            "Set HELION_AUTOTUNE_IGNORE_ERRORS=1 to enable globally."
+        ),
         "print_output_code": "If True, print the output code of the kernel to stderr.",
         "force_autotune": "If True, force autotuning even if a config is provided.",
         "autotune_config_overrides": "Dictionary of config key/value pairs forced during autotuning.",
diff --git a/test/test_autotuner.py b/test/test_autotuner.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
+import collections
 from contextlib import contextmanager
 from contextlib import nullcontext
+import logging
 import math
 import os
 from pathlib import Path
@@ -18,23 +20,27 @@
 
 import helion
 from helion import _compat
+from helion import exc
 from helion._testing import DEVICE
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import import_path
 from helion._testing import skipIfRocm
 from helion.autotuner import DifferentialEvolutionSearch
 from helion.autotuner import PatternSearch
+from helion.autotuner.base_search import BaseSearch
 from helion.autotuner.config_fragment import BooleanFragment
 from helion.autotuner.config_fragment import EnumFragment
 from helion.autotuner.config_fragment import IntegerFragment
 from helion.autotuner.config_fragment import PowerOfTwoFragment
 from helion.autotuner.config_generation import ConfigGeneration
 from helion.autotuner.effort_profile import get_effort_profile
 from helion.autotuner.finite_search import FiniteSearch
+from helion.autotuner.logger import LambdaLogger
 from helion.autotuner.random_search import RandomSearch
 import helion.language as hl
 from helion.language import loops
+from helion.runtime.settings import Settings
 
 datadir = Path(__file__).parent / "data"
 basic_kernels = import_path(datadir / "basic_kernels.py")
@@ -63,6 +69,64 @@ def _autotune(self):
         return super()._autotune()
 
 
+class TestAutotuneIgnoreErrors(TestCase):
+    def _make_search(self, settings: Settings) -> BaseSearch:
+        search = BaseSearch.__new__(BaseSearch)
+        search.settings = settings
+        search.kernel = SimpleNamespace(
+            format_kernel_decorator=lambda config, s: "decorator",
+            to_triton_code=lambda config: "code",
+        )
+        search.args = ()
+        search.counters = collections.Counter()
+        search.log = LambdaLogger(logging.CRITICAL)
+        search._kernel_mutates_args = False
+        search.best_perf_so_far = float("inf")
+        return search
+
+    def test_settings_flag_from_env(self):
+        with patch.dict(
+            os.environ, {"HELION_AUTOTUNE_IGNORE_ERRORS": "1"}, clear=False
+        ):
+            settings = Settings()
+        self.assertTrue(settings.autotune_ignore_errors)
+
+    def test_benchmark_raise_includes_hint(self):
+        settings = Settings(
+            autotune_ignore_errors=False,
+            autotune_log_level=logging.CRITICAL,
+        )
+        search = self._make_search(settings)
+
+        def bad_fn(*_args):
+            raise RuntimeError("boom")
+
+        with patch("torch.accelerator.synchronize", autospec=True) as sync:
+            sync.return_value = None
+            with pytest.raises(exc.TritonError) as err:
+                search.benchmark_function("cfg", bad_fn)
+
+        assert "HELION_AUTOTUNE_IGNORE_ERRORS" in str(err.value)
+
+    def test_ignore_errors_skips_logging_and_raise(self):
+        settings = Settings(
+            autotune_ignore_errors=True,
+            autotune_log_level=logging.CRITICAL,
+        )
+        search = self._make_search(settings)
+
+        def bad_fn(*_args):
+            raise RuntimeError("boom")
+
+        with patch("torch.accelerator.synchronize", autospec=True) as sync:
+            sync.return_value = None
+            with patch.object(search.log, "warning") as warn:
+                result = search.benchmark_function("cfg", bad_fn)
+
+        self.assertEqual(result, float("inf"))
+        warn.assert_not_called()
+
+
 class TestAutotuner(RefEagerTestDisabled, TestCase):
     def setUp(self):
         super().setUp()