
Commit 5b7f668

add cute_inductor example (#157)
Signed-off-by: Mayank Mishra <[email protected]>
1 parent 71af063 commit 5b7f668

File tree

4 files changed (+75, -27 lines)


cute_kernels/cute_inductor/compiler.py (+21, -24)

@@ -3,7 +3,7 @@
 import torch
 from torch._dynamo import lookup_backend
 
-from ..utils import get_boolean_env_variable, set_cute_tracing
+from ..utils import enable_cute_tracing, get_boolean_env_variable
 from .rmsnorm import replace_rmsnorm
 from .swiglu_unchunked import replace_swiglu_unchunked
 
@@ -21,26 +21,23 @@ def __init__(
         self.replace_functions = replace_functions
 
     def compiler(self, gm: torch.fx.GraphModule, example_inputs: list[torch.Tensor]) -> Callable:
-        set_cute_tracing(True)
-
-        if _DEBUG_CUTEINDUCTOR:
-            print("graph before cute inductor")
-            gm.print_readable()
-
-        for replace_function in self.replace_functions:
-            for node in gm.graph.nodes:
-                replace_function(gm, node)
-
-        if _DEBUG_CUTEINDUCTOR:
-            print("graph after cute inductor")
-            gm.print_readable()
-
-        if self.use_torch_inductor_after_cute_inductor:
-            inductor = lookup_backend("inductor")
-            compiled = inductor(gm, example_inputs)
-        else:
-            compiled = gm.forward
-
-        set_cute_tracing(False)
-
-        return compiled
+        with enable_cute_tracing():
+            if _DEBUG_CUTEINDUCTOR:
+                print("graph before cute inductor")
+                gm.print_readable()
+
+            for replace_function in self.replace_functions:
+                for node in gm.graph.nodes:
+                    replace_function(gm, node)
+
+            if _DEBUG_CUTEINDUCTOR:
+                print("graph after cute inductor")
+                gm.print_readable()
+
+            if self.use_torch_inductor_after_cute_inductor:
+                inductor = lookup_backend("inductor")
+                compiled = inductor(gm, example_inputs)
+            else:
+                compiled = gm.forward
+
+        return compiled
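The refactor scopes the tracing flag with a context manager instead of paired set_cute_tracing(True/False) calls. Each entry in replace_functions is invoked once per FX node as replace_function(gm, node), which is the extension point for custom fusion passes. A hedged sketch of a pass matching that call signature (the function name and body here are illustrative, not from the repo):

    import torch


    def replace_noop(gm: torch.fx.GraphModule, node: torch.fx.Node) -> None:
        # Illustrative replacement pass: inspect one node and, if it matches
        # a target pattern, rewrite gm.graph in place (as replace_rmsnorm
        # does for RMSNorm subgraphs). The stub only mirrors the signature
        # used by the loop in compiler().
        if node.op == "call_function":
            pass  # pattern matching and graph surgery would go here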

cute_kernels/utils/__init__.py (+1, -1)

@@ -1,5 +1,5 @@
 from .contiguous import ensure_contiguous, ensure_same_strides
-from .custom_op import cute_op, set_cute_tracing
+from .custom_op import cute_op, enable_cute_tracing
 from .device import device_synchronize, get_sm_count, is_hip
 from .env import get_boolean_env_variable
 from .ptx import get_ptx_from_triton_kernel

cute_kernels/utils/custom_op.py (+8, -2)

@@ -1,4 +1,5 @@
 import inspect
+from contextlib import contextmanager
 from typing import Callable, Iterable, Sequence
 
 import torch
@@ -7,9 +8,14 @@
 _IS_CUTE_TRACING = False
 
 
-def set_cute_tracing(enable: bool) -> None:
+@contextmanager
+def enable_cute_tracing():
     global _IS_CUTE_TRACING
-    _IS_CUTE_TRACING = enable
+    _IS_CUTE_TRACING = True
+
+    yield
+
+    _IS_CUTE_TRACING = False
 
 
 def _dispatch(func: Callable, compileable_fn: Callable, *args, **kwargs):
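Since enable_cute_tracing is a generator-based context manager with no try/finally around the yield, the flag is cleared on a normal exit from the with block (an exception raised inside the block would propagate before the reset runs). A small sketch of the resulting behavior, reading the module-private flag purely for illustration:

    import cute_kernels.utils.custom_op as custom_op

    assert custom_op._IS_CUTE_TRACING is False
    with custom_op.enable_cute_tracing():
        assert custom_op._IS_CUTE_TRACING is True  # flag is set inside the block
    assert custom_op._IS_CUTE_TRACING is False  # and cleared again on normal exit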

examples/cute_inductor.py (+45, new file)

@@ -0,0 +1,45 @@
+import torch
+import torch.nn as nn
+
+from cute_kernels import CuteInductor
+from cute_kernels.cute_inductor.rmsnorm import replace_rmsnorm
+from cute_kernels.cute_inductor.swiglu_unchunked import replace_swiglu_unchunked
+
+
+# NOTE swiglu unchunked computes:
+# ------------------------------------------------------------------------------
+# def swiglu_unchunked_torch(x: torch.Tensor) -> torch.Tensor:
+#     x = x.chunk(2, dim=-1)
+#     return x[0] * F.silu(x[1])
+# ------------------------------------------------------------------------------
+
+
+class Model(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+        self.norm1 = nn.RMSNorm(4)
+        self.linear = nn.Linear(4, 4)
+        self.norm2 = nn.RMSNorm(4)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.norm1(x)
+        x = self.linear(x)
+        x = self.norm2(x)
+        return x
+
+
+model = Model().to(torch.cuda.current_device())
+
+use_torch_inductor_after_cute_inductor = True  # to use torch's compiler optimizations as well
+replace_functions = [replace_rmsnorm]  # add other replacing functions
+
+cute_inductor = CuteInductor(
+    use_torch_inductor_after_cute_inductor=use_torch_inductor_after_cute_inductor, replace_functions=replace_functions
+)
+
+compiled_model = torch.compile(model, backend=cute_inductor.compiler)
+
+# trigger JIT compilation
+x = torch.randn(4, 4, device=torch.cuda.current_device())
+y = compiled_model(x)
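The example stops after the first forward pass, which is what triggers compilation through the CuteInductor backend. A natural follow-up, not part of the commit, is to check the compiled output against eager execution; a sketch, reusing the model and input above:

    # Sanity check (sketch): the compiled model, with RMSNorm subgraphs
    # replaced, should match eager execution up to floating-point tolerance.
    y_eager = model(x)
    torch.testing.assert_close(y, y_eager, rtol=1e-3, atol=1e-3)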
