Commit cb47477

Add a pass to keep cond predicate on CPU memory

1 parent 355a7a6
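
For context, torch.cond selects a branch from a bool or a scalar boolean tensor; if that tensor lives on CUDA, reading its value from the host forces a device-to-host transfer. A minimal sketch of the pattern involved (standard torch.cond API; the function below is illustrative, not part of this commit):

# Illustrative sketch, not part of the commit: torch.cond branches on a
# bool or scalar boolean tensor. Keeping that tensor in CPU memory lets
# the host read it without a device-to-host transfer.
import torch


def gated_add(pred: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    # pred is a scalar bool tensor; ideally resident on CPU
    return torch.cond(pred, lambda x: x + 1, lambda x: x - 1, [x])


print(gated_add(torch.tensor(True), torch.zeros(3)))  # tensor([1., 1., 1.])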

File tree: 5 files changed, +143 −5 lines changed

backends/aoti/aoti_backend.py

Lines changed: 8 additions & 4 deletions
@@ -166,7 +166,10 @@ def preprocess(
         # Apply custom backend-specific passes
         custom_passes = cls.get_custom_passes()
         for custom_pass in custom_passes:
-            custom_pass(device_edge_program.graph_module)
+            if getattr(custom_pass, "requires_exported_program", False):
+                custom_pass(device_edge_program)
+            else:
+                custom_pass(device_edge_program.graph_module)
 
         # Run decompositions if any
         if decomposition_table:
@@ -187,9 +190,10 @@ def preprocess(
         missing_fallback_kernels: Set[str] = set()
 
         # Compile with fallback kernel collection
-        with cls.collect_unsupported_fallback_kernels(
-            missing_fallback_kernels
-        ), torch.no_grad():
+        with (
+            cls.collect_unsupported_fallback_kernels(missing_fallback_kernels),
+            torch.no_grad(),
+        ):
             paths = torch._inductor.aot_compile(
                 edge_program_module, tuple(user_input_placeholders), options=options
             )
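
With this change, a pass opts into receiving the whole ExportedProgram (rather than just its graph_module) by exposing a truthy requires_exported_program attribute. A minimal sketch of the protocol (hypothetical pass, shown only to illustrate the dispatch above):

from torch.export import ExportedProgram


class MyProgramLevelPass:
    # preprocess() checks this attribute via getattr(); passes without it
    # keep receiving the GraphModule as before.
    requires_exported_program = True

    def __call__(self, exported_program: ExportedProgram) -> None:
        # Program-level passes can inspect program state, not just the graph.
        sig = exported_program.graph_signature
        for placeholder, buffer in sig.inputs_to_buffers.items():
            print(f"{placeholder} is backed by buffer {buffer}")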

backends/cuda/cuda_backend.py

Lines changed: 4 additions & 1 deletion
@@ -10,6 +10,9 @@
 
 import torch
 from executorch.backends.aoti.aoti_backend import AotiBackend
+from executorch.backends.cuda.passes.keep_cond_predicate_on_cpu import (
+    KeepCondPredicateOnCpuPass,
+)
 from executorch.backends.cuda.triton.replacement_pass import (
     ReplaceEdgeOpWithTritonOpPass,
 )
@@ -49,7 +52,7 @@ def get_decomposition_table(cls) -> Dict[Any, Any]:
     @classmethod
     def get_custom_passes(cls) -> List[typing.Any]:
         """Return CUDA-specific passes: ReplaceEdgeOpWithTritonOpPass"""
-        return [ReplaceEdgeOpWithTritonOpPass()]
+        return [KeepCondPredicateOnCpuPass(), ReplaceEdgeOpWithTritonOpPass()]
 
     @classmethod
     def get_aoti_compile_options(

backends/cuda/passes/__init__.py

Whitespace-only changes.

backends/cuda/passes/keep_cond_predicate_on_cpu.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+import torch
+from torch.export import ExportedProgram
+
+
+class KeepCondPredicateOnCpuPass:
+    """
+    A pass that locates torch.cond nodes and keeps each predicate on CPU: any
+    buffer backing the predicate is moved to CPU and its metadata pinned there.
+    """
+
+    requires_exported_program = True
+
+    def __call__(self, exported_program: ExportedProgram):
+        graph_module = exported_program.graph_module
+        state_dict = exported_program.state_dict
+
+        # Map input (placeholder) names to buffer names
+        inputs_to_buffers = exported_program.graph_signature.inputs_to_buffers
+
+        for node in graph_module.graph.nodes:
+            if (
+                node.op == "call_function"
+                and node.target == torch.ops.higher_order.cond
+            ):
+                pred_node = node.args[0]
+                if pred_node.op == "placeholder":
+                    # Found a placeholder used as the predicate;
+                    # check whether it corresponds to a buffer.
+                    if pred_node.name in inputs_to_buffers:
+                        buffer_name = inputs_to_buffers[pred_node.name]
+
+                        # Move the buffer in state_dict to CPU.
+                        if buffer_name in state_dict:
+                            # Replace the tensor rather than mutating it
+                            # in place; replacing is safer.
+                            tensor = exported_program.state_dict[buffer_name]
+                            if tensor.device.type != "cpu":
+                                if isinstance(tensor, torch.nn.Parameter):
+                                    exported_program._state_dict[buffer_name] = (
+                                        torch.nn.Parameter(
+                                            tensor.to("cpu"),
+                                            tensor.requires_grad,
+                                        )
+                                    )
+                                else:
+                                    exported_program._state_dict[buffer_name] = (
+                                        tensor.to("cpu")
+                                    )
+
+                        if buffer_name in exported_program.constants:
+                            tensor = exported_program._constants[buffer_name]
+                            if tensor.device.type != "cpu":
+                                exported_program._constants[buffer_name] = tensor.to(
+                                    "cpu"
+                                )
+
+                    # Also pin the placeholder metadata to CPU.
+                    if "val" in pred_node.meta:
+                        fake_tensor = pred_node.meta["val"]
+                        if isinstance(fake_tensor, torch.Tensor):
+                            pred_node.meta["val"] = fake_tensor.to("cpu")
+        exported_program.validate()
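
The buffer-backed case the pass targets can be seen with plain torch.export: a predicate kept as module state is lifted to a buffer input, and it shows up in graph_signature.inputs_to_buffers, the mapping read above. A small sketch (hypothetical Gate module, not part of the commit; the exact placeholder name is version-dependent):

import torch
from torch.export import export


class Gate(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # The predicate is module state, so export lifts it to a buffer input.
        self.register_buffer("use_fast_path", torch.tensor(True))

    def forward(self, x):
        return torch.cond(self.use_fast_path, lambda x: x * 2, lambda x: x / 2, [x])


ep = export(Gate(), (torch.randn(4),))
# Maps the lifted placeholder to the buffer name,
# e.g. {'b_use_fast_path': 'use_fast_path'}
print(ep.graph_signature.inputs_to_buffers)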
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+import unittest
+
+import torch
+from executorch.backends.cuda.passes.keep_cond_predicate_on_cpu import (
+    KeepCondPredicateOnCpuPass,
+)
+from torch.export import export
+
+
+class TestKeepCondPredicateOnCpuPass(unittest.TestCase):
+    def test_keep_cond_predicate_on_cpu(self):
+        # Define a simple model using torch.cond
+        class Model(torch.nn.Module):
+            def forward(self, pred, x, y):
+                def true_fn(x, y):
+                    return x + y
+
+                def false_fn(x, y):
+                    return x - y
+
+                return torch.cond(pred, true_fn, false_fn, [x, y])
+
+        model = Model()
+        pred = torch.tensor(True)
+        x = torch.randn(2, 2)
+        y = torch.randn(2, 2)
+
+        # Export the model
+        ep = export(model, (pred, x, y))
+        gm = ep.graph_module
+
+        # Simulate move_to_device_pass by marking every placeholder's metadata
+        # as CUDA; mock tensors are used so CUDA is never actually initialized.
+        from unittest.mock import MagicMock
+
+        for node in gm.graph.nodes:
+            if node.op == "placeholder":
+                if "val" in node.meta:
+                    # Use MagicMock to simulate a tensor on cuda
+                    val = MagicMock(spec=torch.Tensor)
+                    val.device = torch.device("cuda")
+
+                    def to_side_effect(device):
+                        new_val = MagicMock(spec=torch.Tensor)
+                        new_val.device = torch.device(device)
+                        return new_val
+
+                    val.to.side_effect = to_side_effect
+                    node.meta["val"] = val
+
+        # Verify that pred is on cuda
+        pred_node = list(gm.graph.nodes)[0]
+        self.assertEqual(pred_node.meta["val"].device.type, "cuda")
+
+        # Run the pass on the exported program
+        pass_instance = KeepCondPredicateOnCpuPass()
+        pass_instance(ep)
+
+        # Verify that pred is back on cpu
+        self.assertEqual(pred_node.meta["val"].device.type, "cpu")
+
+        # Verify the other placeholders are still on cuda;
+        # the second node is x.
+        x_node = list(gm.graph.nodes)[1]
+        self.assertEqual(x_node.meta["val"].device.type, "cuda")
+
+
+if __name__ == "__main__":
+    unittest.main()
