Commit 4af37e9

Generalize test cases to support XPU (#983)
Parent commit: ad16d6f

File tree: 5 files changed, +31 / -19 lines

test/test_constexpr.py

Lines changed: 0 additions & 2 deletions
@@ -10,7 +10,6 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import skipIfRefEager
-from helion._testing import skipIfXPU
 import helion.language as hl


@@ -95,7 +94,6 @@ def fn(x: torch.Tensor, mode: str) -> torch.Tensor:
         self.assertExpectedJournal(code)

     @skipIfRefEager("Triton codegen does not work in ref eager mode")
-    @skipIfXPU("Failed on XPU due to a different configuration for min dot size")
     def test_block_size_constexpr_assignment_in_host_code(self) -> None:
         @helion.kernel(
             config=helion.Config(
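
For reference, skipIfXPU (like skipIfRefEager) comes from helion._testing, whose implementation is not part of this diff. A minimal sketch of such a backend-skip decorator, assuming it simply wraps unittest.skipIf:

# Illustrative sketch only: a backend-skip decorator in the spirit of
# helion._testing.skipIfXPU, assumed to wrap unittest.skipIf (the real
# helper's implementation is not shown in this diff).
import unittest

import torch


def skipIfXPU(reason: str):
    # hasattr() hedges PyTorch builds that predate the torch.xpu module.
    has_xpu = hasattr(torch, "xpu") and torch.xpu.is_available()
    return unittest.skipIf(has_xpu, reason)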

test/test_dot.py

Lines changed: 24 additions & 11 deletions
@@ -199,17 +199,19 @@ def test_hl_dot_codegen_acc_differs_uses_addition(self):
         self.assertIn("out_dtype=tl.float32", code)

         # Test case 2: separate addition (acc_dtype = float16, common dtype = float32)
-        input_dtype_2 = torch.float32
-        acc_dtype_2 = torch.float16
-        x2 = torch.randn(64, 64, device=DEVICE, dtype=input_dtype_2)
-        y2 = torch.randn(64, 64, device=DEVICE, dtype=input_dtype_2)
-        code2, out2 = code_and_output(dot_kernel_acc_arg, (x2, y2, acc_dtype_2))
-        # Validate we use separate addition pattern with cast
-        self.assertIn("tl.dot(", code2)
-        # Check for the addition pattern: acc + result
-        self.assertIn(" + ", code2)
-        # Check that we cast the result to acc_dtype
-        self.assertIn("tl.cast", code2)
+        # TODO(Eikan): Support this case on XPU
+        if not torch.xpu.is_available():
+            input_dtype_2 = torch.float32
+            acc_dtype_2 = torch.float16
+            x2 = torch.randn(64, 64, device=DEVICE, dtype=input_dtype_2)
+            y2 = torch.randn(64, 64, device=DEVICE, dtype=input_dtype_2)
+            code2, out2 = code_and_output(dot_kernel_acc_arg, (x2, y2, acc_dtype_2))
+            # Validate we use separate addition pattern with cast
+            self.assertIn("tl.dot(", code2)
+            # Check for the addition pattern: acc + result
+            self.assertIn(" + ", code2)
+            # Check that we cast the result to acc_dtype
+            self.assertIn("tl.cast", code2)

         # Test case 3: separate addition (acc_dtype = int32, common dtype = int8)
         input_dtype_3 = torch.int8
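
The hunk above guards one sub-case at runtime instead of skipping the whole test. A self-contained sketch of that guard pattern (the tensor math below is only illustrative, not taken from the diff):

# Illustrative sketch of the runtime-guard pattern used above: exercise the
# XPU-unsupported sub-case only when XPU is not the active backend.
import torch


def maybe_run_float16_acc_subcase() -> None:
    # hasattr() hedges PyTorch builds that predate the torch.xpu module.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return  # TODO: enable once this path is supported on XPU
    x = torch.randn(64, 64)
    y = torch.randn(64, 64)
    out = (x @ y).to(torch.float16)
    assert out.dtype == torch.float16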
@@ -951,6 +953,17 @@ def test_matmul_reshape_n_2(self):
         REF_EAGER_TEST_FAILURES_FP8_E4M3FN_LOW_COMPUTE_CAP[test_name]
     )(_test_func)

+    # Apply skipIfXPU decorator if needed
+    if acc_dtype is torch.float16 and input_dtype in (
+        torch.float8_e4m3fn,
+        torch.float8_e5m2,
+        torch.bfloat16,
+        torch.float32,
+    ):
+        _test_func = skipIfXPU("skip: float16 accumulator for non-fp16 input data types")(
+            _test_func
+        )
+
     # Additional ref eager skips for unsupported accumulator/input combos
     if acc_dtype is torch.float16 and input_dtype in (
         torch.bfloat16,
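
The second hunk applies skipIfXPU programmatically rather than with @-syntax, which works because a skip decorator is just a callable. A small hypothetical illustration of the same idea using plain unittest.skipIf (names here are stand-ins, not from the diff):

# Hypothetical illustration: decorating a dynamically generated test function,
# mirroring how the hunk above wraps _test_func with skipIfXPU.
import unittest

import torch


def _example_test(self) -> None:  # stand-in for a generated test body
    self.assertTrue(True)


_needs_xpu_skip = hasattr(torch, "xpu") and torch.xpu.is_available()
_example_test = unittest.skipIf(
    _needs_xpu_skip, "float16 accumulator unsupported for this input dtype on XPU"
)(_example_test)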

test/test_examples.py

Lines changed: 1 addition & 0 deletions
@@ -433,6 +433,7 @@ def test_low_mem_dropout(self):
         )

     @skipIfRocm("precision differences with bf16xint16 operations on rocm")
+    @skipIfXPU("precision differences with bf16xint16 operations on xpu")
     def test_bf16xint16(self):
         from examples.bf16xint16_gemm import reference_bf16xint16_pytorch

test/test_indexing.py

Lines changed: 5 additions & 5 deletions
@@ -433,23 +433,23 @@ def run_case(
             kernel = make_kernel(index_dtype=index_dtype)
             x = torch.randn(*shape, device=DEVICE, dtype=torch.bfloat16)
             y = torch.randn(*shape, device=DEVICE, dtype=torch.bfloat16)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             if expect_error:
                 with self.assertRaisesRegex(
                     helion.exc.IndexOffsetOutOfRangeForInt32,
                     f"index_dtype is {index_dtype}",
                 ):
                     code_and_output(kernel, (x, y))
-                torch.cuda.synchronize()
+                torch.accelerator.synchronize()
                 return

             code, out = code_and_output(kernel, (x, y))
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             checker = self.assertIn if expect_int64_in_code else self.assertNotIn
             checker("tl.int64", code)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             ref_out = torch.add(x, y)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             torch.testing.assert_close(out, ref_out, rtol=1e-2, atol=1e-2)

         small_shape = (128, 128)
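
torch.accelerator is PyTorch's backend-agnostic device API (available in recent releases), so the same synchronize call covers CUDA, XPU, and other accelerator backends. A short sketch, with a fallback assumption for older PyTorch builds that lack the module:

# Sketch of backend-agnostic synchronization, assuming a PyTorch version that
# ships torch.accelerator; older builds fall back to the CUDA-specific call.
import torch


def sync_device() -> None:
    if hasattr(torch, "accelerator") and torch.accelerator.is_available():
        torch.accelerator.synchronize()  # dispatches to the current accelerator
    elif torch.cuda.is_available():
        torch.cuda.synchronize()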

test/test_matmul.py

Lines changed: 1 addition & 1 deletion
@@ -243,7 +243,7 @@ def matmul_bf16_packed_int4(
         C = torch.zeros((M, N), dtype=torch.float32, device=DEVICE)

         matmul_bf16_packed_int4(A, B_packed, C)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()

         self.assertTrue(torch.isfinite(C).all())
         self.assertFalse(torch.allclose(C, torch.zeros_like(C)))
