
Commit 92585b7

feat: support misc kernel launch

rainj-me authored and FlamingoPg committed
1 parent 389326b · commit 92585b7

File tree: 9 files changed (+52 −24 lines)

csrc/jit/device_runtime.hpp

Lines changed: 10 additions & 5 deletions

@@ -11,11 +11,7 @@ namespace deep_gemm {
 class DeviceRuntime {
     int num_sms = 0, tc_util = 0;
     std::shared_ptr<cudaDeviceProp> cached_prop;
-
-    // cuBLASLt utils
-    static constexpr size_t kCublasLtWorkspaceSize = 32 * 1024 * 1024;
-    cublasLtHandle_t cublaslt_handle{};
-    std::shared_ptr<torch::Tensor> cublaslt_workspace;
+    int compile_mode = 0;
 
     // cuBLASLt utils
     static constexpr size_t kCublasLtWorkspaceSize = 32 * 1024 * 1024;
@@ -82,6 +78,15 @@ class DeviceRuntime {
         return num_sms;
     }
 
+    void set_compile_mode(const int& new_compile_mode) {
+        DG_HOST_ASSERT(0 <= new_compile_mode and new_compile_mode <= 1);
+        compile_mode = new_compile_mode;
+    }
+
+    int get_compile_mode() {
+        return compile_mode;
+    }
+
     void set_tc_util(const int& new_tc_util) {
         DG_HOST_ASSERT(0 <= new_tc_util and new_tc_util <= 100);
         tc_util = new_tc_util;
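
The new compile_mode field defaults to 0 (compile and launch); set_compile_mode accepts only 0 or 1, and mode 1 puts the runtime into a compile-only state that the MAYBE_LAUNCH macro introduced below consults before every kernel launch.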

csrc/jit_kernels/impls/runtime_utils.hpp

Lines changed: 12 additions & 2 deletions

@@ -67,13 +67,17 @@ static CUtensorMapDataType aten_dtype_to_tensor_map_dtype(const at::ScalarType&
 }
 
 static CUtensorMapSwizzle mode_into_tensor_map_swizzle(const int& mode, const int& base) {
-#if CUDA_VERSION >= 12080
     if (base != 0) {
+#if CUDA_VERSION >= 12080
         DG_HOST_ASSERT(base == 32 and mode == 128);
         return CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B;
-    }
 #endif
+    }
+
+    switch (mode) {
+        case 0:
+        case 16: return CU_TENSOR_MAP_SWIZZLE_NONE;
         case 32: return CU_TENSOR_MAP_SWIZZLE_32B;
         case 64: return CU_TENSOR_MAP_SWIZZLE_64B;
         case 128: return CU_TENSOR_MAP_SWIZZLE_128B;
@@ -215,4 +219,10 @@ static CUtensorMap make_tma_sf_desc(const cute::UMMA::Major& major,
                            allow_tf32);
 }
 
+#define MAYBE_LAUNCH(EXPR) do { \
+    if (device_runtime->get_compile_mode() == 0) { \
+        (EXPR); \
+    } \
+} while (0)
+
 } // namespace deep_gemm
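
The MAYBE_LAUNCH(EXPR) macro is the heart of this commit: it expands to do { if (device_runtime->get_compile_mode() == 0) { (EXPR); } } while (0), the usual trick for making a macro behave like a single statement. Every kernel launch site below is wrapped in it, so in the default mode 0 behavior is unchanged, while mode 1 still JIT-compiles the kernel but skips the launch, warming the kernel cache without writing any outputs.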

csrc/jit_kernels/impls/sm100_bf16_gemm.hpp

Lines changed: 1 addition & 1 deletion

@@ -134,7 +134,7 @@ static void sm100_bf16_gemm(const torch::Tensor& a,
     };
     const auto& code = SM100BF16GemmRuntime::generate(args);
     const auto& runtime = compiler->build("sm100_bf16_gemm", code);
-    SM100BF16GemmRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM100BF16GemmRuntime::launch(runtime, args));
 }
 
 } // namespace deep_gemm

csrc/jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp

Lines changed: 4 additions & 4 deletions

@@ -148,7 +148,7 @@ static void sm100_fp8_gemm_1d1d(const torch::Tensor& a, const torch::Tensor& sfa
     };
     const auto& code = SM100FP8Gemm1D1DRuntime::generate(args);
     const auto& runtime = compiler->build("sm100_fp8_gemm_1d1d", code);
-    SM100FP8Gemm1D1DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM100FP8Gemm1D1DRuntime::launch(runtime, args));
 }
 
 static void sm100_m_grouped_fp8_gemm_contiguous_1d1d(const torch::Tensor& a, const torch::Tensor& sfa,
@@ -206,7 +206,7 @@ static void sm100_m_grouped_fp8_gemm_contiguous_1d1d(const torch::Tensor& a, con
     };
     const auto& code = SM100FP8Gemm1D1DRuntime::generate(args);
     const auto& runtime = compiler->build("sm100_m_grouped_fp8_gemm_contiguous_1d1d", code);
-    SM100FP8Gemm1D1DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM100FP8Gemm1D1DRuntime::launch(runtime, args));
 }
 
 static void sm100_m_grouped_fp8_gemm_masked_1d1d(const torch::Tensor& a, const torch::Tensor& sfa,
@@ -265,7 +265,7 @@ static void sm100_m_grouped_fp8_gemm_masked_1d1d(const torch::Tensor& a, const t
     };
     const auto& code = SM100FP8Gemm1D1DRuntime::generate(args);
     const auto& runtime = compiler->build("sm100_fp8_m_grouped_gemm_masked_1d1d", code);
-    SM100FP8Gemm1D1DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM100FP8Gemm1D1DRuntime::launch(runtime, args));
 }
 
 static void fp8_k_grouped_gemm_1d1d(const torch::Tensor& a, const torch::Tensor& sfa,
@@ -346,7 +346,7 @@ static void fp8_k_grouped_gemm_1d1d(const torch::Tensor& a, const torch::Tensor&
     };
     const auto& code = SM100FP8Gemm1D1DRuntime::generate(args);
     const auto& runtime = compiler->build("sm100_fp8_k_grouped_gemm_1d1d", code);
-    SM100FP8Gemm1D1DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM100FP8Gemm1D1DRuntime::launch(runtime, args));
 }
 
 } // namespace deep_gemm

csrc/jit_kernels/impls/sm100_fp8_gemm_1d2d.hpp

Lines changed: 3 additions & 3 deletions

@@ -127,7 +127,7 @@ static void sm100_fp8_gemm_1d2d(const torch::Tensor& a, const torch::Tensor& sfa
     };
     const auto& code = SM100FP8Gemm1D2DRuntime::generate(args);
     const auto& runtime = compiler->build("sm100_fp8_gemm_1d2d", code);
-    SM100FP8Gemm1D2DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM100FP8Gemm1D2DRuntime::launch(runtime, args));
 }
 
 static void sm100_m_grouped_fp8_gemm_contiguous_1d2d(const torch::Tensor& a, const torch::Tensor& sfa,
@@ -181,7 +181,7 @@ static void sm100_m_grouped_fp8_gemm_contiguous_1d2d(const torch::Tensor& a, con
     };
     const auto& code = SM100FP8Gemm1D2DRuntime::generate(args);
     const auto& runtime = compiler->build("sm100_m_grouped_fp8_gemm_contiguous_1d2d", code);
-    SM100FP8Gemm1D2DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM100FP8Gemm1D2DRuntime::launch(runtime, args));
 }
 
 static void sm100_m_grouped_fp8_gemm_masked_1d2d(const torch::Tensor& a, const torch::Tensor& sfa,
@@ -236,7 +236,7 @@ static void sm100_m_grouped_fp8_gemm_masked_1d2d(const torch::Tensor& a, const t
     };
     const auto& code = SM100FP8Gemm1D2DRuntime::generate(args);
     const auto& runtime = compiler->build("sm100_fp8_m_grouped_gemm_masked_1d2d", code);
-    SM100FP8Gemm1D2DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM100FP8Gemm1D2DRuntime::launch(runtime, args));
 }
 
 } // namespace deep_gemm

csrc/jit_kernels/impls/sm90_bf16_gemm.hpp

Lines changed: 3 additions & 3 deletions

@@ -115,7 +115,7 @@ static void sm90_bf16_gemm(const torch::Tensor& a,
     };
     const auto& code = SM90BF16GemmRuntime::generate(args);
     const auto& runtime = compiler->build("sm90_bf16_gemm", code);
-    SM90BF16GemmRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM90BF16GemmRuntime::launch(runtime, args));
 }
 
 static void sm90_m_grouped_bf16_gemm_contiguous(const torch::Tensor& a,
@@ -168,7 +168,7 @@ static void sm90_m_grouped_bf16_gemm_contiguous(const torch::Tensor& a,
     };
     const auto& code = SM90BF16GemmRuntime::generate(args);
     const auto& runtime = compiler->build("sm90_m_grouped_bf16_gemm_contiguous", code);
-    SM90BF16GemmRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM90BF16GemmRuntime::launch(runtime, args));
 }
 
 static void sm90_bf16_m_grouped_gemm_masked(const torch::Tensor& a,
@@ -222,7 +222,7 @@ static void sm90_bf16_m_grouped_gemm_masked(const torch::Tensor& a,
     };
     const auto& code = SM90BF16GemmRuntime::generate(args);
     const auto& runtime = compiler->build("sm90_bf16_m_grouped_gemm_masked", code);
-    SM90BF16GemmRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM90BF16GemmRuntime::launch(runtime, args));
 }
 
 } // namespace deep_gemm

csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp

Lines changed: 3 additions & 3 deletions

@@ -128,7 +128,7 @@ static void sm90_fp8_gemm_1d2d(const torch::Tensor& a, const torch::Tensor& sfa,
     };
     const auto& code = SM90FP8Gemm1D2DRuntime::generate(args);
     const auto& runtime = compiler->build("sm90_fp8_gemm_1d2d", code);
-    SM90FP8Gemm1D2DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM90FP8Gemm1D2DRuntime::launch(runtime, args));
 }
 
 static void sm90_m_grouped_fp8_gemm_contiguous_1d2d(const torch::Tensor& a, const torch::Tensor& sfa,
@@ -188,7 +188,7 @@ static void sm90_m_grouped_fp8_gemm_contiguous_1d2d(const torch::Tensor& a, cons
     };
     const auto& code = SM90FP8Gemm1D2DRuntime::generate(args);
     const auto& runtime = compiler->build("sm90_m_grouped_fp8_gemm_contiguous_1d2d", code);
-    SM90FP8Gemm1D2DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM90FP8Gemm1D2DRuntime::launch(runtime, args));
 }
 
 static void sm90_m_grouped_fp8_gemm_masked_1d2d(const torch::Tensor& a, const torch::Tensor& sfa,
@@ -249,7 +249,7 @@ static void sm90_m_grouped_fp8_gemm_masked_1d2d(const torch::Tensor& a, const to
     };
     const auto& code = SM90FP8Gemm1D2DRuntime::generate(args);
     const auto& runtime = compiler->build("sm90_fp8_m_grouped_gemm_masked_1d2d", code);
-    SM90FP8Gemm1D2DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM90FP8Gemm1D2DRuntime::launch(runtime, args));
 }
 
 } // namespace deep_gemm

csrc/python_api.cpp

Lines changed: 10 additions & 0 deletions

@@ -214,6 +214,16 @@ TORCH_LIBRARY(deep_gemm, m) {
         return static_cast<int64_t>(deep_gemm::device_runtime->get_num_sms());
     });
 
+    m.def("set_compile_mode(int new_compile_mode) -> ()");
+    m.impl("set_compile_mode", [](int64_t new_compile_mode) {
+        deep_gemm::device_runtime->set_compile_mode(static_cast<int>(new_compile_mode));
+    });
+
+    m.def("get_compile_mode() -> int");
+    m.impl("get_compile_mode", []() -> int64_t {
+        return static_cast<int64_t>(deep_gemm::device_runtime->get_compile_mode());
+    });
+
     m.def("set_tc_util(int new_tc_util) -> ()");
     m.impl("set_tc_util", [](int64_t new_tc_util) {
         deep_gemm::device_runtime->set_tc_util(static_cast<int>(new_tc_util));
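
These registrations expose the compile mode through the torch.ops namespace, mirroring the existing set_num_sms/set_tc_util plumbing. A minimal sketch of driving the raw ops (assuming the extension is built and importable; values outside [0, 1] trip the DG_HOST_ASSERT in DeviceRuntime::set_compile_mode):

    import torch
    import deep_gemm  # importing the package loads the extension and registers the ops

    torch.ops.deep_gemm.set_compile_mode(1)             # 1: compile kernels, skip launches
    assert torch.ops.deep_gemm.get_compile_mode() == 1
    torch.ops.deep_gemm.set_compile_mode(0)             # 0: default compile-and-launch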

deep_gemm/__init__.py

Lines changed: 6 additions & 3 deletions

@@ -44,13 +44,16 @@ def _ensure_initialized() -> None:
 
 
 def _wrap_op(name: str):
+    func = getattr(torch.ops.deep_gemm, name)
     def _fn(*args, **kwargs):
         _ensure_initialized()
-        return getattr(torch.ops.deep_gemm, name)(*args, **kwargs)
+        return func(*args, **kwargs)
     return _fn
 
 set_num_sms = _wrap_op('set_num_sms')
 get_num_sms = _wrap_op('get_num_sms')
+set_compile_mode = _wrap_op('set_compile_mode')
+get_compile_mode = _wrap_op('get_compile_mode')
 set_tc_util = _wrap_op('set_tc_util')
 get_tc_util = _wrap_op('get_tc_util')
 
@@ -121,10 +124,10 @@ def _verify_ops_loaded():
         'cublaslt_gemm_nt', 'cublaslt_gemm_nn',
         'cublaslt_gemm_tn', 'cublaslt_gemm_tt',
     ]
-
+
     available_ops = list(torch.ops.deep_gemm.__dict__.keys())
     missing_ops = [op for op in expected_ops if op not in available_ops]
-
+
     if missing_ops:
         print(f"Warning: Missing operations: {missing_ops}")
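
With set_compile_mode/get_compile_mode exported (and _wrap_op now resolving each op handle once instead of on every call), a compile-only warm-up pass might look like the sketch below. bf16_gemm_nt and the tensor shapes are illustrative stand-ins, not part of this diff; any entry point whose launch is wrapped in MAYBE_LAUNCH is gated the same way:

    import torch
    import deep_gemm

    # Hypothetical inputs for a single NT GEMM warm-up.
    a = torch.randn(128, 256, device='cuda', dtype=torch.bfloat16)
    b = torch.randn(512, 256, device='cuda', dtype=torch.bfloat16)
    d = torch.empty(128, 512, device='cuda', dtype=torch.bfloat16)

    deep_gemm.set_compile_mode(1)    # JIT-compile only; kernel launches are skipped
    deep_gemm.bf16_gemm_nt(a, b, d)  # fills the JIT cache; d is left unwritten
    deep_gemm.set_compile_mode(0)    # restore the default compile-and-launch mode
    deep_gemm.bf16_gemm_nt(a, b, d)  # runs the now-cached kernel for real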
