feat: support libtorch

FlamingoPg · FlamingoPg · commit 389326bdb724 · 2025-11-17T05:55:57.000Z
diff --git a/csrc/apis/attention.hpp b/csrc/apis/attention.hpp
@@ -219,22 +219,4 @@ static torch::Tensor fp8_paged_mqa_logits(const torch::Tensor& q,
     return logits;
 }
 
-static void register_apis(pybind11::module_& m) {
-    m.def("fp8_gemm_nt_skip_head_mid", &fp8_gemm_nt_skip_head_mid,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("head_splits"),
-          py::arg("recipe") = std::nullopt,
-          py::arg("compiled_dims") = "nk",
-          py::arg("disable_ue8m0_cast") = false);
-    m.def("fp8_mqa_logits", &fp8_mqa_logits,
-      py::arg("q"), py::arg("kv"), py::arg("weights"),
-      py::arg("cu_seq_len_k_start"), py::arg("cu_seq_len_k_end"),
-      py::arg("clean_logits") = true);
-    m.def("get_paged_mqa_logits_metadata", &get_paged_mqa_logits_metadata,
-          py::arg("context_lens"), py::arg("block_kv"), py::arg("num_sms"));
-    m.def("fp8_paged_mqa_logits", &fp8_paged_mqa_logits,
-          py::arg("q"), py::arg("kv_cache"), py::arg("weights"),
-          py::arg("context_lens"), py::arg("block_table"), py::arg("schedule_meta"),
-          py::arg("max_context_len"), py::arg("clean_logits") = false);
-}
-
 } // namespace deep_gemm::attention
diff --git a/csrc/apis/einsum.hpp b/csrc/apis/einsum.hpp
@@ -1,8 +1,5 @@
 #pragma once
 
-#include <pybind11/pybind11.h>
-#include <torch/python.h>
-
 #include "../utils/exception.hpp"
 #include "../utils/format.hpp"
 #include "../utils/layout.hpp"
@@ -106,10 +103,4 @@ static void einsum(const std::string& expr,
     }
 }
 
-static void register_apis(pybind11::module_& m) {
-    m.def("einsum", &einsum,
-          py::arg("expr"), py::arg("a"), py::arg("b"),
-          py::arg("d"), py::arg("c") = std::nullopt);
-}
-
 } // namespace deep_gemm::einsum
diff --git a/csrc/apis/gemm.hpp b/csrc/apis/gemm.hpp
@@ -500,84 +500,4 @@ static void cublaslt_gemm_tt(const torch::Tensor& a, const torch::Tensor& b,
     cublaslt_gemm_nt(a.transpose(0, 1), b, d, c);
 }
 
-static void register_apis(pybind11::module_& m) {
-    // FP8 GEMMs
-     m.def("fp8_gemm_nt", &fp8_gemm_nt,
-          py::arg("a"), py::arg("b"), py::arg("d"),
-          py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt,
-          py::arg("compiled_dims") = "nk",
-          py::arg("disable_ue8m0_cast") = false);
-    m.def("fp8_gemm_nn", &fp8_gemm_nn,
-          py::arg("a"), py::arg("b"), py::arg("d"),
-          py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt,
-          py::arg("compiled_dims") = "nk",
-          py::arg("disable_ue8m0_cast") = false);
-    m.def("fp8_gemm_tn", &fp8_gemm_tn,
-          py::arg("a"), py::arg("b"), py::arg("d"),
-          py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt,
-          py::arg("compiled_dims") = "mn",
-          py::arg("disable_ue8m0_cast") = false);
-    m.def("fp8_gemm_tt", &fp8_gemm_tt,
-          py::arg("a"), py::arg("b"), py::arg("d"),
-          py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt,
-          py::arg("compiled_dims") = "mn",
-          py::arg("disable_ue8m0_cast") = false);
-    m.def("m_grouped_fp8_gemm_nt_contiguous", &m_grouped_fp8_gemm_nt_contiguous,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("m_indices"),
-          py::arg("recipe") = std::nullopt, py::arg("compiled_dims") = "nk",
-          py::arg("disable_ue8m0_cast") = false);
-    m.def("m_grouped_fp8_gemm_nn_contiguous", &m_grouped_fp8_gemm_nn_contiguous,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("m_indices"),
-          py::arg("recipe") = std::nullopt, py::arg("compiled_dims") = "nk",
-          py::arg("disable_ue8m0_cast") = false);
-    m.def("m_grouped_fp8_gemm_nt_masked", &m_grouped_fp8_gemm_nt_masked,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("masked_m"),
-          py::arg("expected_m"), py::arg("recipe") = std::nullopt,
-          py::arg("compiled_dims") = "nk", py::arg("disable_ue8m0_cast") = false);
-    m.def("k_grouped_fp8_gemm_tn_contiguous", &k_grouped_fp8_gemm_tn_contiguous,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("ks"),
-          py::arg("ks_tensor"), py::arg("c") = std::nullopt,
-          py::arg("recipe") = std::make_tuple(1, 1, 128),
-          py::arg("compiled_dims") = "mn");
-    m.def("k_grouped_fp8_gemm_nt_contiguous", &k_grouped_fp8_gemm_nt_contiguous,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("ks"),
-          py::arg("ks_tensor"), py::arg("c") = std::nullopt,
-          py::arg("recipe") = std::make_tuple(1, 1, 128),
-          py::arg("compiled_dims") = "mn");
-
-    // BF16 GEMMs
-    m.def("bf16_gemm_nt", &bf16_gemm_nt,
-          py::arg("a"), py::arg("b"), py::arg("d"),
-          py::arg("c") = std::nullopt,
-          py::arg("compiled_dims") = "nk");
-    m.def("bf16_gemm_nn", &bf16_gemm_nn,
-          py::arg("a"), py::arg("b"), py::arg("d"),
-          py::arg("c") = std::nullopt,
-          py::arg("compiled_dims") = "nk");
-    m.def("bf16_gemm_tn", &bf16_gemm_tn,
-          py::arg("a"), py::arg("b"), py::arg("d"),
-          py::arg("c") = std::nullopt,
-          py::arg("compiled_dims") = "mn");
-    m.def("bf16_gemm_tt", &bf16_gemm_tt,
-          py::arg("a"), py::arg("b"), py::arg("d"),
-          py::arg("c") = std::nullopt,
-          py::arg("compiled_dims") = "mn");
-    m.def("m_grouped_bf16_gemm_nt_contiguous", &m_grouped_bf16_gemm_nt_contiguous,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("m_indices"),
-          py::arg("compiled_dims") = "nk");
-    m.def("m_grouped_bf16_gemm_nt_masked", &m_grouped_bf16_gemm_nt_masked,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("masked_m"),
-          py::arg("expected_m"), py::arg("compiled_dims") = "nk");
-
-    // cuBLASLt GEMMs
-    m.def("cublaslt_gemm_nt", &cublaslt_gemm_nt,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("c") = std::nullopt);
-    m.def("cublaslt_gemm_nn", &cublaslt_gemm_nn,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("c") = std::nullopt);
-    m.def("cublaslt_gemm_tn", &cublaslt_gemm_tn,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("c") = std::nullopt);
-    m.def("cublaslt_gemm_tt", &cublaslt_gemm_tt,
-          py::arg("a"), py::arg("b"), py::arg("d"), py::arg("c") = std::nullopt);
-}
-
 } // namespace deep_gemm::gemm
diff --git a/csrc/apis/layout.hpp b/csrc/apis/layout.hpp
@@ -69,17 +69,4 @@ static torch::Tensor transform_k_grouped_sf_into_required_layout(const torch::Te
     DG_HOST_UNREACHABLE("Unknown cases");
 }
 
-static void register_apis(pybind11::module_& m) {
-    m.def("transform_sf_into_required_layout", &transform_sf_into_required_layout,
-      py::arg("sf"), py::arg("mn"), py::arg("k"), py::arg("recipe"),
-      py::arg("num_groups") = std::nullopt, py::arg("is_sfa") = false,
-      py::arg("disable_ue8m0_cast") = false);
-
-    m.def("get_tma_aligned_size", &get_tma_aligned_size);
-    m.def("get_mk_alignment_for_contiguous_layout", &get_mk_alignment_for_contiguous_layout);
-    m.def("get_mn_major_tma_aligned_tensor", &get_mn_major_tma_aligned_tensor);
-    m.def("get_mn_major_tma_aligned_packed_ue8m0_tensor", &get_mn_major_tma_aligned_packed_ue8m0_tensor);
-    m.def("get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor", &get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor);
-}
-
 } // namespace deep_gemm::layout
diff --git a/csrc/apis/runtime.hpp b/csrc/apis/runtime.hpp
@@ -5,24 +5,6 @@
 
 namespace deep_gemm::runtime {
 
-static void register_apis(pybind11::module_& m) {
-    m.def("set_num_sms", [&](const int& new_num_sms) {
-        device_runtime->set_num_sms(new_num_sms);
-    });
-    m.def("get_num_sms", [&]() {
-       return device_runtime->get_num_sms();
-    });
-    m.def("set_tc_util", [&](const int& new_tc_util) {
-        device_runtime->set_tc_util(new_tc_util);
-    });
-    m.def("get_tc_util", [&]() {
-        return device_runtime->get_tc_util();
-    });
-
-    m.def("init", [&](const std::string& library_root_path, const std::string& cuda_home_path_by_python) {
-        Compiler::prepare_init(library_root_path, cuda_home_path_by_python);
-        KernelRuntime::prepare_init(cuda_home_path_by_python);
-    });
-}
+// `init` and the other runtime APIs formerly registered here are now exposed via TORCH_LIBRARY in python_api.cpp
 
 } // namespace deep_gemm::runtime
diff --git a/csrc/jit/device_runtime.hpp b/csrc/jit/device_runtime.hpp
@@ -17,6 +17,11 @@ class DeviceRuntime {
     cublasLtHandle_t cublaslt_handle{};
     std::shared_ptr<torch::Tensor> cublaslt_workspace;
 
+    // cuBLASLt utils
+    static constexpr size_t kCublasLtWorkspaceSize = 32 * 1024 * 1024;
+    cublasLtHandle_t cublaslt_handle{};
+    std::shared_ptr<torch::Tensor> cublaslt_workspace;
+
 public:
     explicit DeviceRuntime() {
         cublaslt_workspace = std::make_shared<torch::Tensor>(torch::empty({kCublasLtWorkspaceSize}, dtype(torch::kByte).device(at::kCUDA)));
diff --git a/csrc/jit_kernels/impls/runtime_utils.hpp b/csrc/jit_kernels/impls/runtime_utils.hpp
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <cuda.h>
-#include <torch/python.h>
 
 #include "../../utils/math.hpp"
 #include "../heuristics/sm90.hpp"
@@ -75,10 +74,6 @@ static CUtensorMapSwizzle mode_into_tensor_map_swizzle(const int& mode, const in
     }
 #endif
 
-    DG_HOST_ASSERT(base == 0);
-    switch (mode) {
-        case   0:
-        case  16: return CU_TENSOR_MAP_SWIZZLE_NONE;
         case  32: return CU_TENSOR_MAP_SWIZZLE_32B;
         case  64: return CU_TENSOR_MAP_SWIZZLE_64B;
         case 128: return CU_TENSOR_MAP_SWIZZLE_128B;
diff --git a/csrc/jit_kernels/impls/sm100_bf16_gemm.hpp b/csrc/jit_kernels/impls/sm100_bf16_gemm.hpp
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <torch/python.h>
-
 #include "../../jit/compiler.hpp"
 #include "../../jit/device_runtime.hpp"
 #include "../../jit/kernel_runtime.hpp"
diff --git a/csrc/jit_kernels/impls/sm100_bmk_bnk_mn.hpp b/csrc/jit_kernels/impls/sm100_bmk_bnk_mn.hpp
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <torch/python.h>
-
 #include "../../jit/compiler.hpp"
 #include "../../jit/device_runtime.hpp"
 #include "../../jit/kernel_runtime.hpp"
@@ -134,4 +132,4 @@ static void sm100_bmn_bnk_mn_gemm(const torch::Tensor &a,
     SM100BmkBnkMnRuntime::launch(runtime, args);
 }
 
-} // namespace deep_gemm
+} // namespace deep_gemm
diff --git a/csrc/jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp b/csrc/jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <torch/python.h>
-
 #include "../../jit/compiler.hpp"
 #include "../../jit/device_runtime.hpp"
 #include "../../jit/kernel_runtime.hpp"
diff --git a/csrc/jit_kernels/impls/sm100_fp8_gemm_1d2d.hpp b/csrc/jit_kernels/impls/sm100_fp8_gemm_1d2d.hpp
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <torch/python.h>
-
 #include "../../jit/compiler.hpp"
 #include "../../jit/device_runtime.hpp"
 #include "../../jit/kernel_runtime.hpp"
diff --git a/csrc/jit_kernels/impls/sm90_bf16_gemm.hpp b/csrc/jit_kernels/impls/sm90_bf16_gemm.hpp
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <torch/python.h>
-
 #include "../../jit/compiler.hpp"
 #include "../../jit/kernel_runtime.hpp"
 #include "../../utils/exception.hpp"
diff --git a/csrc/jit_kernels/impls/sm90_bmk_bnk_mn.hpp b/csrc/jit_kernels/impls/sm90_bmk_bnk_mn.hpp
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <torch/python.h>
-
 #include "../../jit/compiler.hpp"
 #include "../../jit/device_runtime.hpp"
 #include "../../jit/kernel_runtime.hpp"
diff --git a/csrc/jit_kernels/impls/sm90_fp8_gemm_1d1d.hpp b/csrc/jit_kernels/impls/sm90_fp8_gemm_1d1d.hpp
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <torch/python.h>
-
 #include "../../jit/compiler.hpp"
 #include "../../jit/device_runtime.hpp"
 #include "../../jit/kernel_runtime.hpp"
@@ -133,7 +131,7 @@ static void sm90_fp8_gemm_1d1d(const torch::Tensor& a, const torch::Tensor& sfa,
     const auto& code = SM90FP8Gemm1D1DRuntime::generate(args);
     const auto& runtime = compiler->build("sm90_fp8_gemm_1d1d", code);
 
-    SM90FP8Gemm1D1DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM90FP8Gemm1D1DRuntime::launch(runtime, args));
 }
 
 static void sm90_fp8_k_grouped_gemm_1d1d(const torch::Tensor& a, const torch::Tensor& sfa,
@@ -208,7 +206,7 @@ static void sm90_fp8_k_grouped_gemm_1d1d(const torch::Tensor& a, const torch::Te
     const auto& code = SM90FP8Gemm1D1DRuntime::generate(args);
     const auto& runtime = compiler->build("sm90_fp8_gemm_1d1d", code);
 
-    SM90FP8Gemm1D1DRuntime::launch(runtime, args);
+    MAYBE_LAUNCH(SM90FP8Gemm1D1DRuntime::launch(runtime, args));
 }
 
 } // namespace deep_gemm
diff --git a/csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp b/csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <torch/python.h>
-
 #include "../../jit/compiler.hpp"
 #include "../../jit/device_runtime.hpp"
 #include "../../jit/kernel_runtime.hpp"
diff --git a/csrc/jit_kernels/impls/smxx_layout.hpp b/csrc/jit_kernels/impls/smxx_layout.hpp
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <torch/python.h>
-
 #include "../../jit/kernel_runtime.hpp"
 #include "../../utils/exception.hpp"
 #include "../../utils/format.hpp"
diff --git a/csrc/python_api.cpp b/csrc/python_api.cpp
diff --git a/csrc/utils/layout.hpp b/csrc/utils/layout.hpp
diff --git a/csrc/utils/math.hpp b/csrc/utils/math.hpp
diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py
diff --git a/deep_gemm/utils/layout.py b/deep_gemm/utils/layout.py
diff --git a/tests/test_einsum.py b/tests/test_einsum.py