From a910908b88a1d1c066c05b10578c90ea1512db2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 15 Oct 2025 15:01:15 +0000 Subject: [PATCH] Remove examples/activation This has become a standalone kernel in kernels-community. It also precedes the relu example, which is much more minimal and easier to understand. --- .github/workflows/build_kernel.yaml | 13 +- README.md | 4 +- docs/docker.md | 14 +- docs/nix.md | 15 +- docs/writing-kernels.md | 2 +- examples/activation/LICENSE | 201 ----------------- examples/activation/README.md | 5 - .../activation/activation_kernels.cu | 204 ------------------ examples/activation/activation/cuda_compat.h | 49 ----- .../activation/activation/dispatch_utils.h | 35 --- examples/activation/build.toml | 18 -- examples/activation/flake.nix | 18 -- examples/activation/tests/__init__.py | 0 examples/activation/tests/kernels/__init__.py | 0 .../tests/kernels/allclose_default.py | 14 -- .../tests/kernels/test_activation.py | 139 ------------ examples/activation/tests/kernels/utils.py | 73 ------- .../torch-ext/activation/__init__.py | 47 ---- .../activation/torch-ext/torch_binding.cpp | 37 ---- examples/activation/torch-ext/torch_binding.h | 18 -- examples/relu-backprop-compile/build.toml | 4 +- examples/relu-specific-torch/build.toml | 4 +- examples/relu/build.toml | 8 +- tests/Dockerfile.test-kernel | 8 +- 24 files changed, 28 insertions(+), 902 deletions(-) delete mode 100644 examples/activation/LICENSE delete mode 100644 examples/activation/README.md delete mode 100644 examples/activation/activation/activation_kernels.cu delete mode 100644 examples/activation/activation/cuda_compat.h delete mode 100644 examples/activation/activation/dispatch_utils.h delete mode 100644 examples/activation/build.toml delete mode 100644 examples/activation/flake.nix delete mode 100644 examples/activation/tests/__init__.py delete mode 100644 examples/activation/tests/kernels/__init__.py delete mode 100644 examples/activation/tests/kernels/allclose_default.py delete mode 100644 examples/activation/tests/kernels/test_activation.py delete mode 100644 examples/activation/tests/kernels/utils.py delete mode 100644 examples/activation/torch-ext/activation/__init__.py delete mode 100644 examples/activation/torch-ext/torch_binding.cpp delete mode 100644 examples/activation/torch-ext/torch_binding.h diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml index ad3327db..5a7f9462 100644 --- a/.github/workflows/build_kernel.yaml +++ b/.github/workflows/build_kernel.yaml @@ -28,21 +28,16 @@ jobs: USER: runner - name: Nix info run: nix-shell -p nix-info --run "nix-info -m" - - name: Build activation kernel - run: ( cd examples/activation && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) - - name: Copy activation kernel - run: cp -rL examples/activation/result activation-kernel + - name: Build relu kernel + run: ( cd examples/relu && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) + - name: Copy relu kernel + run: cp -rL examples/relu/result relu-kernel - name: Build cutlass GEMM kernel run: ( cd examples/cutlass-gemm && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) - name: Copy cutlass GEMM kernel run: cp -rL examples/cutlass-gemm/result cutlass-gemm-kernel - - name: Build relu kernel - run: ( cd examples/relu && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) - - name: Copy relu kernel - run: cp -rL examples/relu/result relu-kernel - - name: Build relu-backprop-compile kernel run: ( cd examples/relu-backprop-compile && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) - name: Copy relu-backprop-compile kernel diff --git a/README.md b/README.md index e901306c..5ea826e2 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ nix run nixpkgs#cachix -- use huggingface Then quick start a build with: ```bash -cd examples/activation +cd examples/relu nix run .#build-and-copy \ --max-jobs 2 \ --cores 8 \ @@ -51,7 +51,7 @@ We also provide Docker containers for CI builds. For a quick build: ```bash # Using the prebuilt container -cd examples/activation +cd examples/relu docker run --rm \ --mount type=bind,source=$(pwd),target=/kernelcode \ -w /kernelcode ghcr.io/huggingface/kernel-builder:main build diff --git a/docs/docker.md b/docs/docker.md index 975dccb4..5a48ff1f 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -28,8 +28,8 @@ installed. We provide a Docker image with which you can build a kernel: ```bash -# navigate to the activation directory -cd examples/activation +# navigate to the relu directory +cd examples/relu # then run the following command to build the kernel docker run --rm \ @@ -39,7 +39,7 @@ docker run --rm \ ``` This will build the kernel and save the output in the `build` directory in -the activation folder. +the relu folder. ## CLI Interface @@ -55,10 +55,10 @@ The kernel builder includes a command-line interface for easier interaction. The ### Examples ```bash -# Build the example activation kernel from the root of the repository +# Build the example relu kernel from the root of the repository docker run --rm \ -v $(pwd):/kernel-builder \ - -w /kernel-builder/examples/activation \ + -w /kernel-builder/examples/relu \ ghcr.io/huggingface/kernel-builder:main \ build @@ -194,11 +194,11 @@ The kernel can then be imported as a Python module: ```python import torch -import activation +import relu x = torch.randn(10, 10) out = torch.empty_like(x) -activation.silu_and_mul(out, x) +relu.relu(x, out) print(out) ``` diff --git a/docs/nix.md b/docs/nix.md index b5fa1c93..97df7857 100644 --- a/docs/nix.md +++ b/docs/nix.md @@ -48,7 +48,7 @@ A kernel that has a `flake.nix` file can be built with the `build-and-copy` command. For example: ```bash -cd examples/activation +cd examples/relu nix run .#build-and-copy -L ``` @@ -94,7 +94,7 @@ with the kernel in Python's search path. This makes it more convenient to run tests: ```bash -cd examples/activation +cd examples/relu nix develop -L .#test python -m pytest tests ``` @@ -142,14 +142,3 @@ this check enabled, as it is one of the checks that validates that a kernel is compliant. This option is primarily intended for kernels with `triton.autotune` decorators, which can fail because there is no GPU available in the build sandbox. - -## Building a kernel without `flake.nix` - -If a kernels source directory does not have a `flake.nix` file, you can build the -kernel using the `buildTorchExtensionBundle` function from the kernel builder -itself: - -```bash -cd examples/activation -nix build --impure --expr 'with import ../..; lib.x86_64-linux.buildTorchExtensionBundle ./.' -L -``` diff --git a/docs/writing-kernels.md b/docs/writing-kernels.md index 70e9aa5e..559fd69b 100644 --- a/docs/writing-kernels.md +++ b/docs/writing-kernels.md @@ -77,7 +77,7 @@ src = [ "torch-ext/torch_binding.h" ] -[kernel.activation] +[kernel.relu] backend = "cuda" src = [ "relu_kernel/relu.cu", diff --git a/examples/activation/LICENSE b/examples/activation/LICENSE deleted file mode 100644 index 261eeb9e..00000000 --- a/examples/activation/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/examples/activation/README.md b/examples/activation/README.md deleted file mode 100644 index e05afcb0..00000000 --- a/examples/activation/README.md +++ /dev/null @@ -1,5 +0,0 @@ -## Activation - -Activation kernels from [vLLM](https://github.com/vllm-project/vllm/blob/main/csrc/activation_kernels.cu). - -Copyright 2023-2024, the vLLM team. diff --git a/examples/activation/activation/activation_kernels.cu b/examples/activation/activation/activation_kernels.cu deleted file mode 100644 index 839dc36b..00000000 --- a/examples/activation/activation/activation_kernels.cu +++ /dev/null @@ -1,204 +0,0 @@ -#include -#include -#include - -#include - -#include "cuda_compat.h" -#include "dispatch_utils.h" - -namespace vllm { - -// Activation and gating kernel template. -template -__global__ void act_and_mul_kernel( - scalar_t* __restrict__ out, // [..., d] - const scalar_t* __restrict__ input, // [..., 2, d] - const int d) { - const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); - const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); - out[token_idx * d + idx] = ACT_FN(x) * y; - } -} - -template -__device__ __forceinline__ T silu_kernel(const T& x) { - // x * sigmoid(x) - return (T)(((float)x) / (1.0f + expf((float)-x))); -} - -template -__device__ __forceinline__ T gelu_kernel(const T& x) { - // Equivalent to PyTorch GELU with 'none' approximation. - // Refer to: - // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38 - const float f = (float)x; - constexpr float ALPHA = M_SQRT1_2; - return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA))); -} - -template -__device__ __forceinline__ T gelu_tanh_kernel(const T& x) { - // Equivalent to PyTorch GELU with 'tanh' approximation. - // Refer to: - // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30 - const float f = (float)x; - constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f; - constexpr float KAPPA = 0.044715; - float x_cube = f * f * f; - float inner = BETA * (f + KAPPA * x_cube); - return (T)(0.5f * f * (1.0f + ::tanhf(inner))); -} - -} // namespace vllm - -// Launch activation and gating kernel. -#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \ - int d = input.size(-1) / 2; \ - int64_t num_tokens = input.numel() / input.size(-1); \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), "act_and_mul_kernel", [&] { \ - vllm::act_and_mul_kernel> \ - <<>>(out.data_ptr(), \ - input.data_ptr(), d); \ - }); - -void silu_and_mul(torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., 2 * d] -{ - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); -} - -void gelu_and_mul(torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., 2 * d] -{ - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); -} - -void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., 2 * d] -{ - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel); -} - -namespace vllm { - -template -__device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) { - const float f = (float)x; - return (T)(f > threshold ? f : 0.0f); -} - -template -__global__ void act_and_mul_kernel_with_param( - scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d, - const float param) { - const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); - const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); - out[token_idx * d + idx] = ACT_FN(x, param) * y; - } -} - -} // namespace vllm - -#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ - int d = input.size(-1) / 2; \ - int64_t num_tokens = input.numel() / input.size(-1); \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \ - vllm::act_and_mul_kernel_with_param> \ - <<>>(out.data_ptr(), \ - input.data_ptr(), d, \ - PARAM); \ - }); - -void fatrelu_and_mul(torch::Tensor& out, // [..., d], - torch::Tensor& input, // [..., 2 * d] - double threshold) { - LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); -} -namespace vllm { - -// Element-wise activation kernel template. -template -__global__ void activation_kernel( - scalar_t* __restrict__ out, // [..., d] - const scalar_t* __restrict__ input, // [..., d] - const int d) { - const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]); - out[token_idx * d + idx] = ACT_FN(x); - } -} - -} // namespace vllm - -// Launch element-wise activation kernel. -#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ - int d = input.size(-1); \ - int64_t num_tokens = input.numel() / d; \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \ - vllm::activation_kernel> \ - <<>>(out.data_ptr(), \ - input.data_ptr(), d); \ - }); - -namespace vllm { - -template -__device__ __forceinline__ T gelu_new_kernel(const T& x) { - const float x3 = (float)(x * x * x); - const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3)))); - return ((T)0.5) * x * (((T)1.0) + t); -} - -template -__device__ __forceinline__ T gelu_fast_kernel(const T& x) { - const float f = (float)x; - const T t = - (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x)); - return ((T)0.5) * x * (((T)1.0) + t); -} - -template -__device__ __forceinline__ T gelu_quick_kernel(const T& x) { - // x * sigmoid(1.702 * x) - return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x))); -} - -} // namespace vllm - -void gelu_new(torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., d] -{ - LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel); -} - -void gelu_fast(torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., d] -{ - LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); -} - -void gelu_quick(torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., d] -{ - LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel); -} diff --git a/examples/activation/activation/cuda_compat.h b/examples/activation/activation/cuda_compat.h deleted file mode 100644 index 82e55613..00000000 --- a/examples/activation/activation/cuda_compat.h +++ /dev/null @@ -1,49 +0,0 @@ -#pragma once - -#ifdef USE_ROCM - #include -#endif - -#ifndef USE_ROCM - #define WARP_SIZE 32 -#else - #define WARP_SIZE warpSize -#endif - -#ifndef USE_ROCM - #define VLLM_LDG(arg) __ldg(arg) -#else - #define VLLM_LDG(arg) *(arg) -#endif - -#ifndef USE_ROCM - #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \ - __shfl_xor_sync(uint32_t(-1), var, lane_mask) - #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ - __shfl_xor_sync(uint32_t(-1), var, lane_mask, width) -#else - #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) - #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ - __shfl_xor(var, lane_mask, width) -#endif - -#ifndef USE_ROCM - #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) -#else - #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) -#endif - -#ifndef USE_ROCM - #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \ - __shfl_down_sync(uint32_t(-1), var, lane_delta) -#else - #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta) -#endif - -#ifndef USE_ROCM - #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ - cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) -#else - #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ - hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) -#endif diff --git a/examples/activation/activation/dispatch_utils.h b/examples/activation/activation/dispatch_utils.h deleted file mode 100644 index a634e1c3..00000000 --- a/examples/activation/activation/dispatch_utils.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Adapted from - * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h - */ -#pragma once - -#include - -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) - -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) - -#define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) - -#define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, \ - VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) - -#define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) - -#define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) diff --git a/examples/activation/build.toml b/examples/activation/build.toml deleted file mode 100644 index 0108f3f8..00000000 --- a/examples/activation/build.toml +++ /dev/null @@ -1,18 +0,0 @@ -[general] -name = "activation" -universal = false - -[torch] -src = [ - "torch-ext/torch_binding.cpp", - "torch-ext/torch_binding.h", -] - -[kernel.activation] -backend = "cuda" -depends = ["torch"] -src = [ - "activation/activation_kernels.cu", - "activation/cuda_compat.h", - "activation/dispatch_utils.h", -] diff --git a/examples/activation/flake.nix b/examples/activation/flake.nix deleted file mode 100644 index 98671d63..00000000 --- a/examples/activation/flake.nix +++ /dev/null @@ -1,18 +0,0 @@ -{ - description = "Flake for activation kernels"; - - inputs = { - kernel-builder.url = "path:../.."; - }; - - outputs = - { - self, - kernel-builder, - }: - - kernel-builder.lib.genFlakeOutputs { - inherit self; - path = ./.; - }; -} diff --git a/examples/activation/tests/__init__.py b/examples/activation/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/activation/tests/kernels/__init__.py b/examples/activation/tests/kernels/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/activation/tests/kernels/allclose_default.py b/examples/activation/tests/kernels/allclose_default.py deleted file mode 100644 index 80eb1eeb..00000000 --- a/examples/activation/tests/kernels/allclose_default.py +++ /dev/null @@ -1,14 +0,0 @@ -import torch - -# Reference default values of atol and rtol are from -# https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 -default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} -default_rtol = {torch.float16: 1e-3, torch.bfloat16: 1.6e-2, torch.float: 1.3e-6} - - -def get_default_atol(output) -> float: - return default_atol[output.dtype] - - -def get_default_rtol(output) -> float: - return default_rtol[output.dtype] diff --git a/examples/activation/tests/kernels/test_activation.py b/examples/activation/tests/kernels/test_activation.py deleted file mode 100644 index 2f67a94f..00000000 --- a/examples/activation/tests/kernels/test_activation.py +++ /dev/null @@ -1,139 +0,0 @@ -import math -import random -from typing import Type - -import activation -import pytest -import torch -import torch.nn.functional as F - -from .utils import opcheck -from .allclose_default import get_default_atol, get_default_rtol - -DTYPES = [torch.half, torch.bfloat16, torch.float] -NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing -D = [512, 13824] # Arbitrary values for testing -SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] - - -def gelu_fast(x: torch.Tensor) -> torch.Tensor: - return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) - - -def gelu_new(x: torch.Tensor) -> torch.Tensor: - c = math.sqrt(2.0 / math.pi) - return 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * torch.pow(x, 3.0)))) - - -def gelu_quick(x: torch.Tensor) -> torch.Tensor: - return x * torch.sigmoid(1.702 * x) - - -def fatrelu_and_mul(x: torch.Tensor, threshold: float) -> torch.Tensor: - d = x.shape[-1] // 2 - x1 = x[..., :d] - x2 = x[..., d:] - x1 = F.threshold(x1, threshold, 0.0) - return x1 * x2 - - -def silu_and_mul(x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - return F.silu(x[..., :d]) * x[..., d:] - - -def gelu_and_mul(x: torch.Tensor, approximate: str) -> torch.Tensor: - d = x.shape[-1] // 2 - return F.gelu(x[..., :d], approximate=approximate) * x[..., d:] - - -@pytest.mark.parametrize("activation_name", ["silu", "gelu", "gelu_tanh", "fatrelu"]) -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("d", D) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_act_and_mul( - activation_name: str, - num_tokens: int, - d: int, - dtype: torch.dtype, - seed: int, - device: str, -) -> None: - random.seed(seed) - torch.manual_seed(seed) - torch.set_default_device(device) - x = torch.randn(num_tokens, 2 * d, dtype=dtype) - if activation_name == "silu": - torch_fn = silu_and_mul - fn = activation.silu_and_mul - op = activation.ops.silu_and_mul - elif activation_name == "gelu": - torch_fn = lambda x: gelu_and_mul(x, "none") - fn = activation.gelu_and_mul - op = activation.ops.gelu_and_mul - elif activation_name == "gelu_tanh": - torch_fn = lambda x: gelu_and_mul(x, "tanh") - fn = activation.gelu_tanh_and_mul - op = activation.ops.gelu_tanh_and_mul - elif activation_name == "fatrelu": - threshold = random.uniform(0, 1) - torch_fn = lambda x: fatrelu_and_mul(x, threshold) - fn = lambda out, x: activation.fatrelu_and_mul(out, x, threshold) - op = activation.ops.fatrelu_and_mul - - out_shape = x.shape[:-1] + (x.shape[-1] // 2,) - out = torch.empty(out_shape, dtype=x.dtype, device=x.device) - out = fn(out, x) - ref_out = torch_fn(x) - - # The SiLU, GELU and FatReLU implementations are equivalent to the native - # PyTorch implementations, so we can do exact comparison. - torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) - - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - if activation_name == "fatrelu": - opcheck(op, (out, x, threshold)) - else: - opcheck(op, (out, x)) - - -@pytest.mark.parametrize( - "activation_fns", - [ - (gelu_fast, activation.gelu_fast, activation.ops.gelu_fast), - (gelu_new, activation.gelu_new, activation.ops.gelu_new), - (gelu_quick, activation.gelu_quick, activation.ops.gelu_quick), - ], -) -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("d", D) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_activation( - activation_fns, - num_tokens: int, - d: int, - dtype: torch.dtype, - seed: int, - device: str, -) -> None: - torch.manual_seed(seed) - torch.set_default_device(device) - x = torch.randn(num_tokens, d, dtype=dtype) - torch_fn, fn, op = activation_fns - out = fn(torch.empty_like(x), x) - ref_out = torch_fn(x) - torch.testing.assert_close( - out, ref_out, atol=get_default_atol(out), rtol=get_default_rtol(out) - ) - - out = torch.empty_like(x) - opcheck(op, (out, x)) diff --git a/examples/activation/tests/kernels/utils.py b/examples/activation/tests/kernels/utils.py deleted file mode 100644 index d24c5bab..00000000 --- a/examples/activation/tests/kernels/utils.py +++ /dev/null @@ -1,73 +0,0 @@ -"""Kernel test utils""" - -import itertools -import random -import unittest -from numbers import Number -from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union - -import pytest -import torch -from torch._prims_common import TensorLikeType - -# For now, disable "test_aot_dispatch_dynamic" since there are some -# bugs related to this test in PyTorch 2.4. -DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = ( - "test_schema", - "test_autograd_registration", - "test_faketensor", -) - -ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = ( - "test_schema", - "test_autograd_registration", - "test_faketensor", - "test_aot_dispatch_dynamic", -) - - -# Copied/modified from torch._refs.__init__.py -def fp8_allclose( - a: TensorLikeType, - b: TensorLikeType, - rtol: float = 1e-05, - atol: float = 1e-08, - equal_nan: bool = False, -) -> bool: - """ - Reference implementation of torch.allclose - """ - torch._refs._check_close_args(name="torch.allclose", a=a, b=b, rtol=rtol, atol=atol) - - return bool( - torch.all( - torch.isclose( - a.double(), b.double(), rtol=rtol, atol=atol, equal_nan=equal_nan - ) - ).item() - ) - - -# A special version of op check that has a restricted default set of test_utils -# and a patched version of allclose that supports fp8 types. -def opcheck( - op: Union[ - torch._ops.OpOverload, - torch._ops.OpOverloadPacket, - torch._library.custom_ops.CustomOpDef, - ], - args: Tuple[Any, ...], - kwargs: Optional[Dict[str, Any]] = None, - *, - test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS, - raise_exception: bool = True, - cond: bool = True -) -> Dict[str, str]: - with unittest.mock.patch("torch.allclose", new=fp8_allclose): - return ( - torch.library.opcheck( - op, args, kwargs, test_utils=test_utils, raise_exception=raise_exception - ) - if cond - else {} - ) diff --git a/examples/activation/torch-ext/activation/__init__.py b/examples/activation/torch-ext/activation/__init__.py deleted file mode 100644 index 71e0b01a..00000000 --- a/examples/activation/torch-ext/activation/__init__.py +++ /dev/null @@ -1,47 +0,0 @@ -import torch - -try: - from ._ops import ops -except ImportError as e: - # Fallback for local development. - try: - import _activation - - ops = torch.ops._activition - except ImportError: - raise e - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - -def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_and_mul(out, x) - return out - - -def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_tanh_and_mul(out, x) - return out - - -def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: - ops.fatrelu_and_mul(out, x, threshold) - return out - - -def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_fast(out, x) - return out - - -def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_new(out, x) - return out - - -def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_quick(out, x) - return out diff --git a/examples/activation/torch-ext/torch_binding.cpp b/examples/activation/torch-ext/torch_binding.cpp deleted file mode 100644 index b6148ecc..00000000 --- a/examples/activation/torch-ext/torch_binding.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include - -#include "registration.h" -#include "torch_binding.h" - -TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { - // Activation ops - // Activation function used in SwiGLU. - ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); - ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul); - - // Activation function used in GeGLU with `none` approximation. - ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()"); - ops.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul); - - // Activation function used in GeGLU with `tanh` approximation. - ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()"); - ops.impl("gelu_tanh_and_mul", torch::kCUDA, &gelu_tanh_and_mul); - - // FATReLU implementation. - ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()"); - ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul); - - // GELU implementation used in GPT-2. - ops.def("gelu_new(Tensor! out, Tensor input) -> ()"); - ops.impl("gelu_new", torch::kCUDA, &gelu_new); - - // Approximate GELU implementation. - ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); - ops.impl("gelu_fast", torch::kCUDA, &gelu_fast); - - // Quick GELU implementation. - ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); - ops.impl("gelu_quick", torch::kCUDA, &gelu_quick); -} - -REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/examples/activation/torch-ext/torch_binding.h b/examples/activation/torch-ext/torch_binding.h deleted file mode 100644 index cb163cfc..00000000 --- a/examples/activation/torch-ext/torch_binding.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include - -void silu_and_mul(torch::Tensor &out, torch::Tensor &input); - -void gelu_and_mul(torch::Tensor &out, torch::Tensor &input); - -void gelu_tanh_and_mul(torch::Tensor &out, torch::Tensor &input); - -void fatrelu_and_mul(torch::Tensor &out, torch::Tensor &input, - double threshold); - -void gelu_new(torch::Tensor &out, torch::Tensor &input); - -void gelu_fast(torch::Tensor &out, torch::Tensor &input); - -void gelu_quick(torch::Tensor &out, torch::Tensor &input); diff --git a/examples/relu-backprop-compile/build.toml b/examples/relu-backprop-compile/build.toml index 53bb6306..c9bfab3c 100644 --- a/examples/relu-backprop-compile/build.toml +++ b/examples/relu-backprop-compile/build.toml @@ -8,12 +8,12 @@ src = [ "torch-ext/torch_binding.h", ] -[kernel.activation] +[kernel.relu] backend = "cuda" depends = ["torch"] src = ["relu_cuda/relu.cu"] -[kernel.activation_rocm] +[kernel.relu_rocm] backend = "rocm" rocm-archs = [ "gfx906", diff --git a/examples/relu-specific-torch/build.toml b/examples/relu-specific-torch/build.toml index 53bb6306..c9bfab3c 100644 --- a/examples/relu-specific-torch/build.toml +++ b/examples/relu-specific-torch/build.toml @@ -8,12 +8,12 @@ src = [ "torch-ext/torch_binding.h", ] -[kernel.activation] +[kernel.relu] backend = "cuda" depends = ["torch"] src = ["relu_cuda/relu.cu"] -[kernel.activation_rocm] +[kernel.relu_rocm] backend = "rocm" rocm-archs = [ "gfx906", diff --git a/examples/relu/build.toml b/examples/relu/build.toml index 2effffc6..cfe490ba 100644 --- a/examples/relu/build.toml +++ b/examples/relu/build.toml @@ -8,12 +8,12 @@ src = [ "torch-ext/torch_binding.h", ] -[kernel.activation] +[kernel.relu] backend = "cuda" depends = ["torch"] src = ["relu_cuda/relu.cu"] -[kernel.activation_metal] +[kernel.relu_metal] backend = "metal" src = [ "relu_metal/relu.mm", @@ -22,7 +22,7 @@ src = [ ] depends = [ "torch" ] -[kernel.activation_rocm] +[kernel.relu_rocm] backend = "rocm" rocm-archs = [ "gfx906", @@ -38,7 +38,7 @@ rocm-archs = [ depends = ["torch"] src = ["relu_cuda/relu.cu"] -[kernel.activation_xpu] +[kernel.relu_xpu] backend = "xpu" depends = ["torch"] src = ["relu_xpu/relu.cpp"] diff --git a/tests/Dockerfile.test-kernel b/tests/Dockerfile.test-kernel index ad85d9f7..3fafaf62 100644 --- a/tests/Dockerfile.test-kernel +++ b/tests/Dockerfile.test-kernel @@ -63,15 +63,15 @@ RUN CUDA_MAJOR_MINOR=$(echo ${CUDA_VERSION} | cut -d'.' -f1,2) && \ RUN uv add numpy pytest # Copy kernels and tests -COPY activation-kernel ./activation-kernel +COPY relu-kernel ./relu-kernel COPY cutlass-gemm-kernel ./cutlass-gemm-kernel COPY silu-and-mul-universal-kernel ./silu-and-mul-universal-kernel -COPY examples/activation/tests ./activation_tests +COPY examples/relu/tests ./relu_tests COPY examples/cutlass-gemm/tests ./tests/cutlass_gemm_tests # Run tests -ENV PYTHONPATH="activation-kernel:cutlass-gemm-kernel:silu-and-mul-universal-kernel:$PYTHONPATH" -CMD ["/bin/sh", "-c", ".venv/bin/pytest", "activation_tests", "cutlass_gemm_tests"] +ENV PYTHONPATH="relu-kernel:cutlass-gemm-kernel:silu-and-mul-universal-kernel:$PYTHONPATH" +CMD ["/bin/sh", "-c", ".venv/bin/pytest", "relu_tests", "cutlass_gemm_tests"] # We only care about importing, the kernel is trivial. CMD ["/bin/sh", "-c", ".venv/bin/python", "-c", "'import silu_and_mul_universal'"]