
Commit 8ff705b ("init")
0 parents
20 files changed: +494 -0 lines

.clang-format (+157 lines)

# copied from https://github.com/microsoft/DeepSpeed

---
# Refer to the following link for the explanation of each params:
# http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
# This is deprecated
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  IndentBraces: false
  # disabling the below splits, else, they'll just add to the vertical length of source files!
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: WebKit
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 119
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
  - Regex: '^<ext/.*\.h>'
    Priority: 2
  - Regex: '^<.*\.h>'
    Priority: 1
  - Regex: '^<.*'
    Priority: 2
  - Regex: '.*'
    Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 4
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
  - Language: TextProto
    Delimiters:
      - pb
      - PB
      - proto
      - PROTO
    EnclosingFunctions:
      - EqualsProto
      - EquivToProto
      - PARSE_PARTIAL_TEXT_PROTO
      - PARSE_TEST_PROTO
      - PARSE_TEXT_PROTO
      - ParseTextOrDie
      - ParseTextProtoOrDie
    CanonicalDelimiter: ''
    BasedOnStyle: google
# Enabling comment reflow causes doxygen comments to be messed up in their formats!
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
StatementMacros:
  - Q_UNUSED
  - QT_REQUIRE_VERSION
# Be consistent with indent-width, even for people who use tab for indentation!
TabWidth: 4
UseTab: Never

.gitignore (+8 lines)

__pycache__
.pytest_cache
.vscode
*.so
*.pyc
/appwrapper.yaml
*.egg-info/
build/

.pre-commit-config.yaml (+17 lines)

repos:
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        name: isort (python)
  - repo: https://github.com/psf/black
    rev: 23.10.0
    hooks:
      - id: black
        args: [--line-length=119,--target-version=py36]
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v17.0.3
    hooks:
      - id: clang-format
        types_or: [c++, c, cuda]
        args: [-style=file:.clang-format]

Makefile (+11 lines)

install:
	pip install .

install-dev:
	pip install -e .

test:
	pytest tests

style:
	pre-commit run --all-files

README.md (+5 lines)

# Efficient GPU kernels written in both CUDA and Triton

<p align="center">
  <img src="assets/logo.jpeg" width="300px" height="300px">
</p>

assets/logo.jpeg (binary file, 326 KB)

kernels/__init__.py (+12 lines)

from .utils import compile_helpers
from .vector_addition import (
    VectorAddition_CUDA,
    VectorAddition_PyTorch,
    VectorAddition_Triton,
    vector_addition_cuda,
    vector_addition_pytorch,
    vector_addition_triton,
)


compile_helpers()
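
Since importing the package both re-exports the three implementations and JIT-compiles the CUDA extension, end-to-end usage presumably looks like the sketch below. This is my example rather than a file from this commit, and it assumes a CUDA-capable machine plus the 1-D, contiguous, matching-dtype inputs that the C++ binding further down enforces.

import torch
from kernels import vector_addition_cuda, vector_addition_pytorch, vector_addition_triton

x = torch.randn(4096, device="cuda")
y = torch.randn(4096, device="cuda")

# all three backends compute the same elementwise sum
expected = vector_addition_pytorch(x, y)
torch.testing.assert_close(vector_addition_cuda(x, y), expected)
torch.testing.assert_close(vector_addition_triton(x, y), expected)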

kernels/utils.h (+9 lines)

#include <torch/extension.h>

// C++ interface
#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " is not on CUDA device")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " is not a contiguous tensor")

#define CHECK_INPUT(x) \
    CHECK_CUDA(x);     \
    CHECK_CONTIGUOUS(x);

kernels/utils.py (+16 lines)

import os

from torch.utils.cpp_extension import load as load_cpp_extension


def compile_helpers() -> None:
    load_cpp_extension(
        "vector_addition_cuda",
        sources=[
            os.path.join(os.path.dirname(__file__), "vector_addition/cuda_kernel/vector_addition.cpp"),
            os.path.join(os.path.dirname(__file__), "vector_addition/cuda_kernel/vector_addition.cu"),
        ],
        with_cuda=True,
        extra_cflags=["-O3", "-Wall", "-shared", "-fPIC", "-fdiagnostics-color"],
        verbose=True,
    )
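
torch.utils.cpp_extension.load builds the listed sources on the first call (and caches the result), exposing the extension under the name passed as its first argument; the CUDA autograd wrapper later in this commit relies on that by deferring import vector_addition_cuda until forward time. A minimal sketch of calling the helper directly, outside the package import:

# Sketch only; in this repo `import kernels` already runs compile_helpers().
from kernels.utils import compile_helpers

compile_helpers()            # JIT-compiles vector_addition.cpp / .cu, slow on the first run
import vector_addition_cuda  # the extension name passed to load_cpp_extension above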

kernels/vector_addition/__init__.py (+3 lines)

from .cuda_kernel import VectorAddition_CUDA, vector_addition_cuda
from .pytorch import VectorAddition_PyTorch, vector_addition_pytorch
from .triton_kernel import VectorAddition_Triton, vector_addition_triton
@@ -0,0 +1,23 @@
from typing import Tuple

import torch
import torch.nn as nn


class _VectorAddition_CUDA(torch.autograd.Function):
    def forward(ctx, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        import vector_addition_cuda

        return vector_addition_cuda.vector_addition_forward(x, y)

    def backward(ctx, output_grad: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        return output_grad, output_grad


def vector_addition_cuda(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    return _VectorAddition_CUDA.apply(x, y)


class VectorAddition_CUDA(nn.Module):
    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return vector_addition_cuda(x, y)
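
Addition's gradient is the identity for both inputs (d(x+y)/dx = d(x+y)/dy = 1), so backward simply forwards output_grad to both arguments. A quick sanity check, assuming the extension compiled:

import torch
from kernels import vector_addition_cuda

x = torch.randn(8, device="cuda", requires_grad=True)
y = torch.randn(8, device="cuda", requires_grad=True)
vector_addition_cuda(x, y).sum().backward()

# with a sum() loss the upstream gradient is all ones, and it passes through unchanged
assert torch.equal(x.grad, torch.ones_like(x))
assert torch.equal(y.grad, torch.ones_like(y))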
kernels/vector_addition/cuda_kernel/vector_addition.cpp (+24 lines)

#include <torch/extension.h>
#include "../../utils.h"

// CUDA kernel declarations
torch::Tensor vector_addition_forward_kernel_launcher(torch::Tensor x, torch::Tensor y, const int BLOCK_SIZE);

torch::Tensor vector_addition_forward(torch::Tensor x, torch::Tensor y)
{
    CHECK_INPUT(x);
    CHECK_INPUT(y);

    TORCH_CHECK(x.dim() == 1, "tensor should be 1 dimensional")
    TORCH_CHECK(y.dim() == 1, "tensor should be 1 dimensional")

    TORCH_CHECK(x.numel() == y.numel(), "both tensors should have same number of elements");
    TORCH_CHECK(x.type() == y.type(), "both tensors should have same dtype");

    return vector_addition_forward_kernel_launcher(x, y, 1024);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("vector_addition_forward", &vector_addition_forward, "Vector addition forward (CUDA)");
}
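
The binding validates its inputs with the CHECK_INPUT macro from kernels/utils.h plus the TORCH_CHECKs above before launching with a fixed block size of 1024, so malformed inputs fail fast on the Python side. A hypothetical failure case (a sketch; the error text is paraphrased from the macros, not copied output):

import torch
from kernels import vector_addition_cuda

try:
    vector_addition_cuda(torch.randn(16), torch.randn(16))  # CPU tensors
except RuntimeError as error:
    print(error)  # raised by CHECK_CUDA, e.g. "x is not on CUDA device"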
kernels/vector_addition/cuda_kernel/vector_addition.cu (+37 lines)

#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/extension.h>

template <typename scalar_t>
__global__ void vector_addition_forward_kernel(const scalar_t* x,
                                               const scalar_t* y,
                                               scalar_t* output,
                                               const int num_elements)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < num_elements) output[index] = x[index] + y[index];
}

torch::Tensor vector_addition_forward_kernel_launcher(torch::Tensor x, torch::Tensor y, const int BLOCK_SIZE)
{
    int num_elements = x.numel();
    torch::Tensor output = torch::empty_like(x);

    int blocks = (int)ceil((float)num_elements / BLOCK_SIZE);

    if (at::isReducedFloatingType(x.scalar_type())) {
        AT_DISPATCH_REDUCED_FLOATING_TYPES(
            x.scalar_type(), "vector_addition_forward_kernel", ([&] {
                vector_addition_forward_kernel<scalar_t><<<blocks, BLOCK_SIZE>>>(
                    x.data<scalar_t>(), y.data<scalar_t>(), output.data<scalar_t>(), num_elements);
            }));
    } else {
        AT_DISPATCH_FLOATING_TYPES(
            x.scalar_type(), "vector_addition_forward_kernel", ([&] {
                vector_addition_forward_kernel<scalar_t><<<blocks, BLOCK_SIZE>>>(
                    x.data<scalar_t>(), y.data<scalar_t>(), output.data<scalar_t>(), num_elements);
            }));
    }

    return output;
}
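
The launcher rounds the element count up to a whole number of blocks and relies on the index < num_elements guard in the kernel to mask the surplus threads of the final block. The launch arithmetic, worked through in plain Python for a size that is not a multiple of the block size:

import math

num_elements, BLOCK_SIZE = 3000, 1024
blocks = math.ceil(num_elements / BLOCK_SIZE)  # 3, same as the (int)ceil(...) in the launcher
total_threads = blocks * BLOCK_SIZE            # 3072 threads are launched
idle_threads = total_threads - num_elements    # 72 threads fail the bounds check and write nothing
print(blocks, total_threads, idle_threads)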

kernels/vector_addition/pytorch.py (+11 lines)

import torch
import torch.nn as nn


def vector_addition_pytorch(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    return x + y


class VectorAddition_PyTorch(nn.Module):
    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return vector_addition_pytorch(x, y)
@@ -0,0 +1,48 @@
from typing import Tuple

import torch
import torch.nn as nn
import triton
import triton.language as tl


@triton.jit
def _vector_addition_forward(x_ptr, y_ptr, output_ptr, num_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)

    block_start = pid * BLOCK_SIZE
    block_indices = block_start + tl.arange(0, BLOCK_SIZE)

    mask = block_indices < num_elements

    x = tl.load(x_ptr + block_indices, mask=mask)
    y = tl.load(y_ptr + block_indices, mask=mask)

    output = x + y

    tl.store(output_ptr + block_indices, output, mask=mask)


class _VectorAddition_Triton(torch.autograd.Function):
    def forward(ctx, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        assert x.dim() == 1
        output = torch.empty_like(x)

        num_elements = x.numel()
        grid = lambda meta: (triton.cdiv(num_elements, meta["BLOCK_SIZE"]),)

        _vector_addition_forward[grid](x, y, output, num_elements, BLOCK_SIZE=1024)

        return output

    def backward(ctx, output_grad: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        return output_grad, output_grad


def vector_addition_triton(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    return _VectorAddition_Triton.apply(x, y)


class VectorAddition_Triton(nn.Module):
    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return vector_addition_triton(x, y)
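
The Triton version mirrors the CUDA launch: the grid callable is evaluated with the kernel's meta-parameters, so triton.cdiv(num_elements, meta["BLOCK_SIZE"]) is the same ceil-division as in the CUDA launcher, and the mask handed to tl.load/tl.store plays the role of the index < num_elements guard. A small sketch of what the grid evaluates to for the hard-coded BLOCK_SIZE=1024:

import triton

num_elements = 3000
grid = lambda meta: (triton.cdiv(num_elements, meta["BLOCK_SIZE"]),)
print(grid({"BLOCK_SIZE": 1024}))  # (3,): three program instances, the last one partially masked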
