diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..572546cf7 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -17,5 +17,14 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/exp.h" +#include "infiniop/ops/sin.h" +#include "infiniop/ops/cos.h" +#include "infiniop/ops/tanh.h" +#include "infiniop/ops/sigmoid_backward.h" +#include "infiniop/ops/hardswish.h" +#include "infiniop/ops/where.h" +#include "infiniop/ops/leaky_relu.h" +#include "infiniop/ops/cast.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/cast.h b/include/infiniop/ops/cast.h new file mode 100644 index 000000000..a3a84c00b --- /dev/null +++ b/include/infiniop/ops/cast.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_CAST_API_H__ +#define __INFINIOP_CAST_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCastDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCastDescriptor(infiniopHandle_t handle, + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCast(infiniopCastDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h new file mode 100644 index 000000000..098c0d7e1 --- /dev/null +++ b/include/infiniop/ops/cos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COS_API_H__ +#define __INFINIOP_COS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h new file mode 100644 index 000000000..1b7defcc5 --- /dev/null +++ b/include/infiniop/ops/exp.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_EXP_API_H__ +#define __INFINIOP_EXP_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopExpDescriptor_t; + +__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopExp(infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 
index 000000000..8b54b207b --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardSwishDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHardSwishDescriptor(infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHardSwish(infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/leaky_relu.h b/include/infiniop/ops/leaky_relu.h new file mode 100644 index 000000000..143ff28b6 --- /dev/null +++ b/include/infiniop/ops/leaky_relu.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_LEAKY_RELU_API_H__ +#define __INFINIOP_LEAKY_RELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLeakyReLUDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLeakyReLUDescriptor(infiniopHandle_t handle, + infiniopLeakyReLUDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float negative_slope); + +__C __export infiniStatus_t infiniopGetLeakyReLUWorkspaceSize(infiniopLeakyReLUDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLeakyReLU(infiniopLeakyReLUDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLeakyReLUDescriptor(infiniopLeakyReLUDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/sigmoid_backward.h b/include/infiniop/ops/sigmoid_backward.h new file mode 100644 index 000000000..82b55e9dd --- /dev/null +++ b/include/infiniop/ops/sigmoid_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_SIGMOID_BACKWARD_API_H__ +#define __INFINIOP_SIGMOID_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSigmoidBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSigmoidBackwardDescriptor(infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t grad_output); + +__C __export infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSigmoidBackward(infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream); + +__C __export infiniStatus_t infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/sin.h b/include/infiniop/ops/sin.h new file mode 100644 index 000000000..dba8683e5 --- /dev/null +++ b/include/infiniop/ops/sin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIN_API_H__ +#define __INFINIOP_SIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinDescriptor_t; + +__C __export 
infiniStatus_t infiniopCreateSinDescriptor(infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSin(infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h new file mode 100644 index 000000000..b6531e391 --- /dev/null +++ b/include/infiniop/ops/tanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANH_API_H__ +#define __INFINIOP_TANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h new file mode 100644 index 000000000..d38c753df --- /dev/null +++ b/include/infiniop/ops/where.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_WHERE_API_H__ +#define __INFINIOP_WHERE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t condition, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *condition, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..69b26ed8b 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -24,6 +24,15 @@ def run_tests(args): "rope.py", "sub.py", "swiglu.py", + "exp.py", + "sin.py", + "cos.py", + "tanh.py", + "sigmoid_backward.py", + "hardswish.py", + "where.py", + "leaky_relu.py", + "cast.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..aa2ef9389 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,15 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(exp) +DECLARE_INFINIOP_TEST(sin) +DECLARE_INFINIOP_TEST(cos) +DECLARE_INFINIOP_TEST(tanh) 
+DECLARE_INFINIOP_TEST(sigmoid_backward) +DECLARE_INFINIOP_TEST(hardswish) +DECLARE_INFINIOP_TEST(where) +DECLARE_INFINIOP_TEST(leakyrelu) +DECLARE_INFINIOP_TEST(cast) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -30,19 +39,28 @@ DECLARE_INFINIOP_TEST(sub) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(exp) \ + REGISTER_INFINIOP_TEST(sin) \ + REGISTER_INFINIOP_TEST(cos) \ + REGISTER_INFINIOP_TEST(tanh) \ + REGISTER_INFINIOP_TEST(sigmoid_backward) \ + REGISTER_INFINIOP_TEST(hardswish) \ + REGISTER_INFINIOP_TEST(where) \ + REGISTER_INFINIOP_TEST(leakyrelu) \ + REGISTER_INFINIOP_TEST(cast) \ } namespace infiniop_test { @@ -64,4 +82,4 @@ bool check_names( } // namespace infiniop_test -#endif +#endif \ No newline at end of file diff --git a/src/infiniop-test/include/test.hpp b/src/infiniop-test/include/test.hpp index e2dd45f9f..277061029 100644 --- a/src/infiniop-test/include/test.hpp +++ b/src/infiniop-test/include/test.hpp @@ -47,7 +47,7 @@ std::vector> runAllTests( const GGUFFileReader &, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol); + double rtol, double atol, bool equal_nan = false); // Run a single test read from a GGUF file std::shared_ptr runTest( @@ -55,10 +55,11 @@ std::shared_ptr runTest( infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, double rtol, double atol, - size_t test_id); + size_t test_id, + bool equal_nan = false); // Check if two tensors are close within given tolerance -void allClose(std::shared_ptr actual, std::shared_ptr expected, double rtol = 1e-3, double atol = 1e-3); +void allClose(std::shared_ptr actual, std::shared_ptr expected, double rtol = 1e-3, double atol = 1e-3, bool equal_nan = false); // Check if two tensors are equal void allEqual(std::shared_ptr actual, std::shared_ptr expected); @@ -85,13 +86,14 @@ class Test { namespace infiniop_test::name { \ class Test : public infiniop_test::base::Test { \ double _rtol, _atol; \ + bool _equal_nan; \ \ public: \ static std::string op_name() { return #name; } \ static std::shared_ptr build( \ std::unordered_map> attributes, \ std::unordered_map> tensors, \ - double, double); \ + double, double, bool); \ \ static std::vector attribute_names(); \ static std::vector tensor_names(); \ @@ -109,7 +111,8 @@ class Test { struct Attributes; \ Attributes *_attributes; \ Test() = delete; \ - Test(double rtol, double atol) : _rtol(rtol), _atol(atol) {} \ + Test(double rtol, double atol, bool equal_nan = false) \ + : _rtol(rtol), _atol(atol), _equal_nan(equal_nan) {} \ }; \ } @@ -117,7 +120,7 @@ namespace infiniop_test { using BuilderFunc = std::function( std::unordered_map>, 
std::unordered_map>, - double, double)>; + double, double, bool)>; // Testcase Registry // Each testcase should provide a formatted builder, attribute names, and tensor names diff --git a/src/infiniop-test/src/gguf.cpp b/src/infiniop-test/src/gguf.cpp index a4b200033..aee5b39a8 100644 --- a/src/infiniop-test/src/gguf.cpp +++ b/src/infiniop-test/src/gguf.cpp @@ -53,7 +53,9 @@ GGUFFileReader::GGUFFileReader(const std::string &filepath) { try { _file = std::make_shared(filepath); } catch (const std::exception &e) { - throw e; + // Log, then rethrow: _file is still null here, and the lines below would dereference it. + std::cerr << "Error: " << e.what() << std::endl; + throw; } _data = _file->ptr(); _cursor = reinterpret_cast(_data); diff --git a/src/infiniop-test/src/main.cpp b/src/infiniop-test/src/main.cpp index 4863c8172..6805bd7f8 100644 --- a/src/infiniop-test/src/main.cpp +++ b/src/infiniop-test/src/main.cpp @@ -1,8 +1,8 @@ #include "gguf.hpp" #include "test.hpp" +#include <cstring> // for strcmp #include <iostream> #include <string> - struct ParsedArgs { std::string file_path; // Mandatory argument: test.gguf file path infiniDevice_t device_type = INFINI_DEVICE_CPU; // Default to CPU @@ -11,12 +11,13 @@ struct ParsedArgs { int iterations = 0; // Default to 0 if not given double atol = 0.001; // Default absolute tolerance double rtol = 0.001; // Default relative tolerance + bool equal_nan = false; // Default: NaNs are not considered equal }; void printUsage() { std::cout << "Usage:" << std::endl << std::endl; - std::cout << "infiniop-test <test_file> [--<device>[:id]] [--warmup <count>] [--run <count>] [--atol <atol>] [--rtol <rtol>]" << std::endl + std::cout << "infiniop-test <test_file> [--<device>[:id]] [--warmup <count>] [--run <count>] [--atol <atol>] [--rtol <rtol>] [--equal-nan <bool>]" << std::endl << std::endl; std::cout << " <test_file>" << std::endl; std::cout << " Path to the test gguf file" << std::endl @@ -36,6 +37,9 @@ void printUsage() { std::cout << " --rtol <rtol>" << std::endl; std::cout << " (Optional) Relative tolerance for correctness check. Default to 0.001" << std::endl << std::endl; + std::cout << " --equal-nan <bool>" << std::endl; + std::cout << " (Optional) If true, two NaNs are considered equal. Default to false" << std::endl + << std::endl; exit(-1); } @@ -91,6 +95,10 @@ ParsedArgs parseArgs(int argc, char *argv[]) { else if (arg == "--rtol" && i + 1 < argc) { args.rtol = std::stod(argv[++i]); } + else if (arg == "--equal-nan" && i + 1 < argc) { + args.equal_nan = (strcmp(argv[++i], "True") == 0 + || strcmp(argv[i], "true") == 0); + } else { printUsage(); } @@ -119,7 +128,7 @@ int main(int argc, char *argv[]) { reader, (infiniDevice_t)args.device_type, args.device_id, args.warmups, args.iterations, - args.rtol, args.atol); + args.rtol, args.atol, args.equal_nan); std::cout << "=====================================" << std::endl; for (auto result : results) {
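Usage note on the new flag: `--equal-nan` only changes how `allClose` treats NaNs; `allEqual` is unaffected. A typical invocation might look like the following (the `.gguf` file name and the `--cpu` device-flag spelling are illustrative, not taken from this diff):

```
infiniop-test exp.gguf --cpu --warmup 5 --run 100 --equal-nan true
```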
diff --git a/src/infiniop-test/src/ops/add.cpp b/src/infiniop-test/src/ops/add.cpp index 27f69d687..e90290d55 100644 --- a/src/infiniop-test/src/ops/add.cpp +++ b/src/infiniop-test/src/ops/add.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr<base::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/cast.cpp b/src/infiniop-test/src/ops/cast.cpp new file mode 100644 index 000000000..258f74654 --- /dev/null +++ b/src/infiniop-test/src/ops/cast.cpp @@ -0,0 +1,111 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::cast { + +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> output; + std::shared_ptr<Tensor> ans; +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr<base::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopCastDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + if (input->data() == output->data()) { + return TEST_FAILED(OP_CREATION_FAILED, + "Cast does not support inplace: input and output alias."); + } + + CHECK_OR(infiniopCreateCastDescriptor(handle, &op_desc, + /*dst*/ output->desc(), + /*src*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cast descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetCastWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + 
CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopCast(op_desc, workspace, workspace_size, + /*dst*/ output->data(), + /*src*/ input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyCastDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopCast(op_desc, workspace, workspace_size, + /*dst*/ output->data(), + /*src*/ input->data(), + /*stream*/ nullptr); + }, + warm_ups, iterations); + + infiniopDestroyCastDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { return {}; } + +std::vector Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + oss << "- inplace: false" << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::cast diff --git a/src/infiniop-test/src/ops/causal_softmax.cpp b/src/infiniop-test/src/ops/causal_softmax.cpp index 29612960a..97c65ef8c 100644 --- a/src/infiniop-test/src/ops/causal_softmax.cpp +++ b/src/infiniop-test/src/ops/causal_softmax.cpp @@ -14,8 +14,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("y") == tensors.end() @@ -53,7 +53,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -92,7 +92,7 @@ std::string Test::toString() const { oss << "- y: " << _attributes->y->info() << std::endl; oss << "- ans: " << _attributes->ans->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/clip.cpp b/src/infiniop-test/src/ops/clip.cpp index 82a0e9b10..a01c18a4d 100644 --- a/src/infiniop-test/src/ops/clip.cpp +++ b/src/infiniop-test/src/ops/clip.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new 
Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("min_val") == tensors.end() @@ -64,7 +64,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -109,7 +109,7 @@ std::string Test::toString() const { oss << "- max_val: " << _attributes->max_val->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/cos.cpp b/src/infiniop-test/src/ops/cos.cpp new file mode 100644 index 000000000..d1d99a0a3 --- /dev/null +++ b/src/infiniop-test/src/ops/cos.cpp @@ -0,0 +1,105 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cos { + +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopCosDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateCosDescriptor(handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cos descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetCosWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopCos(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyCosDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopCos(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyCosDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { return {}; } + +std::vector Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string 
Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::cos diff --git a/src/infiniop-test/src/ops/exp.cpp b/src/infiniop-test/src/ops/exp.cpp new file mode 100644 index 000000000..23dde66d0 --- /dev/null +++ b/src/infiniop-test/src/ops/exp.cpp @@ -0,0 +1,103 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::exp { + +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; // out + std::shared_ptr ans; // reference +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopExpDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateExpDescriptor(handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create exp descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetExpWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopExp(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyExpDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopExp(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyExpDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { return {}; } + +std::vector Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << 
_equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::exp diff --git a/src/infiniop-test/src/ops/gemm.cpp b/src/infiniop-test/src/ops/gemm.cpp index 37c8ed6fe..664288d73 100644 --- a/src/infiniop-test/src/ops/gemm.cpp +++ b/src/infiniop-test/src/ops/gemm.cpp @@ -18,8 +18,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); @@ -65,7 +65,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -120,7 +120,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/hardswish.cpp b/src/infiniop-test/src/ops/hardswish.cpp new file mode 100644 index 000000000..e47943bdb --- /dev/null +++ b/src/infiniop-test/src/ops/hardswish.cpp @@ -0,0 +1,105 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::hardswish { + +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopHardSwishDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateHardSwishDescriptor(handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create HardSwish descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetHardSwishWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopHardSwish(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr), + return 
TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyHardSwishDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopHardSwish(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyHardSwishDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector<std::string> Test::attribute_names() { return {}; } + +std::vector<std::string> Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector<std::string> Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::hardswish diff --git a/src/infiniop-test/src/ops/leakyrelu.cpp b/src/infiniop-test/src/ops/leakyrelu.cpp new file mode 100644 index 000000000..d26924ca2 --- /dev/null +++ b/src/infiniop-test/src/ops/leakyrelu.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <cstring> +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::leakyrelu { + +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> output; + std::shared_ptr<Tensor> ans; + float negative_slope = 0.01f; // default matches PyTorch +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end() + || attributes.find("negative_slope") == attributes.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->negative_slope = *reinterpret_cast<const float *>(attributes["negative_slope"].data()); + return test; +} + +std::shared_ptr<base::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopLeakyReLUDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateLeakyReLUDescriptor( + handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc(), + /*negative_slope*/ _attributes->negative_slope), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create LeakyReLU descriptor.")); + + size_t workspace_size = 0; + CHECK_OR(infiniopGetLeakyReLUWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace = nullptr; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopLeakyReLU(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + 
/*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyLeakyReLUDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopLeakyReLU(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr); + }, + warm_ups, iterations); + + infiniopDestroyLeakyReLUDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { return {"negative_slope"}; } + +std::vector Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- negative_slope: " << _attributes->negative_slope << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + oss << "- inplace: true" << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::leakyrelu diff --git a/src/infiniop-test/src/ops/mul.cpp b/src/infiniop-test/src/ops/mul.cpp index 8ebfc426b..cb0b639bf 100644 --- a/src/infiniop-test/src/ops/mul.cpp +++ b/src/infiniop-test/src/ops/mul.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/random_sample.cpp b/src/infiniop-test/src/ops/random_sample.cpp index a11e0f446..75ee07b44 100644 --- a/src/infiniop-test/src/ops/random_sample.cpp +++ b/src/infiniop-test/src/ops/random_sample.cpp @@ -20,8 +20,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); @@ 
-70,7 +70,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(result, _attributes->ans, _rtol, _atol); + allClose(result, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- data: " << _attributes->data->info() << std::endl; oss << "- result: " << _attributes->result->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rearrange.cpp b/src/infiniop-test/src/ops/rearrange.cpp index 9fbf6f2cb..bdf162ce2 100644 --- a/src/infiniop-test/src/ops/rearrange.cpp +++ b/src/infiniop-test/src/ops/rearrange.cpp @@ -12,9 +12,9 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { + double rtol, double atol, bool equal_nan) { - auto test = std::shared_ptr(new Test(rtol, atol)); + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); diff --git a/src/infiniop-test/src/ops/rms_norm.cpp b/src/infiniop-test/src/ops/rms_norm.cpp index 8359a4536..786ce8470 100644 --- a/src/infiniop-test/src/ops/rms_norm.cpp +++ b/src/infiniop-test/src/ops/rms_norm.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (attributes.find("epsilon") == attributes.end() @@ -72,7 +72,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "RMSNorm execution failed")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- w: " << _attributes->w->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rope.cpp b/src/infiniop-test/src/ops/rope.cpp index 636f565af..94010a122 100644 --- a/src/infiniop-test/src/ops/rope.cpp +++ b/src/infiniop-test/src/ops/rope.cpp @@ -17,8 +17,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("y") == tensors.end() @@ -77,7 +77,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, 
_attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -121,7 +121,7 @@ std::string Test::toString() const { oss << "- sin_table: " << _attributes->sin_table->info() << std::endl; oss << "- cos_table: " << _attributes->cos_table->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); }
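The next file adds the `sigmoid_backward` testcase. For orientation, a minimal sketch of the quantity the `ans` tensor is expected to hold is below; it assumes `input` is the pre-activation x, so the sigmoid is recomputed (if the operator were instead handed the saved sigmoid output y, the formula reduces to dy * y * (1 - y)):

```cpp
#include <cmath>
#include <cstddef>

// Hedged reference: dx = dy * s * (1 - s), with s = sigmoid(x).
// Assumes "input" holds the pre-activation x; this diff does not pin that down.
void sigmoid_backward_ref(float *grad_input, const float *input,
                          const float *grad_output, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        float s = 1.0f / (1.0f + std::exp(-input[i]));
        grad_input[i] = grad_output[i] * s * (1.0f - s);
    }
}
```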
diff --git a/src/infiniop-test/src/ops/sigmoid_backward.cpp b/src/infiniop-test/src/ops/sigmoid_backward.cpp new file mode 100644 index 000000000..0248d6a47 --- /dev/null +++ b/src/infiniop-test/src/ops/sigmoid_backward.cpp @@ -0,0 +1,115 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::sigmoid_backward { + +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> grad_output; + std::shared_ptr<Tensor> grad_input; // output + std::shared_ptr<Tensor> ans; // reference +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr<base::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopSigmoidBackwardDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + + CHECK_OR(infiniopCreateSigmoidBackwardDescriptor(handle, &op_desc, + /*dst*/ grad_input->desc(), + /*input*/ input->desc(), + /*dy*/ grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create sigmoid_backward descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetSigmoidBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopSigmoidBackward(op_desc, workspace, workspace_size, + /*dst*/ grad_input->data(), + /*input*/ input->data(), + /*dy*/ grad_output->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + // Floating-point comparison; consider loosening rtol/atol for mixed precision. + allClose(grad_input, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroySigmoidBackwardDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopSigmoidBackward(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroySigmoidBackwardDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector<std::string> Test::attribute_names() { return {}; } + +std::vector<std::string> Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector<std::string> Test::output_names() { return {"grad_input"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::sigmoid_backward diff --git a/src/infiniop-test/src/ops/sin.cpp b/src/infiniop-test/src/ops/sin.cpp new file mode 100644 index 000000000..daa10d1f8 --- /dev/null +++ b/src/infiniop-test/src/ops/sin.cpp @@ -0,0 +1,105 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::sin { + +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> output; + std::shared_ptr<Tensor> ans; +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr<base::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopSinDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateSinDescriptor(handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create sin descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetSinWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopSin(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroySinDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopSin(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroySinDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector<std::string> Test::attribute_names() { return {}; } + +std::vector<std::string> Test::tensor_names() { 
return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::sin diff --git a/src/infiniop-test/src/ops/sub.cpp b/src/infiniop-test/src/ops/sub.cpp index 6bb1fd1eb..bb3adc350 100644 --- a/src/infiniop-test/src/ops/sub.cpp +++ b/src/infiniop-test/src/ops/sub.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/swiglu.cpp b/src/infiniop-test/src/ops/swiglu.cpp index 96b75efc5..f86dfadc6 100644 --- a/src/infiniop-test/src/ops/swiglu.cpp +++ b/src/infiniop-test/src/ops/swiglu.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() @@ -54,7 +54,7 @@ std::shared_ptr Test::run( CHECK_OR(infiniopSwiGLU(op_desc, workspace, workspace_size, c->data(), a->data(), b->data(), nullptr), return TEST_FAILED(OP_CREATION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -93,7 +93,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/tanh.cpp b/src/infiniop-test/src/ops/tanh.cpp new file mode 100644 index 000000000..4ccc2aa7b --- /dev/null +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -0,0 +1,105 @@ 
+#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::tanh { + +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; // out + std::shared_ptr ans; // reference +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopTanhDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateTanhDescriptor(handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create tanh descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetTanhWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopTanh(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyTanhDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopTanh(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyTanhDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { return {}; } + +std::vector Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::tanh diff --git a/src/infiniop-test/src/ops/where.cpp b/src/infiniop-test/src/ops/where.cpp new file mode 100644 index 000000000..c9bf8379f --- /dev/null +++ b/src/infiniop-test/src/ops/where.cpp @@ -0,0 +1,130 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::where { + +struct Test::Attributes { + std::shared_ptr cond; + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr out; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + 
diff --git a/src/infiniop-test/src/ops/where.cpp b/src/infiniop-test/src/ops/where.cpp
new file mode 100644
index 000000000..c9bf8379f
--- /dev/null
+++ b/src/infiniop-test/src/ops/where.cpp
@@ -0,0 +1,130 @@
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <sstream>
+
+namespace infiniop_test::where {
+
+struct Test::Attributes {
+    std::shared_ptr<Tensor> cond;
+    std::shared_ptr<Tensor> a;
+    std::shared_ptr<Tensor> b;
+    std::shared_ptr<Tensor> out;
+    std::shared_ptr<Tensor> ans;
+};
+
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol, bool equal_nan) {
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan));
+    test->_attributes = new Attributes();
+
+    if (tensors.find("condition") == tensors.end()
+        || tensors.find("a") == tensors.end()
+        || tensors.find("b") == tensors.end()
+        || tensors.find("c") == tensors.end()
+        || tensors.find("ans") == tensors.end()) {
+        throw std::runtime_error("Invalid Test: missing condition, a, b, c, or ans tensor");
+    }
+
+    test->_attributes->cond = tensors["condition"];
+    test->_attributes->a = tensors["a"];
+    test->_attributes->b = tensors["b"];
+    test->_attributes->out = tensors["c"];
+    test->_attributes->ans = tensors["ans"];
+
+    return test;
+}
+
+std::shared_ptr<Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id,
+    size_t warm_ups, size_t iterations) {
+
+    infiniopWhereDescriptor_t op_desc;
+
+    auto cond = _attributes->cond->to(device, device_id);
+    auto a = _attributes->a->to(device, device_id);
+    auto b = _attributes->b->to(device, device_id);
+    auto out = _attributes->out->to(device, device_id);
+
+    CHECK_OR(infiniopCreateWhereDescriptor(handle, &op_desc,
+                                           out->desc(),
+                                           cond->desc(),
+                                           a->desc(),
+                                           b->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create where descriptor."));
+
+    size_t workspace_size;
+    CHECK_OR(infiniopGetWhereWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+
+    CHECK_OR(infiniopWhere(op_desc, workspace, workspace_size,
+                           out->data(),
+                           cond->data(),
+                           a->data(),
+                           b->data(),
+                           nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    try {
+        // The where output normally shares the dtype of a/b; for integer or
+        // boolean outputs, rtol = 0 and atol = 0 are recommended.
+        allClose(out, _attributes->ans, _rtol, _atol, _equal_nan);
+    } catch (const std::exception &e) {
+        infiniopDestroyWhereDescriptor(op_desc);
+        infinirtFree(workspace);
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    double elapsed_time = benchmark(
+        [=]() {
+            infiniopWhere(op_desc, workspace, workspace_size,
+                          out->data(),
+                          cond->data(),
+                          a->data(),
+                          b->data(),
+                          nullptr);
+        },
+        warm_ups, iterations);
+
+    infiniopDestroyWhereDescriptor(op_desc);
+    infinirtFree(workspace);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"condition", "a", "b", "c", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"c"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- condition: " << _attributes->cond->info() << std::endl;
+    oss << "- a: " << _attributes->a->info() << std::endl;
+    oss << "- b: " << _attributes->b->info() << std::endl;
+    oss << "- out: " << _attributes->out->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::where
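
(Editor's sketch, not part of the patch.) The reference semantics this test exercises, written out in plain C++: each output element selects from a or b according to condition.

#include <array>
#include <cstddef>
#include <cstdio>

int main() {
    std::array<bool, 4> cond{true, false, true, false};
    std::array<float, 4> a{1, 2, 3, 4}, b{10, 20, 30, 40}, c{};
    for (std::size_t i = 0; i < c.size(); ++i) {
        c[i] = cond[i] ? a[i] : b[i]; // c = {1, 20, 3, 40}
    }
    for (float v : c) {
        std::printf("%g ", v);
    }
    return 0;
}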
diff --git a/src/infiniop-test/src/test.cpp b/src/infiniop-test/src/test.cpp
index e312ac5f5..0cbfe067a 100644
--- a/src/infiniop-test/src/test.cpp
+++ b/src/infiniop-test/src/test.cpp
@@ -49,7 +49,7 @@ std::string Result::toString() const {
 std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_reader,
                                                  infiniDevice_t device, int device_id,
                                                  size_t warm_ups, size_t iterations,
-                                                 double rtol, double atol) {
+                                                 double rtol, double atol, bool equal_nan) {
     auto meta = gguf_reader.getAttributeMap();
     auto count_meta = meta.find("test_count");
     if (count_meta == meta.end()) {
@@ -60,7 +60,7 @@ std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_read
     auto results = std::vector<std::shared_ptr<Result>>(count);
     try {
         for (size_t i = 0; i < count; i++) {
-            results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i);
+            results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i, equal_nan);
         }
     } catch (const std::exception &e) {
         std::cerr << "Error: " << e.what() << std::endl;
@@ -72,7 +72,7 @@ std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_read
 std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader,
                                 infiniDevice_t device, int device_id,
                                 size_t warm_ups, size_t iterations,
-                                double rtol, double atol, size_t test_id) {
+                                double rtol, double atol, size_t test_id, bool equal_nan) {
     auto meta = gguf_reader.getAttributeMap();
     auto tensor_info = gguf_reader.getTensorInfoMap();
     auto name_meta = meta.find("test." + std::to_string(test_id) + ".op_name");
@@ -107,7 +107,7 @@ std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader,
     }
     std::shared_ptr<Test> test;
     try {
-        test = builder.build(attrs, tensors, rtol, atol);
+        test = builder.build(attrs, tensors, rtol, atol, equal_nan);
     } catch (const std::exception &e) {
         return TEST_INIT_FAILED(op_name + "\n" + e.what());
     }
@@ -141,7 +141,7 @@ void incrementOffset(ptrdiff_t &offset_1, const std::vector<ptrdiff_t> &strides_
     }
 }
 
-void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_, double rtol, double atol) {
+void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_, double rtol, double atol, bool equal_nan) {
     auto actual = actual_->to(INFINI_DEVICE_CPU);
     auto expected = expected_->to(INFINI_DEVICE_CPU);
     auto shape = actual->shape();
@@ -158,12 +158,22 @@
     for (size_t i = 0; i < total; i++) {
         double a_ = getVal((char *)actual->data() + actual_offset, actual->ggml_type());
         double e_ = getVal((char *)expected->data() + expected_offset, expected->ggml_type());
-        if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) {
-            if (num_failed == 0) {
-                first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
-            }
-            num_failed++;
-        }
+        if (std::isnan(a_) || std::isnan(e_)) {
+            // With equal_nan, NaN matches NaN, so only a NaN paired with a
+            // non-NaN fails; without it, any NaN fails. The failure count is
+            // checked before it is incremented so that the first failing
+            // index is actually recorded.
+            if (!equal_nan || (std::isnan(a_) != std::isnan(e_))) {
+                if (num_failed == 0) {
+                    first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
+                }
+                num_failed++;
+            }
+        } else if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) {
+            if (num_failed == 0) {
+                first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
+            }
+            num_failed++;
+        }
         incrementOffset(actual_offset, actual->strides(), ggmlTypeSize(actual->ggml_type()), expected_offset, expected->strides(), ggmlTypeSize(expected->ggml_type()), counter, shape);
diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.cc b/src/infiniop/ops/cast/cpu/cast_cpu.cc
new file mode 100644
index 000000000..3c5336161
--- /dev/null
+++ b/src/infiniop/ops/cast/cpu/cast_cpu.cc
@@ -0,0 +1,86 @@
+#include "cast_cpu.h"
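// (Editor's note, not part of the patch.) The equal_nan flag threaded through
// allClose() above mirrors numpy.allclose: with equal_nan == true, a NaN in
// `actual` only fails when the matching `expected` element is not NaN; with
// equal_nan == false, any NaN on either side is a mismatch. For example,
// actual = {1.0, NaN} vs expected = {1.0, NaN} passes with equal_nan = true
// and fails at index 1 with equal_nan = false.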
+
+namespace op::cast::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto input_dtype = input_desc_vec.at(0)->dtype();
+    auto output_dtype = out_desc->dtype();
+
+    CHECK_SAME_SHAPE(out_desc->shape(), input_desc_vec.at(0)->shape());
+    auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
+    CHECK_RESULT(info_result);
+    // create CPU elementwise descriptor
+    *desc_ptr = new Descriptor(
+        input_dtype,
+        output_dtype,
+        info_result.take(),
+        nullptr,
+        0,
+        handle->device,
+        handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+#define SWITCH_IN_TYPE(OUT_TYPE, IN_TYPE)                                                           \
+    switch (IN_TYPE) {                                                                              \
+    case INFINI_DTYPE_I32:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, int32_t>(_info, output, inputs, stream);   \
+    case INFINI_DTYPE_I64:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, int64_t>(_info, output, inputs, stream);   \
+    case INFINI_DTYPE_U32:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, uint32_t>(_info, output, inputs, stream);  \
+    case INFINI_DTYPE_U64:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, uint64_t>(_info, output, inputs, stream);  \
+    case INFINI_DTYPE_F16:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, fp16_t>(_info, output, inputs, stream);    \
+    case INFINI_DTYPE_F32:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, float>(_info, output, inputs, stream);     \
+    case INFINI_DTYPE_F64:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, double>(_info, output, inputs, stream);    \
+    case INFINI_DTYPE_BF16:                                                                         \
+        return _device_info->calculate<CastOp, OUT_TYPE, bf16_t>(_info, output, inputs, stream);    \
+    default:                                                                                        \
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;                                                      \
+    }
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // Handle type conversions based on input and output types
+    switch (_output_dtype) {
+    case INFINI_DTYPE_I32:
+        SWITCH_IN_TYPE(int32_t, _input_dtype)
+    case INFINI_DTYPE_I64:
+        SWITCH_IN_TYPE(int64_t, _input_dtype)
+    case INFINI_DTYPE_U32:
+        SWITCH_IN_TYPE(uint32_t, _input_dtype)
+    case INFINI_DTYPE_U64:
+        SWITCH_IN_TYPE(uint64_t, _input_dtype)
+    case INFINI_DTYPE_F16:
+        SWITCH_IN_TYPE(fp16_t, _input_dtype)
+    case INFINI_DTYPE_F32:
+        SWITCH_IN_TYPE(float, _input_dtype)
+    case INFINI_DTYPE_F64:
+        SWITCH_IN_TYPE(double, _input_dtype)
+    case INFINI_DTYPE_BF16:
+        SWITCH_IN_TYPE(bf16_t, _input_dtype)
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::cast::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.h b/src/infiniop/ops/cast/cpu/cast_cpu.h
new file mode 100644
index 000000000..47485e883
--- /dev/null
+++ b/src/infiniop/ops/cast/cpu/cast_cpu.h
@@ -0,0 +1,58 @@
+#ifndef CAST_CPU_H
+#define CAST_CPU_H
+
+#include "../../../../utils/custom_types.h"
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <memory>
+
+namespace op::cast::cpu {
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _input_dtype, _output_dtype;
+    op::elementwise::ElementwiseInfo _info;
+    std::unique_ptr<op::elementwise::cpu::DeviceImpl> _device_info;
+    size_t _workspace_size;
+
+    Descriptor(
+        infiniDtype_t input_dtype,
+        infiniDtype_t output_dtype,
+        op::elementwise::ElementwiseInfo info,
+        op::elementwise::cpu::DeviceImpl *device_info,
+        size_t workspace_size,
+        infiniDevice_t device_type,
+        int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _input_dtype(input_dtype),
+          _output_dtype(output_dtype),
+          _info(std::move(info)),
+          _device_info(std::move(device_info)),
+          _workspace_size(workspace_size) {}
+
+public:
+    ~Descriptor();
+
+    size_t workspaceSize() const { return _workspace_size; }
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle_,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t output_desc,
+        std::vector<infiniopTensorDescriptor_t> input_desc);
+
+    infiniStatus_t calculate(
+        void *workspace, size_t workspace_size,
+        void *output,
+        std::vector<const void *> inputs,
+        void *stream) const;
+};
+struct CastOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename OUT_TYPE, typename IN_TYPE>
+    OUT_TYPE operator()(const IN_TYPE &x) const {
+        return utils::cast<OUT_TYPE>(x);
+    }
+};
+
+} // namespace op::cast::cpu
+
+#endif // CAST_CPU_H
\ No newline at end of file
diff --git a/src/infiniop/ops/cast/cuda/kernel.cuh b/src/infiniop/ops/cast/cuda/kernel.cuh
new file mode 100644
index 000000000..98556fd71
--- /dev/null
+++ b/src/infiniop/ops/cast/cuda/kernel.cuh
@@ -0,0 +1,77 @@
+namespace op::cast::cuda {
+
+typedef struct CastOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+private:
+    template <typename T_dst, typename T_src>
+    __device__ __forceinline__ T_dst cast_impl(const T_src &x) const {
+        if constexpr (std::is_same_v<T_src, T_dst>) {
+            return x;
+        } else if constexpr (std::is_same_v<T_src, half>) {
+            // From half
+            if constexpr (std::is_same_v<T_dst, float>) {
+                return __half2float(x);
+            } else if constexpr (std::is_same_v<T_dst, double>) {
+                return static_cast<double>(__half2float(x));
+            } else if constexpr (std::is_same_v<T_dst, cuda_bfloat16>) {
+                return __float2bfloat16(__half2float(x));
+            } else if constexpr (std::is_integral_v<T_dst>) {
+                return static_cast<T_dst>(__half2float(x));
+            } else {
+                return static_cast<T_dst>(__half2float(x));
+            }
+        } else if constexpr (std::is_same_v<T_src, cuda_bfloat16>) {
+            // From bfloat16
+            if constexpr (std::is_same_v<T_dst, float>) {
+                return __bfloat162float(x);
+            } else if constexpr (std::is_same_v<T_dst, double>) {
+                return static_cast<double>(__bfloat162float(x));
+            } else if constexpr (std::is_same_v<T_dst, half>) {
+                return __float2half(__bfloat162float(x));
+            } else if constexpr (std::is_integral_v<T_dst>) {
+                return static_cast<T_dst>(__bfloat162float(x));
+            } else {
+                return static_cast<T_dst>(__bfloat162float(x));
+            }
+        } else if constexpr (std::is_same_v<T_dst, half>) {
+            // To half
+            if constexpr (std::is_same_v<T_src, float>) {
+                return __float2half(x);
+            } else if constexpr (std::is_same_v<T_src, double>) {
+                return __float2half(static_cast<float>(x));
+            } else {
+                return __float2half(static_cast<float>(x));
+            }
+        } else if constexpr (std::is_same_v<T_dst, cuda_bfloat16>) {
+            // To bfloat16
+            if constexpr (std::is_same_v<T_src, float>) {
+                return __float2bfloat16(x);
+            } else if constexpr (std::is_same_v<T_src, double>) {
+                return __float2bfloat16(static_cast<float>(x));
+            } else {
+                return __float2bfloat16(static_cast<float>(x));
+            }
+        } else if constexpr (std::is_same_v<T_src, half2>) {
+            // Handle half2 special case
+            if constexpr (std::is_same_v<T_dst, float>) {
+                return __half2float(__low2half(x));
+            } else {
+                return static_cast<T_dst>(__half2float(__low2half(x)));
+            }
+        } else {
+            // Direct cast for other cases
+            return static_cast<T_dst>(x);
+        }
+    }
+
+public:
+    template <typename T_dst, typename T_src>
+    __device__ __forceinline__ T_dst operator()(const T_src &x) const {
+        return cast_impl<T_dst, T_src>(x);
+    }
+
+} CastOp;
+
+} // namespace op::cast::cuda
diff --git a/src/infiniop/ops/cast/metax/cast_metax.h b/src/infiniop/ops/cast/metax/cast_metax.h
new file mode 100644
index 000000000..0ae57feb2
--- /dev/null
+++ b/src/infiniop/ops/cast/metax/cast_metax.h
@@ -0,0 +1,48 @@
+#ifndef CAST_METAX_API_H
+#define CAST_METAX_API_H
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+#include <memory>
+
+namespace op::cast::metax {
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _input_dtype, _output_dtype;
+    op::elementwise::ElementwiseInfo _info;
+    std::unique_ptr<op::elementwise::metax::DeviceImpl> _device_info;
+    size_t _workspace_size;
+
+    Descriptor(
+        infiniDtype_t
input_dtype, + infiniDtype_t output_dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _input_dtype(input_dtype), + _output_dtype(output_dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_desc); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; +} // namespace op::cast::metax + +#endif // CAST_METAX_API_H \ No newline at end of file diff --git a/src/infiniop/ops/cast/metax/cast_metax.maca b/src/infiniop/ops/cast/metax/cast_metax.maca new file mode 100644 index 000000000..5e2c73022 --- /dev/null +++ b/src/infiniop/ops/cast/metax/cast_metax.maca @@ -0,0 +1,101 @@ +#include "cast_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cast::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto output_dtype = out_desc->dtype(); + auto input_dtype = input_desc_vec.at(0)->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_SAME_SHAPE(out_desc->shape(), input_desc_vec.at(0)->shape()); + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + // create metax elementwise descriptor + *desc_ptr = new Descriptor( + input_dtype, + output_dtype, + std::move(info), + std::move(device_impl_result.take()), + workspace_size, + handle->device, + handle->device_id); + return INFINI_STATUS_SUCCESS; +} + + +#define SWITCH_IN_TYPE_METAX(OUT_TYPE, IN_TYPE) \ + switch(IN_TYPE){ \ + case INFINI_DTYPE_I32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, int32_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_I64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, int64_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_U32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, uint32_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_U64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, uint64_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, double>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ 
+ return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // Handle type conversions based on input and output types + switch (_output_dtype) { + case INFINI_DTYPE_I32: + SWITCH_IN_TYPE_METAX(int32_t,_input_dtype) + case INFINI_DTYPE_I64: + SWITCH_IN_TYPE_METAX(int64_t,_input_dtype) + case INFINI_DTYPE_U32: + SWITCH_IN_TYPE_METAX(uint32_t,_input_dtype) + case INFINI_DTYPE_U64: + SWITCH_IN_TYPE_METAX(uint64_t,_input_dtype) + case INFINI_DTYPE_F16: + SWITCH_IN_TYPE_METAX(half,_input_dtype) + case INFINI_DTYPE_F32: + SWITCH_IN_TYPE_METAX(float,_input_dtype) + case INFINI_DTYPE_F64: + SWITCH_IN_TYPE_METAX(double,_input_dtype) + case INFINI_DTYPE_BF16: + SWITCH_IN_TYPE_METAX(cuda_bfloat16,_input_dtype) + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cast::metax diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cu b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu new file mode 100644 index 000000000..238af6857 --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu @@ -0,0 +1,98 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cast_nvidia.cuh" + +namespace op::cast::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto output_dtype = out_desc->dtype(); + auto input_dtype = input_desc_vec.at(0)->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_SAME_SHAPE(out_desc->shape(), input_desc_vec.at(0)->shape()); + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + // Create DeviceImpl using the correct pattern from the macro + auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + // Create nvidia elementwise descriptor + *desc_ptr = new Descriptor( + input_dtype, + output_dtype, + std::move(info), + std::move(device_impl_result.take()), + workspace_size, + handle->device, + handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +#define SWITCH_IN_TYPE_NVIDIA(OUT_TYPE, IN_TYPE) \ + switch (IN_TYPE) { \ + case INFINI_DTYPE_I32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, int32_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_I64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, int64_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_U32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, uint32_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_U64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, uint64_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, half>(_info, workspace, output, inputs, stream); \ + case 
INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, double>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // Handle type conversions based on input and output types + switch (_output_dtype) { + case INFINI_DTYPE_I32: + SWITCH_IN_TYPE_NVIDIA(int32_t, _input_dtype) + case INFINI_DTYPE_I64: + SWITCH_IN_TYPE_NVIDIA(int64_t, _input_dtype) + case INFINI_DTYPE_U32: + SWITCH_IN_TYPE_NVIDIA(uint32_t, _input_dtype) + case INFINI_DTYPE_U64: + SWITCH_IN_TYPE_NVIDIA(uint64_t, _input_dtype) + case INFINI_DTYPE_F16: + SWITCH_IN_TYPE_NVIDIA(half, _input_dtype) + case INFINI_DTYPE_F32: + SWITCH_IN_TYPE_NVIDIA(float, _input_dtype) + case INFINI_DTYPE_F64: + SWITCH_IN_TYPE_NVIDIA(double, _input_dtype) + case INFINI_DTYPE_BF16: + SWITCH_IN_TYPE_NVIDIA(cuda_bfloat16, _input_dtype) + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cast::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh new file mode 100644 index 000000000..09418f0db --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh @@ -0,0 +1,47 @@ +#ifndef CAST_NVIDIA_API_H +#define CAST_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" +#include + +namespace op::cast::nvidia { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _input_dtype, _output_dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + + Descriptor( + infiniDtype_t input_dtype, + infiniDtype_t output_dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _input_dtype(input_dtype), + _output_dtype(output_dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_desc); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; +} // namespace op::cast::nvidia +#endif // CAST_NVIDIA_API_H diff --git a/src/infiniop/ops/cast/operator.cc b/src/infiniop/ops/cast/operator.cc new file mode 100644 index 000000000..dc2589741 --- /dev/null +++ b/src/infiniop/ops/cast/operator.cc @@ -0,0 +1,141 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cast.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cast_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cast_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cast_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCastDescriptor( + infiniopHandle_t handle, + 
infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cast::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCast( + infiniopCastDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc new file mode 100644 index 000000000..578b55281 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -0,0 +1,52 @@ +#include "cos_cpu.h" + +namespace op::cos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + 
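    // (Editor's sketch, not part of the patch.) For orientation, the
    // host-side lifecycle this descriptor takes part in, assuming a valid
    // handle and device buffers d_y / d_x matching the descriptors validated
    // here:
    //
    //     infiniopCosDescriptor_t desc;
    //     infiniopCreateCosDescriptor(handle, &desc, y_desc, x_desc);
    //     size_t ws;
    //     infiniopGetCosWorkspaceSize(desc, &ws);
    //     void *workspace;
    //     infinirtMalloc(&workspace, ws);
    //     infiniopCos(desc, workspace, ws, d_y, d_x, /*stream=*/nullptr);
    //     infinirtFree(workspace);
    //     infiniopDestroyCosDescriptor(desc);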
CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::cpu \ No newline at end of file diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h new file mode 100644 index 000000000..45dbba919 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.h @@ -0,0 +1,33 @@ +#ifndef COS_CPU_H +#define COS_CPU_H + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(cos, cpu) + +namespace op::cos::cpu { +typedef struct CosOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + // cos(x) = cosine of x + if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::cos(x_f)); + } else if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::cos(x_f)); + } else if constexpr (std::is_same_v) { + return std::cos(x); + } else if constexpr (std::is_same_v) { + return std::cos(x); + } else { + return std::cos(x); + } + } +} CosOp; +} // namespace op::cos::cpu + +#endif // COS_CPU_H \ No newline at end of file diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh new file mode 100644 index 000000000..3d909ce4e --- /dev/null +++ b/src/infiniop/ops/cos/cuda/kernel.cuh @@ -0,0 +1,46 @@ +#ifndef COS_CUDA_H +#define COS_CUDA_H + +namespace op::cos::cuda { +typedef struct CosOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // For half2, process each half separately using CUDA intrinsics + half x_low = __low2half(x); + half x_high = __high2half(x); + + float x_low_f = __half2float(x_low); + float x_high_f = __half2float(x_high); + + half cos_low = __float2half(cosf(x_low_f)); + half cos_high = __float2half(cosf(x_high_f)); + + return __halves2half2(cos_low, cos_high); + } else if constexpr (std::is_same_v) { + // Convert to float for computation to maintain precision + float x_f = __half2float(x); + float result = cosf(x_f); + return __float2half(result); + } else if constexpr (std::is_same_v) { + // Convert to float for computation to maintain precision + float x_f = __bfloat162float(x); + float result = cosf(x_f); + return __float2bfloat16(result); + } else if constexpr (std::is_same_v) { + // Use fast math functions for float + return cosf(x); + } else if constexpr (std::is_same_v) { + return ::cos(x); + } else { + // Fallback + return cosf(x); + } + } +} CosOp; +} // namespace op::cos::cuda + +#endif // COS_CUDA_H \ No newline at end of file diff --git a/src/infiniop/ops/cos/metax/cos_metax.h b/src/infiniop/ops/cos/metax/cos_metax.h new file mode 100644 index 000000000..9c43dfd5f --- /dev/null +++ 
b/src/infiniop/ops/cos/metax/cos_metax.h @@ -0,0 +1,8 @@ +#ifndef COS_METAX_API_H +#define COS_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(cos, metax) + +#endif // COS_METAX_API_H diff --git a/src/infiniop/ops/cos/metax/cos_metax.maca b/src/infiniop/ops/cos/metax/cos_metax.maca new file mode 100644 index 000000000..894c8ca9f --- /dev/null +++ b/src/infiniop/ops/cos/metax/cos_metax.maca @@ -0,0 +1,60 @@ +#include "cos_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cos::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::metax \ No newline at end of file diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu new file mode 100644 index 000000000..bee985672 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cos_nvidia.cuh" + +namespace op::cos::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < 
_workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh new file mode 100644 index 000000000..7849028e9 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef COS_NVIDIA_API_H +#define COS_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cos, nvidia) + +#endif // COS_NVIDIA_API_H diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc new file mode 100644 index 000000000..e8ddeddf5 --- /dev/null +++ b/src/infiniop/ops/cos/operator.cc @@ -0,0 +1,141 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cos_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cos_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCosDescriptor( + infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCos( + infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + 
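    // (Editor's note.) Each CALCULATE(CASE, NAMESPACE) case below expands to
    // a cast plus a dispatch into the backend descriptor; the CPU branch is
    // equivalent to:
    //
    //     case INFINI_DEVICE_CPU:
    //         return reinterpret_cast<op::cos::cpu::Descriptor *>(desc)
    //             ->calculate(workspace, workspace_size, output, {input}, stream);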
CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc new file mode 100644 index 000000000..61456efad --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc @@ -0,0 +1,52 @@ +#include "exp_cpu.h" + +namespace op::exp::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::cpu \ No newline at end of file diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h new file mode 100644 index 000000000..47eb5e7a3 --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -0,0 +1,27 @@ +#ifndef EXP_CPU_H +#define EXP_CPU_H + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(exp, cpu) + +namespace op::exp::cpu { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &a) const { + if constexpr (std::is_same_v) { + return std::exp(a); + } else if constexpr (std::is_same_v) { + return std::exp(a); + } else { + // For fp16_t and bf16_t, convert to float, compute exp, then convert back + return static_cast(std::exp(static_cast(a))); + } + } +} ExpOp; +} // namespace op::exp::cpu + +#endif // EXP_CPU_H diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh 
b/src/infiniop/ops/exp/cuda/kernel.cuh new file mode 100644 index 000000000..2eafe9566 --- /dev/null +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -0,0 +1,37 @@ +#ifndef EXP_CUDA_H +#define EXP_CUDA_H + +namespace op::exp::cuda { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &a) const { + if constexpr (std::is_same_v) { + // For half2, split into two halves, compute exp, then combine + half2 result; + result.x = __float2half(expf(__half2float(a.x))); + result.y = __float2half(expf(__half2float(a.y))); + return result; + } else if constexpr (std::is_same_v) { + // Convert half to float, compute exp, convert back + float fa = __half2float(a); + float result = expf(fa); + return __float2half(result); + } else if constexpr (std::is_same_v) { + // Convert bf16 to float, compute exp, then convert back + float fa = __bfloat162float(a); + float result = expf(fa); + return __float2bfloat16(result); + } else if constexpr (std::is_same_v) { + return expf(a); + } else if constexpr (std::is_same_v) { + return ::exp(a); + } else { + return ::exp(a); + } + } +} ExpOp; +} // namespace op::exp::cuda + +#endif // EXP_CUDA_H \ No newline at end of file diff --git a/src/infiniop/ops/exp/metax/exp_metax.h b/src/infiniop/ops/exp/metax/exp_metax.h new file mode 100644 index 000000000..64fa186cf --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.h @@ -0,0 +1,8 @@ +#ifndef EXP_METAX_API_H +#define EXP_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(exp, metax) + +#endif // EXP_METAX_API_H diff --git a/src/infiniop/ops/exp/metax/exp_metax.maca b/src/infiniop/ops/exp/metax/exp_metax.maca new file mode 100644 index 000000000..a214634b9 --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.maca @@ -0,0 +1,60 @@ +#include "exp_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::exp::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create MetaX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return 
INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::metax \ No newline at end of file diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu new file mode 100644 index 000000000..f79846145 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "exp_nvidia.cuh" + +namespace op::exp::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh new file mode 100644 index 000000000..596d88d62 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef EXP_NVIDIA_API_H +#define EXP_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp, nvidia) + +#endif // EXP_NVIDIA_API_H diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc new file mode 100644 index 000000000..674d8dbfc --- /dev/null +++ b/src/infiniop/ops/exp/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/exp.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/exp_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/exp_metax.h" +#endif + +__C infiniStatus_t infiniopCreateExpDescriptor( + infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + 
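    // (Editor's note.) ExpOp computes e^x elementwise; for f32 inputs
    // {0, 1, -1} the expected outputs are {1, 2.7182817, 0.36787944}. The
    // F16 and BF16 paths round-trip through float, so results are these
    // values rounded to the narrower type.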
CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopExp( + infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc new file mode 100644 index 000000000..606db3a1f --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -0,0 +1,52 @@ +#include "hardswish_cpu.h" + +namespace op::hardswish::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector 
inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::cpu \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h new file mode 100644 index 000000000..54ce51b18 --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -0,0 +1,41 @@ + +#ifndef HARDSWISH_CPU_H +#define HARDSWISH_CPU_H + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include +#include + +ELEMENTWISE_DESCRIPTOR(hardswish, cpu) + +namespace op::hardswish::cpu { +typedef struct HardSwishOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + // HardSwish(x) = x * HardSigmoid(x) = x * max(0, min(1, (x + 3) / 6)) + if constexpr (std::is_same_v) { + float x_f = static_cast(x); + float hard_sigmoid = std::max(0.0f, std::min(1.0f, (x_f + 3.0f) / 6.0f)); + return static_cast(x_f * hard_sigmoid); + } else if constexpr (std::is_same_v) { + float x_f = static_cast(x); + float hard_sigmoid = std::max(0.0f, std::min(1.0f, (x_f + 3.0f) / 6.0f)); + return static_cast(x_f * hard_sigmoid); + } else if constexpr (std::is_same_v) { + float hard_sigmoid = std::max(0.0f, std::min(1.0f, (x + 3.0f) / 6.0f)); + return x * hard_sigmoid; + } else if constexpr (std::is_same_v) { + double hard_sigmoid = std::max(0.0, std::min(1.0, (x + 3.0) / 6.0)); + return x * hard_sigmoid; + } else { + float x_f = static_cast(x); + float hard_sigmoid = std::max(0.0f, std::min(1.0f, (x_f + 3.0f) / 6.0f)); + return static_cast(x_f * hard_sigmoid); + } + } +} HardSwishOp; +} // namespace op::hardswish::cpu + +#endif // HARDSWISH_CPU_H \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh new file mode 100644 index 000000000..cd55aa874 --- /dev/null +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -0,0 +1,36 @@ +#ifndef HARDSWISH_CUDA_H +#define HARDSWISH_CUDA_H + +namespace op::hardswish::cuda { + +typedef struct HardSwishOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + float xf = __half2float(x); + float t = fminf(6.0f, fmaxf(0.0f, xf + 3.0f)) * (1.0f / 6.0f); + return __float2half(xf * t); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float t = fminf(6.0f, fmaxf(0.0f, xf + 3.0f)) * (1.0f / 6.0f); + return __float2bfloat16(xf * t); + } else if constexpr (std::is_same_v) { + float t = fminf(6.0f, fmaxf(0.0f, x + 3.0f)) * (1.0f / 6.0f); + return x * t; + } else if constexpr (std::is_same_v) { + double t = fmin(6.0, fmax(0.0, x + 3.0)) * (1.0 / 6.0); + return x * t; + } else { + float xf = static_cast(x); + float t = fminf(6.0f, fmaxf(0.0f, xf + 3.0f)) * (1.0f / 6.0f); + return static_cast(xf * t); + } + } +} HardSwishOp; + +} // namespace op::hardswish::cuda + +#endif // HARDSWISH_CUDA_H diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h new file mode 100644 
index 000000000..cfde66aa0 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.h @@ -0,0 +1,9 @@ + +#ifndef HARDSWISH_METAX_API_H +#define HARDSWISH_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, metax) + +#endif // HARDSWISH_METAX_API_H diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca new file mode 100644 index 000000000..308f4c493 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca @@ -0,0 +1,60 @@ +#include "hardswish_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardswish::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::metax \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..b3e9f7d5a --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +namespace op::hardswish::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + 
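    // (Editor's note.) HardSwishOp computes x * clamp(x + 3, 0, 6) / 6, so
    // hardswish(-4) = 0, hardswish(-1) = -1/3, hardswish(0) = 0, and
    // hardswish(x) = x for x >= 3 (e.g. hardswish(3) = 3).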
return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..8fd92a3a6 --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,9 @@ + +#ifndef HARDSWISH_NVIDIA_API_H +#define HARDSWISH_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif // HARDSWISH_NVIDIA_API_H diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..0a807a022 --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,141 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardswish_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardswish_metax.h" +#endif + +__C infiniStatus_t infiniopCreateHardSwishDescriptor( + infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopHardSwish( + infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.cc b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.cc new file mode 100644 index 000000000..b8d09a1af --- /dev/null +++ b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.cc @@ -0,0 +1,64 @@ +#include "leaky_relu_cpu.h" + +namespace op::leaky_relu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float negative_slope) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor( + dtype, + info_result.take(), + nullptr, + 0, + handle->device, + handle->device_id, + negative_slope); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream, _negative_slope); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream, _negative_slope); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream, _negative_slope); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream, _negative_slope); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::leaky_relu::cpu \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.h b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.h new file mode 100644 index 000000000..7cd46289c --- /dev/null +++ 
b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.h
@@ -0,0 +1,78 @@
+#ifndef LEAKY_RELU_CPU_H
+#define LEAKY_RELU_CPU_H
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <type_traits>
+
+namespace op::leaky_relu::cpu {
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _dtype;
+    op::elementwise::ElementwiseInfo _info;
+    std::unique_ptr<op::elementwise::cpu::DeviceImpl> _device_info;
+    size_t _workspace_size;
+    float _negative_slope;
+
+    Descriptor(
+        infiniDtype_t dtype,
+        op::elementwise::ElementwiseInfo info,
+        op::elementwise::cpu::DeviceImpl *device_info,
+        size_t workspace_size,
+        infiniDevice_t device_type,
+        int device_id,
+        float negative_slope)
+        : InfiniopDescriptor{device_type, device_id},
+          _dtype(dtype),
+          _info(std::move(info)),
+          _device_info(std::move(device_info)),
+          _workspace_size(workspace_size),
+          _negative_slope(negative_slope) {}
+
+public:
+    ~Descriptor();
+
+    size_t workspaceSize() const { return _workspace_size; }
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t output_desc,
+        std::vector<infiniopTensorDescriptor_t> input_desc,
+        float negative_slope);
+
+    infiniStatus_t calculate(
+        void *workspace, size_t workspace_size,
+        void *output,
+        std::vector<const void *> inputs,
+        void *stream) const;
+};
+
+typedef struct LeakyReLUOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x, float negative_slope) const {
+        // LeakyReLU(x) = max(0, x) + negative_slope * min(0, x)
+        // Equivalent to: x >= 0 ? x : negative_slope * x
+        if constexpr (std::is_same_v<T, fp16_t>) {
+            float x_f = static_cast<float>(x);
+            float result = x_f >= 0.0f ? x_f : negative_slope * x_f;
+            return static_cast<fp16_t>(result);
+        } else if constexpr (std::is_same_v<T, bf16_t>) {
+            float x_f = static_cast<float>(x);
+            float result = x_f >= 0.0f ? x_f : negative_slope * x_f;
+            return static_cast<bf16_t>(result);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return x >= 0.0f ? x : negative_slope * x;
+        } else if constexpr (std::is_same_v<T, double>) {
+            return x >= 0.0 ? x : static_cast<double>(negative_slope) * x;
+        } else {
+            return x >= T(0) ? x : static_cast<T>(negative_slope) * x;
+        }
+    }
+
+} LeakyReLUOp;
+
+} // namespace op::leaky_relu::cpu
+
+#endif // LEAKY_RELU_CPU_H
\ No newline at end of file
diff --git a/src/infiniop/ops/leaky_relu/cuda/kernel.cuh b/src/infiniop/ops/leaky_relu/cuda/kernel.cuh
new file mode 100644
index 000000000..460538443
--- /dev/null
+++ b/src/infiniop/ops/leaky_relu/cuda/kernel.cuh
@@ -0,0 +1,42 @@
+#ifndef LEAKY_RELU_CUDA_H
+#define LEAKY_RELU_CUDA_H
+
+namespace op::leaky_relu::cuda {
+typedef struct LeakyReLUOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    // __host__ __device__ LeakyReLUOp() = default;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x, const float *negative_slope) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // For half2, process each half separately
+            half x_low = __low2half(x);
+            half x_high = __high2half(x);
+            half result_low = x_low >= __float2half(0.0f) ? x_low : __float2half(*negative_slope) * x_low;
+            half result_high = x_high >= __float2half(0.0f) ? x_high : __float2half(*negative_slope) * x_high;
+            return __halves2half2(result_low, result_high);
+        } else if constexpr (std::is_same_v<T, half>) {
+            // Use CUDA half operations
+            half zero = __float2half(0.0f);
+            half neg_slope = __float2half(*negative_slope);
+            return x >= zero ? x : neg_slope * x;
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            // Convert to float for computation to maintain precision
+            float x_f = __bfloat162float(x);
+            float result = x_f >= 0.0f ?
x_f : *negative_slope * x_f; + return __float2bfloat16(result); + } else if constexpr (std::is_same_v) { + return x >= 0.0f ? x : *negative_slope * x; + } else if constexpr (std::is_same_v) { + return x >= 0.0 ? x : static_cast(*negative_slope) * x; + } else { + // Fallback + return x >= T(0) ? x : static_cast(*negative_slope) * x; + } + } +} LeakyReLUOp; +} // namespace op::leaky_relu::cuda + +#endif // LEAKY_RELU_CUDA_H \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.h b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.h new file mode 100644 index 000000000..3feb273a0 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.h @@ -0,0 +1,49 @@ +#ifndef LEAKY_RELU_METAX_API_H +#define LEAKY_RELU_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +namespace op::leaky_relu::metax { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _negative_slope; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float negative_slope) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size), + _negative_slope(negative_slope) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_desc, + float negative_slope); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; +} // namespace op::leaky_relu::metax + +#endif // LEAKY_RELU_METAX_API_H \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.maca b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.maca new file mode 100644 index 000000000..441f89e59 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.maca @@ -0,0 +1,77 @@ +#include "leaky_relu_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::leaky_relu::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float negative_slope) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *) + sizeof(float);//device negative_slope + auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + *desc_ptr = new Descriptor( + dtype, + std::move(info), + std::move(device_impl_result.take()), + 
workspace_size, + handle->device, + handle->device_id, + negative_slope); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + const int8_t *d_negative_slope_start = reinterpret_cast(workspace) + workspace_size - sizeof(_negative_slope); + CHECK_METAX(hcMemcpyAsync((void *)d_negative_slope_start, + &_negative_slope, + sizeof(_negative_slope), + hcMemcpyHostToDevice, + reinterpret_cast(stream))); + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LeakyReLUOp, half>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LeakyReLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LeakyReLUOp, float>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LeakyReLUOp, double>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::leaky_relu::metax \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cu b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cu new file mode 100644 index 000000000..01ad07734 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cu @@ -0,0 +1,77 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "leaky_relu_nvidia.cuh" + +namespace op::leaky_relu::nvidia { + +Descriptor::~Descriptor() = default; +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float negative_slope) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create NVIDIA elementwise descriptor + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *) + sizeof(float); // device negative_slope + auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + dtype, + std::move(info), + std::move(device_impl_result.take()), + workspace_size, + handle->device, + handle->device_id, + negative_slope); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + const int8_t *d_negative_slope_start = reinterpret_cast(workspace) + workspace_size - sizeof(_negative_slope); + 
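+    // Workspace layout, as sized in create() above:
+    //     [ elementwise meta | input pointer array | negative_slope (one float) ]
+    // The slope is staged into the last sizeof(float) bytes of the caller-provided
+    // workspace; the async H2D copy below is issued on the same stream as the
+    // kernel launch, so the kernel is ordered after the copy and reads a valid value.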
CHECK_CUDA(cudaMemcpyAsync((void *)d_negative_slope_start, + &_negative_slope, + sizeof(_negative_slope), + cudaMemcpyHostToDevice, + reinterpret_cast(stream))); + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LeakyReLUOp, half>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LeakyReLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LeakyReLUOp, float>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LeakyReLUOp, double>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::leaky_relu::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cuh b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cuh new file mode 100644 index 000000000..bb3bf3c54 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cuh @@ -0,0 +1,49 @@ +#ifndef LEAKY_RELU_NVIDIA_API_H +#define LEAKY_RELU_NVIDIA_API_H + +#include "../../../../utils/custom_types.h" +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +namespace op::leaky_relu::nvidia { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _negative_slope; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float negative_slope) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size), + _negative_slope(negative_slope) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_desc, + float negative_slope); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; +} // namespace op::leaky_relu::nvidia +#endif // LEAKY_RELU_NVIDIA_API_H \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/operator.cc b/src/infiniop/ops/leaky_relu/operator.cc new file mode 100644 index 000000000..0f4ce436e --- /dev/null +++ b/src/infiniop/ops/leaky_relu/operator.cc @@ -0,0 +1,143 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/leaky_relu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/leaky_relu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/leaky_relu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/leaky_relu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLeakyReLUDescriptor( + infiniopHandle_t handle, + infiniopLeakyReLUDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + float negative_slope) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::leaky_relu::NAMESPACE::Descriptor::create( \ 
+ infiniopHandle_t(handle), \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + std::vector{input_desc}, \ + negative_slope) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLeakyReLUWorkspaceSize(infiniopLeakyReLUDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLeakyReLU( + infiniopLeakyReLUDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyLeakyReLUDescriptor(infiniopLeakyReLUDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc new file mode 100644 index 000000000..30dc3bf56 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc @@ -0,0 +1,54 @@ +#include "sigmoid_backward_cpu.h" + +namespace op::sigmoid_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + 
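+    // The public API (operator.cc below) packs the inputs as {input, grad_output},
+    // and the output tensor is grad_input. Since d/dx sigmoid(x) =
+    // sigmoid(x) * (1 - sigmoid(x)), the functor computes
+    //     grad_input = grad_output * sigmoid(input) * (1 - sigmoid(input)),
+    // so all three tensors must share one shape, which is checked below.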
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SigmoidBackwardOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SigmoidBackwardOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<SigmoidBackwardOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<SigmoidBackwardOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sigmoid_backward::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h
new file mode 100644
index 000000000..a581874e9
--- /dev/null
+++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h
@@ -0,0 +1,29 @@
+#ifndef SIGMOID_BACKWARD_CPU_H
+#define SIGMOID_BACKWARD_CPU_H
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+
+ELEMENTWISE_DESCRIPTOR(sigmoid_backward, cpu)
+
+namespace op::sigmoid_backward::cpu {
+typedef struct SigmoidBackwardOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    T operator()(const T &input, const T &grad_output) const {
+        // sigmoid_backward: grad_input = grad_output * sigmoid(input) * (1 - sigmoid(input))
+        T sigmoid_val;
+        if constexpr (std::is_same_v<T, fp16_t>) {
+            sigmoid_val = static_cast<fp16_t>(1.0f / (1.0f + std::exp(-static_cast<float>(input))));
+        } else if constexpr (std::is_same_v<T, bf16_t>) {
+            sigmoid_val = static_cast<bf16_t>(1.0f / (1.0f + std::exp(-static_cast<float>(input))));
+        } else {
+            sigmoid_val = static_cast<T>(1.0) / (static_cast<T>(1.0) + std::exp(-input));
+        }
+        return grad_output * sigmoid_val * (static_cast<T>(1.0) - sigmoid_val);
+    }
+} SigmoidBackwardOp;
+} // namespace op::sigmoid_backward::cpu
+
+#endif // SIGMOID_BACKWARD_CPU_H
diff --git a/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh
new file mode 100644
index 000000000..38ac9607d
--- /dev/null
+++ b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh
@@ -0,0 +1,56 @@
+#ifndef SIGMOID_BACKWARD_CUDA_H
+#define SIGMOID_BACKWARD_CUDA_H
+
+namespace op::sigmoid_backward::cuda {
+typedef struct SigmoidBackwardOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input, const T &grad_output) const {
+        T sigmoid_val;
+
+        if constexpr (std::is_same_v<T, half2>) {
+            // For half2, process each component
+            half2 one = __float2half2_rn(1.0f);
+            half2 neg_input = __hneg2(input);
+            half2 exp_neg_input = h2exp(neg_input);
+            sigmoid_val = __h2div(one, __hadd2(one, exp_neg_input));
+            return __hmul2(__hmul2(grad_output, sigmoid_val), __hsub2(one, sigmoid_val));
+        } else if constexpr (std::is_same_v<T, half>) {
+            half one = __float2half(1.0f);
+            // half neg_input = __hneg(input);
+            // half exp_neg_input = hexp(neg_input);
+            // sigmoid_val = __hdiv(one, __hadd(one, exp_neg_input));
+            sigmoid_val = sigmoid(input);
+            return __hmul(__hmul(grad_output, sigmoid_val), __hsub(one, sigmoid_val));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            cuda_bfloat16 one =
__float2bfloat16(1.0f); + // cuda_bfloat16 neg_input = __hneg(input); + // cuda_bfloat16 exp_neg_input = hexp(neg_input); + // sigmoid_val = __hdiv(one, __hadd(one, exp_neg_input)); + sigmoid_val = sigmoid(input); + return __hmul(__hmul(grad_output, sigmoid_val), __hsub(one, sigmoid_val)); + } else if constexpr (std::is_same_v) { + sigmoid_val = __fdiv_rn(1.0f, __fadd_rn(1.0f, expf(-input))); + return __fmul_rn(__fmul_rn(grad_output, sigmoid_val), __fsub_rn(1.0f, sigmoid_val)); + } else if constexpr (std::is_same_v) { + sigmoid_val = 1.0 / (1.0 + exp(-input)); + return grad_output * sigmoid_val * (1.0 - sigmoid_val); + } else { + // Fallback for other types + sigmoid_val = static_cast(1.0) / (static_cast(1.0) + exp(-input)); + return grad_output * sigmoid_val * (static_cast(1.0) - sigmoid_val); + } + } + +private: + __device__ __forceinline__ half sigmoid(const half &x) const { + return __float2half(__fdiv_rn(1.0f, __fadd_rn(1.0f, expf(__half2float(-x))))); + } + __device__ __forceinline__ cuda_bfloat16 sigmoid(const cuda_bfloat16 &x) const { + return __float2bfloat16(__fdiv_rn(1.0f, __fadd_rn(1.0f, expf(__bfloat162float(-x))))); + } +} SigmoidBackwardOp; +} // namespace op::sigmoid_backward::cuda + +#endif // SIGMOID_BACKWARD_CUDA_H diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h new file mode 100644 index 000000000..412e91e55 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef SIGMOID_BACKWARD_METAX_API_H +#define SIGMOID_BACKWARD_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, metax) + +#endif // SIGMOID_BACKWARD_METAX_API_H diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca new file mode 100644 index 000000000..3f3cf5382 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca @@ -0,0 +1,62 @@ +#include "sigmoid_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sigmoid_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::metax diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu new file mode 100644 index 000000000..6cab4d3f1 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sigmoid_backward_nvidia.cuh" + +namespace op::sigmoid_backward::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::nvidia diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh new file mode 100644 index 000000000..2dd69a167 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef SIGMOID_BACKWARD_NVIDIA_API_H +#define SIGMOID_BACKWARD_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, nvidia) + +#endif // SIGMOID_BACKWARD_NVIDIA_API_H diff --git a/src/infiniop/ops/sigmoid_backward/operator.cc b/src/infiniop/ops/sigmoid_backward/operator.cc new file mode 100644 
index 000000000..ce4258ae6 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sigmoid_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sigmoid_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sigmoid_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sigmoid_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSigmoidBackwardDescriptor( + infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sigmoid_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSigmoidBackward( + infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file 
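The sigmoid-backward entry points above follow the same create / query-workspace / run / destroy lifecycle as every other operator in this patch. A minimal host-side sketch (assuming a valid handle, three matching tensor descriptors, and device buffers already exist; device_alloc and the d_* buffer names are placeholders, not part of this patch):

    infiniopSigmoidBackwardDescriptor_t desc;
    infiniopCreateSigmoidBackwardDescriptor(handle, &desc,
                                            grad_input_desc, input_desc, grad_output_desc);

    size_t workspace_size = 0;
    infiniopGetSigmoidBackwardWorkspaceSize(desc, &workspace_size);
    void *workspace = device_alloc(workspace_size); // placeholder allocator

    infiniopSigmoidBackward(desc, workspace, workspace_size,
                            d_grad_input, d_input, d_grad_output, stream);
    infiniopDestroySigmoidBackwardDescriptor(desc);

Error handling is elided; each call returns an infiniStatus_t that should be checked against INFINI_STATUS_SUCCESS.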
diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.cc b/src/infiniop/ops/sin/cpu/sin_cpu.cc new file mode 100644 index 000000000..3a96798fb --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.cc @@ -0,0 +1,52 @@ +#include "sin_cpu.h" + +namespace op::sin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::cpu \ No newline at end of file diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h new file mode 100644 index 000000000..838fd97ec --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.h @@ -0,0 +1,33 @@ +#ifndef SIN_CPU_H +#define SIN_CPU_H + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(sin, cpu) + +namespace op::sin::cpu { +typedef struct SinOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + // sin(x) + if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::sin(x_f)); + } else if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::sin(x_f)); + } else if constexpr (std::is_same_v) { + return std::sin(x); + } else if constexpr (std::is_same_v) { + return std::sin(x); + } else { + return std::sin(x); + } + } +} SinOp; +} // namespace op::sin::cpu + +#endif // SIN_CPU_H \ No newline at end of file diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh new file mode 100644 index 000000000..d052e7d2b --- /dev/null +++ b/src/infiniop/ops/sin/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef SIN_CUDA_H +#define SIN_CUDA_H + +namespace op::sin::cuda { +typedef struct SinOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // For half2, process each half separately + half x_low = __low2half(x); + half x_high = __high2half(x); + half sin_low = hsin(x_low); + half sin_high = hsin(x_high); + return __halves2half2(sin_low, sin_high); + } else if constexpr (std::is_same_v) { + // Use CUDA half sin function + return hsin(x); + } else if constexpr (std::is_same_v) { + // Convert to float for computation to maintain precision + float x_f = __bfloat162float(x); + float result 
= sinf(x_f); + return __float2bfloat16(result); + } else if constexpr (std::is_same_v) { + // Use fast math functions for float + return sinf(x); + } else if constexpr (std::is_same_v) { + return ::sin(x); + } else { + // Fallback + return sinf(x); + } + } +} SinOp; +} // namespace op::sin::cuda + +#endif // SIN_CUDA_H \ No newline at end of file diff --git a/src/infiniop/ops/sin/metax/sin_metax.h b/src/infiniop/ops/sin/metax/sin_metax.h new file mode 100644 index 000000000..2b744fc60 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.h @@ -0,0 +1,8 @@ +#ifndef SIN_METAX_API_H +#define SIN_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sin, metax) + +#endif // SIN_METAX_API_H diff --git a/src/infiniop/ops/sin/metax/sin_metax.maca b/src/infiniop/ops/sin/metax/sin_metax.maca new file mode 100644 index 000000000..5700c791b --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.maca @@ -0,0 +1,60 @@ +#include "sin_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sin::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::metax diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu new file mode 100644 index 000000000..4676e3290 --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sin_nvidia.cuh" + +namespace op::sin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = 
input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::nvidia diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh new file mode 100644 index 000000000..93debcaf6 --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef SIN_NVIDIA_API_H +#define SIN_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sin, nvidia) + +#endif // SIN_NVIDIA_API_H diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc new file mode 100644 index 000000000..5f1873f3d --- /dev/null +++ b/src/infiniop/ops/sin/operator.cc @@ -0,0 +1,141 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sin_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSinDescriptor( + infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef 
GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSin( + infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.cc b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc new file mode 100644 index 000000000..4ce419477 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc @@ -0,0 +1,52 @@ +#include "tanh_cpu.h" + +namespace op::tanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::cpu \ No newline at end of file diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.h b/src/infiniop/ops/tanh/cpu/tanh_cpu.h new file mode 100644 index 000000000..86592b666 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.h @@ -0,0 +1,34 @@ +#ifndef TANH_CPU_H +#define TANH_CPU_H + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(tanh, cpu) + +namespace op::tanh::cpu { +typedef struct TanhOp { +public: + static constexpr 
size_t num_inputs = 1; + template + T operator()(const T &x) const { + // tanh(x) = (exp(2*x) - 1) / (exp(2*x) + 1) + // or more stable: tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) + if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::tanh(x_f)); + } else if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::tanh(x_f)); + } else if constexpr (std::is_same_v) { + return std::tanh(x); + } else if constexpr (std::is_same_v) { + return std::tanh(x); + } else { + return std::tanh(x); + } + } +} TanhOp; +} // namespace op::tanh::cpu + +#endif // TANH_CPU_H \ No newline at end of file diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh new file mode 100644 index 000000000..433ae7d68 --- /dev/null +++ b/src/infiniop/ops/tanh/cuda/kernel.cuh @@ -0,0 +1,44 @@ +#ifndef TANH_CUDA_H +#define TANH_CUDA_H + +namespace op::tanh::cuda { +typedef struct TanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // Use CUDA intrinsic for half precision + return htanh(x); + } else if constexpr (std::is_same_v) { + // Convert to float for computation to maintain precision + float x_f = __bfloat162float(x); + float result = tanhf(x_f); + return __float2bfloat16(result); + } else if constexpr (std::is_same_v) { + // Use fast math functions for float + return tanhf(x); + // return static_cast(::tanhf(static_cast(x))); + } else if constexpr (std::is_same_v) { + return ::tanh(x); + } else { + // Fallback + return tanhf(x); + } + } + +private: + // Helper function for half precision tanh (assuming it exists or can be approximated) + __device__ __forceinline__ half htanh(const half &x) const { + return __float2half(tanhf(__half2float(x))); + } + + // Helper function for bfloat16 precision tanh (assuming it exists or can be approximated) + __device__ __forceinline__ cuda_bfloat16 htanh(const cuda_bfloat16 &x) const { + return __float2bfloat16(tanhf(__bfloat162float(x))); + } +} TanhOp; +} // namespace op::tanh::cuda + +#endif // TANH_CUDA_H \ No newline at end of file diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.h b/src/infiniop/ops/tanh/metax/tanh_metax.h new file mode 100644 index 000000000..c05b4ec26 --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.h @@ -0,0 +1,8 @@ +#ifndef TANH_METAX_API_H +#define TANH_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tanh, metax) + +#endif // TANH_METAX_API_H diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.maca b/src/infiniop/ops/tanh/metax/tanh_metax.maca new file mode 100644 index 000000000..cf0756f6b --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.maca @@ -0,0 +1,60 @@ +#include "tanh_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::tanh::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, 
input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::metax \ No newline at end of file diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu new file mode 100644 index 000000000..b93164765 --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanh_nvidia.cuh" + +namespace op::tanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh new file mode 100644 index 000000000..8c1acb30b --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef TANH_NVIDIA_API_H +#define TANH_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanh, nvidia) 
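+
+// Note: ELEMENTWISE_DESCRIPTOR(op, backend) declares the op::tanh::nvidia::Descriptor
+// boilerplate (create / workspaceSize / calculate) shared by parameter-free
+// elementwise ops; compare the hand-written Descriptor in leaky_relu_nvidia.cuh,
+// which skips the macro only because it must also carry negative_slope.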
diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh
new file mode 100644
index 000000000..8c1acb30b
--- /dev/null
+++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef TANH_NVIDIA_API_H
+#define TANH_NVIDIA_API_H
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(tanh, nvidia)
+
+#endif // TANH_NVIDIA_API_H
diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc
new file mode 100644
index 000000000..bbdb19c29
--- /dev/null
+++ b/src/infiniop/ops/tanh/operator.cc
@@ -0,0 +1,141 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/tanh.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/tanh_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/tanh_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/tanh_metax.h"
+#endif
+
+__C infiniStatus_t infiniopCreateTanhDescriptor(
+    infiniopHandle_t handle,
+    infiniopTanhDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output_desc,
+    infiniopTensorDescriptor_t input_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                              \
+    case CASE:                                                               \
+        return op::tanh::NAMESPACE::Descriptor::create(                      \
+            handle,                                                          \
+            reinterpret_cast<op::tanh::NAMESPACE::Descriptor **>(desc_ptr),  \
+            output_desc,                                                     \
+            {input_desc})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                  \
+    case CASE:                                                                                \
+        *size = reinterpret_cast<op::tanh::NAMESPACE::Descriptor *>(desc)->workspaceSize();   \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopTanh(
+    infiniopTanhDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                               \
+    case CASE:                                                                   \
+        return reinterpret_cast<const op::tanh::NAMESPACE::Descriptor *>(desc)   \
+            ->calculate(workspace, workspace_size, output, {input}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
+        delete reinterpret_cast<op::tanh::NAMESPACE::Descriptor *>(desc);   \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
\ No newline at end of file
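For reviewers, the full lifecycle of the API above is the same as for every other elementwise operator in this repository: create the descriptor, query the workspace, run, destroy. A minimal usage sketch, assuming the handle, the two tensor descriptors and the device buffers are created elsewhere (those steps are outside this diff):

infiniStatus_t run_tanh(infiniopHandle_t handle,
                        infiniopTensorDescriptor_t y_desc,
                        infiniopTensorDescriptor_t x_desc,
                        void *y, const void *x,
                        void *workspace, size_t workspace_cap,
                        void *stream) {
    infiniopTanhDescriptor_t desc;
    infiniStatus_t st = infiniopCreateTanhDescriptor(handle, &desc, y_desc, x_desc);
    if (st != INFINI_STATUS_SUCCESS) {
        return st;
    }
    size_t need = 0;
    st = infiniopGetTanhWorkspaceSize(desc, &need);
    if (st == INFINI_STATUS_SUCCESS) {
        // the descriptor re-checks the size internally and fails with
        // INFINI_STATUS_INSUFFICIENT_WORKSPACE if the buffer is too small
        st = (need <= workspace_cap)
                 ? infiniopTanh(desc, workspace, need, y, x, stream)
                 : INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    infiniopDestroyTanhDescriptor(desc);
    return st;
}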
diff --git a/src/infiniop/ops/where/cpu/where_cpu.cc b/src/infiniop/ops/where/cpu/where_cpu.cc
new file mode 100644
index 000000000..789397c5a
--- /dev/null
+++ b/src/infiniop/ops/where/cpu/where_cpu.cc
@@ -0,0 +1,83 @@
+#include "where_cpu.h"
+
+namespace op::where::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &condition_desc = input_desc_vec.at(0);
+    const auto &a_desc = input_desc_vec.at(1);
+    const auto &b_desc = input_desc_vec.at(2);
+    const auto &out_shape = out_desc->shape();
+    const auto &condition_shape = condition_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    // a, b and the output must share one dtype, drawn from the full set of supported types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16,
+                INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64,
+                INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64,
+                INFINI_DTYPE_BOOL);
+
+    if (a_desc->dtype() != dtype || b_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_SAME_SHAPE(out_shape, condition_shape);
+    CHECK_SAME_SHAPE(out_shape, a_shape);
+    CHECK_SAME_SHAPE(out_shape, b_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<WhereOp, fp16_t, fp16_t, fp16_t, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<WhereOp, float, float, float, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<WhereOp, double, double, double, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<WhereOp, bf16_t, bf16_t, bf16_t, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I8:
+        return _device_info->calculate<WhereOp, int8_t, int8_t, int8_t, int8_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I16:
+        return _device_info->calculate<WhereOp, int16_t, int16_t, int16_t, int16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<WhereOp, int32_t, int32_t, int32_t, int32_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<WhereOp, int64_t, int64_t, int64_t, int64_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_U8:
+        return _device_info->calculate<WhereOp, uint8_t, uint8_t, uint8_t, uint8_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_U16:
+        return _device_info->calculate<WhereOp, uint16_t, uint16_t, uint16_t, uint16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_U32:
+        return _device_info->calculate<WhereOp, uint32_t, uint32_t, uint32_t, uint32_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_U64:
+        return _device_info->calculate<WhereOp, uint64_t, uint64_t, uint64_t, uint64_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BOOL:
+        return _device_info->calculate<WhereOp, bool, bool, bool, bool>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::where::cpu
\ No newline at end of file
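One subtlety in the CPU path that follows: fp16_t and bf16_t are raw bit-pattern structs, so a condition of those types is first reduced to bool by the helpers added in custom_types.cc. A value counts as false only when its exponent and mantissa bits are all zero, which makes both +0.0 and -0.0 false while NaNs and subnormals are truthy. A standalone C++ illustration of that bit test (f16_nonzero is a hypothetical name; the real helpers read the bits from the struct's _v field):

#include <cassert>
#include <cstdint>

// Mirrors _f16_to_bool: drop the sign bit, then test exponent | mantissa.
static bool f16_nonzero(uint16_t bits) {
    return (bits & 0x7FFF) != 0; // 0x7C00 (exponent) | 0x03FF (mantissa)
}

int main() {
    assert(!f16_nonzero(0x0000)); // +0.0 -> false
    assert(!f16_nonzero(0x8000)); // -0.0 -> false
    assert(f16_nonzero(0x3C00));  // 1.0 -> true
    assert(f16_nonzero(0x0001));  // smallest subnormal -> true
    assert(f16_nonzero(0x7E00));  // NaN -> true
    return 0;
}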
diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h
new file mode 100644
index 000000000..c294751a4
--- /dev/null
+++ b/src/infiniop/ops/where/cpu/where_cpu.h
@@ -0,0 +1,31 @@
+#ifndef __WHERE_CPU_H__
+#define __WHERE_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "infiniop/ops/where.h"
+
+ELEMENTWISE_DESCRIPTOR(where, cpu)
+
+namespace op::where::cpu {
+
+typedef struct WhereOp {
+public:
+    static constexpr size_t num_inputs = 3;
+
+    template <typename T, typename Tcond, typename Ta, typename Tb>
+    T operator()(const Tcond &cond, const Ta &a, const Tb &b) const {
+        if constexpr (std::is_same_v<Tcond, fp16_t>) {
+            // fp16 condition: any nonzero exponent/mantissa bits count as true
+            return static_cast<T>(_f16_to_bool(cond) ? a : b);
+        } else if constexpr (std::is_same_v<Tcond, bf16_t>) {
+            // bf16 condition: the same zero test on the bf16 bit pattern
+            return static_cast<T>(_bf16_to_bool(cond) ? a : b);
+        } else {
+            return static_cast<T>(static_cast<bool>(cond) ? a : b);
+        }
+    }
+} WhereOp;
+
+} // namespace op::where::cpu
+
+#endif // __WHERE_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/where/cuda/kernel.cuh b/src/infiniop/ops/where/cuda/kernel.cuh
new file mode 100644
index 000000000..d9744dad2
--- /dev/null
+++ b/src/infiniop/ops/where/cuda/kernel.cuh
@@ -0,0 +1,20 @@
+#ifndef __WHERE_CUDA_H__
+#define __WHERE_CUDA_H__
+
+namespace op::where::cuda {
+
+typedef struct WhereOp {
+public:
+    static constexpr size_t num_inputs = 3;
+
+    template <typename T, typename Tcond, typename Ta, typename Tb>
+    __device__ __forceinline__ T operator()(const Tcond &condition, const Ta &a, const Tb &b) const {
+        // half, cuda_bfloat16 and the scalar types all support an explicit
+        // conversion to bool, so a single cast covers every condition type
+        return static_cast<T>(static_cast<bool>(condition) ? a : b);
+    }
+} WhereOp;
+
+} // namespace op::where::cuda
+
+#endif // __WHERE_CUDA_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/where/metax/where_metax.h b/src/infiniop/ops/where/metax/where_metax.h
new file mode 100644
index 000000000..43bb1a945
--- /dev/null
+++ b/src/infiniop/ops/where/metax/where_metax.h
@@ -0,0 +1,8 @@
+#ifndef __WHERE_METAX_API_H__
+#define __WHERE_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(where, metax)
+
+#endif // __WHERE_METAX_API_H__
diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca
new file mode 100644
index 000000000..b97f1fdc5
--- /dev/null
+++ b/src/infiniop/ops/where/metax/where_metax.maca
@@ -0,0 +1,90 @@
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+#include "where_metax.h"
+
+namespace op::where::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &condition_desc = input_desc_vec.at(0);
+    const auto &a_desc = input_desc_vec.at(1);
+    const auto &b_desc = input_desc_vec.at(2);
+    const auto &out_shape = out_desc->shape();
+    const auto &condition_shape = condition_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    // a, b and the output must share one dtype, drawn from the full set of supported types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16,
+                INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64,
+                INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64,
+                INFINI_DTYPE_BOOL);
+
+    if (a_desc->dtype() != dtype || b_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_SAME_SHAPE(out_shape, condition_shape);
+    CHECK_SAME_SHAPE(out_shape, a_shape);
+    CHECK_SAME_SHAPE(out_shape, b_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::WhereOp, half, half, half, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16, cuda_bfloat16, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::WhereOp, float, float, float, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::WhereOp, double, double, double, double>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I8:
+        return _device_info->calculate<256, cuda::WhereOp, int8_t, int8_t, int8_t, int8_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I16:
+        return _device_info->calculate<256, cuda::WhereOp, int16_t, int16_t, int16_t, int16_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, cuda::WhereOp, int32_t, int32_t, int32_t, int32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, cuda::WhereOp, int64_t, int64_t, int64_t, int64_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U8:
+        return _device_info->calculate<256, cuda::WhereOp, uint8_t, uint8_t, uint8_t, uint8_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U16:
+        return _device_info->calculate<256, cuda::WhereOp, uint16_t, uint16_t, uint16_t, uint16_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U32:
+        return _device_info->calculate<256, cuda::WhereOp, uint32_t, uint32_t, uint32_t, uint32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U64:
+        return _device_info->calculate<256, cuda::WhereOp, uint64_t, uint64_t, uint64_t, uint64_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BOOL:
+        return _device_info->calculate<256, cuda::WhereOp, bool, bool, bool, bool>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::where::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cu b/src/infiniop/ops/where/nvidia/where_nvidia.cu
new file mode 100644
index 000000000..c992663dc
--- /dev/null
+++ b/src/infiniop/ops/where/nvidia/where_nvidia.cu
@@ -0,0 +1,90 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "where_nvidia.cuh"
+
+namespace op::where::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &condition_desc = input_desc_vec.at(0);
+    const auto &a_desc = input_desc_vec.at(1);
+    const auto &b_desc = input_desc_vec.at(2);
+    const auto &out_shape = out_desc->shape();
+    const auto &condition_shape = condition_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    // a, b and the output must share one dtype, drawn from the full set of supported types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16,
+                INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64,
+                INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64,
+                INFINI_DTYPE_BOOL);
+
+    if (a_desc->dtype() != dtype || b_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_SAME_SHAPE(out_shape, condition_shape);
+    CHECK_SAME_SHAPE(out_shape, a_shape);
+    CHECK_SAME_SHAPE(out_shape, b_shape);
+
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::WhereOp, half, half, half, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16, cuda_bfloat16, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::WhereOp, float, float, float, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::WhereOp, double, double, double, double>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I8:
+        return _device_info->calculate<256, cuda::WhereOp, int8_t, int8_t, int8_t, int8_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I16:
+        return _device_info->calculate<256, cuda::WhereOp, int16_t, int16_t, int16_t, int16_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, cuda::WhereOp, int32_t, int32_t, int32_t, int32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, cuda::WhereOp, int64_t, int64_t, int64_t, int64_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U8:
+        return _device_info->calculate<256, cuda::WhereOp, uint8_t, uint8_t, uint8_t, uint8_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U16:
+        return _device_info->calculate<256, cuda::WhereOp, uint16_t, uint16_t, uint16_t, uint16_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U32:
+        return _device_info->calculate<256, cuda::WhereOp, uint32_t, uint32_t, uint32_t, uint32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U64:
+        return _device_info->calculate<256, cuda::WhereOp, uint64_t, uint64_t, uint64_t, uint64_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BOOL:
+        return _device_info->calculate<256, cuda::WhereOp, bool, bool, bool, bool>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::where::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cuh b/src/infiniop/ops/where/nvidia/where_nvidia.cuh
new file mode 100644
index 000000000..c168364a8
--- /dev/null
+++ b/src/infiniop/ops/where/nvidia/where_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __WHERE_CUDA_API_H__
+#define __WHERE_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(where, nvidia)
+
+#endif // __WHERE_CUDA_API_H__
diff --git a/src/infiniop/ops/where/operator.cc b/src/infiniop/ops/where/operator.cc
new file mode 100644
index 000000000..2a8a66923
--- /dev/null
+++ b/src/infiniop/ops/where/operator.cc
@@ -0,0 +1,147 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/where.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/where_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/where_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/where_metax.h"
+#endif
+
+__C infiniStatus_t infiniopCreateWhereDescriptor(
+    infiniopHandle_t handle,
+    infiniopWhereDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t c,
+    infiniopTensorDescriptor_t condition,
+    infiniopTensorDescriptor_t a,
+    infiniopTensorDescriptor_t b) {
+
+#define CREATE(CASE, NAMESPACE)                                               \
+    case CASE:                                                                \
+        return op::where::NAMESPACE::Descriptor::create(                      \
+            handle,                                                           \
+            reinterpret_cast<op::where::NAMESPACE::Descriptor **>(desc_ptr),  \
+            c,                                                                \
+            {condition, a, b})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                   \
+    case CASE:                                                                                 \
+        *size = reinterpret_cast<op::where::NAMESPACE::Descriptor *>(desc)->workspaceSize();   \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopWhere(
+    infiniopWhereDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *condition,
+    const void *a,
+    const void *b,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                    \
+        return reinterpret_cast<const op::where::NAMESPACE::Descriptor *>(desc)   \
+            ->calculate(workspace, workspace_size, c, {condition, a, b}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t
+infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                              \
+    case CASE:                                                               \
+        delete reinterpret_cast<op::where::NAMESPACE::Descriptor *>(desc);   \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
\ No newline at end of file
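For readers new to these dispatch macros: after preprocessing, each branch of the switch is an ordinary case label. The CPU branch of infiniopCreateWhereDescriptor, for example, expands to roughly the following, and the GET, CALCULATE and DELETE macros expand the same way:

case INFINI_DEVICE_CPU:
    return op::where::cpu::Descriptor::create(
        handle,
        reinterpret_cast<op::where::cpu::Descriptor **>(desc_ptr),
        c,
        {condition, a, b});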
diff --git a/src/infinirt/infinirt_impl.h b/src/infinirt/infinirt_impl.h
index 0d6f8cf05..4c41a1198 100644
--- a/src/infinirt/infinirt_impl.h
+++ b/src/infinirt/infinirt_impl.h
@@ -30,7 +30,6 @@ infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) IMPL;
 #define INFINIRT_DEVICE_API_IMPL INFINIRT_DEVICE_API(, )
-#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, \
-                                                     {*count = 0; return INFINI_STATUS_SUCCESS; })
+#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, {*count = 0; return INFINI_STATUS_SUCCESS; })
 
 #endif // __INFINIRT_IMPL_H__
diff --git a/src/utils.h b/src/utils.h
index f4e63be25..e721f05a6 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -108,4 +108,4 @@ inline size_t align(size_t size, size_t alignment) {
 
 } // namespace utils
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/utils/custom_types.cc b/src/utils/custom_types.cc
index 1a6da3c70..a2c1ec538 100644
--- a/src/utils/custom_types.cc
+++ b/src/utils/custom_types.cc
@@ -2,6 +2,14 @@
 #include
 #include
 
+bool _f16_to_bool(fp16_t val) {
+    uint16_t h = val._v;
+    const uint16_t exponent_mask = 0x7C00; // exponent mask (5 bits)
+    const uint16_t mantissa_mask = 0x03FF; // mantissa mask (10 bits)
+    // a value is floating-point zero (of either sign) iff exponent and mantissa are all zero
+    return (h & (exponent_mask | mantissa_mask)) != 0;
+}
+
 float _f16_to_f32(fp16_t val) {
     uint16_t h = val._v;
     uint32_t sign = (h & 0x8000) << 16;
@@ -62,6 +70,13 @@ fp16_t _f32_to_f16(float val) {
     }
 }
 
+bool _bf16_to_bool(bf16_t val) {
+    // extract the exponent and mantissa, ignoring the sign bit
+    const uint16_t exponent_and_mantissa = val._v & 0x7FFF;
+    // all-zero exponent and mantissa means floating-point zero
+    return exponent_and_mantissa != 0;
+}
+
 float _bf16_to_f32(bf16_t val) {
     // place the bf16 bits in the upper 16 bits of the float32 and zero the remaining 16 bits
     uint32_t bits32 = static_cast<uint32_t>(val._v) << 16;
diff --git a/src/utils/custom_types.h b/src/utils/custom_types.h
index 05a5c2fca..1dfe8ebc2 100644
--- a/src/utils/custom_types.h
+++ b/src/utils/custom_types.h
@@ -13,9 +13,11 @@ struct CustomBFloat16 {
 };
 typedef struct CustomBFloat16 bf16_t;
 
+bool _f16_to_bool(fp16_t val);
 float _f16_to_f32(fp16_t val);
 fp16_t _f32_to_f16(float val);
 
+bool _bf16_to_bool(bf16_t val);
 float _bf16_to_f32(bf16_t val);
 bf16_t _f32_to_bf16(float val);
 
@@ -25,6 +27,14 @@ template <typename TypeTo, typename TypeFrom>
 TypeTo cast(TypeFrom val) {
     if constexpr (std::is_same<TypeTo, TypeFrom>::value) {
         return val;
+    } else if constexpr (std::is_same<TypeFrom, bf16_t>::value && std::is_same<TypeTo, fp16_t>::value) {
+        return _f32_to_f16(_bf16_to_f32(val));
+    } else if constexpr (std::is_same<TypeFrom, fp16_t>::value && std::is_same<TypeTo, bf16_t>::value) {
+        return _f32_to_bf16(_f16_to_f32(val));
+    } else if constexpr (std::is_same<TypeFrom, bf16_t>::value && std::is_same<TypeTo, bool>::value) {
+        return static_cast<TypeTo>(_bf16_to_bool(val));
+    } else if constexpr (std::is_same<TypeFrom, fp16_t>::value && std::is_same<TypeTo, bool>::value) {
+        return static_cast<TypeTo>(_f16_to_bool(val));
     } else if constexpr (std::is_same<TypeTo, fp16_t>::value && std::is_same<TypeFrom, float>::value) {
         return _f32_to_f16(val);
     } else if constexpr (std::is_same<TypeTo, fp16_t>::value && !std::is_same<TypeFrom, float>::value) {
diff --git a/test/infiniop-test/test_generate/__init__.py b/test/infiniop-test/test_generate/__init__.py
index a61f63f7c..8db1e6755 100644
--- a/test/infiniop-test/test_generate/__init__.py
+++ b/test/infiniop-test/test_generate/__init__.py
@@ -1 +1,8 @@
-from .infiniop_test import 
InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .infiniop_test import ( + InfiniopTestCase, + InfiniopTestWriter, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) diff --git a/test/infiniop-test/test_generate/testcases/add.py b/test/infiniop-test/test_generate/testcases/add.py index b04ba2042..052ef18a7 100644 --- a/test/infiniop-test/test_generate/testcases/add.py +++ b/test/infiniop-test/test_generate/testcases/add.py @@ -4,7 +4,14 @@ from typing import List from numpy.lib.stride_tricks import as_strided -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def add( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("add") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_c is not None: test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) @@ -114,7 +129,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_c=stride_c, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() - \ No newline at end of file diff --git a/test/infiniop-test/test_generate/testcases/causal_softmax.py b/test/infiniop-test/test_generate/testcases/causal_softmax.py index 74c3efcf0..037701865 100644 --- a/test/infiniop-test/test_generate/testcases/causal_softmax.py +++ b/test/infiniop-test/test_generate/testcases/causal_softmax.py @@ -4,7 +4,13 @@ from typing import List from enum import Enum, auto -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def causal_softmax(x): @@ -37,8 +43,8 @@ def __init__( super().__init__("causal_softmax") self.x = x self.y = y - self.shape_x=shape_x - self.shape_y=shape_y + self.shape_x = shape_x + self.shape_y = shape_y self.stride_x = stride_x self.stride_y = stride_y @@ -49,10 +55,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_y is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("x"), @@ -102,6 +116,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_y, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/clip.py b/test/infiniop-test/test_generate/testcases/clip.py index f08a59929..786153197 100644 --- a/test/infiniop-test/test_generate/testcases/clip.py +++ b/test/infiniop-test/test_generate/testcases/clip.py @@ -2,7 +2,13 @@ import gguf from typing import List, Optional, Tuple -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def clip( @@ -35,7 +41,7 @@ def random_tensor(shape, dtype): Returns: Random tensor with the specified shape and dtype """ - return (np.random.rand(*shape).astype(dtype) * 4.0 - 2.0) + return np.random.rand(*shape).astype(dtype) * 4.0 - 2.0 class ClipTestCase(InfiniopTestCase): @@ -52,7 +58,7 @@ def __init__( max_val: np.ndarray, max_stride: Optional[List[int]], y: np.ndarray, - y_shape: Optional[List[int]], + y_shape: Optional[List[int]], y_stride: Optional[List[int]], ): super().__init__("clip") @@ -63,7 +69,7 @@ def __init__( self.max_val = max_val self.max_stride = max_stride self.y = y - self.y_shape=y_shape + self.y_shape = y_shape self.y_stride = y_stride def write_test(self, test_writer: "InfiniopTestWriter"): @@ -71,57 +77,64 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Add strides as arrays if they exist if self.x_stride is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride) + ) if self.min_stride is not None: - test_writer.add_array(test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride)) + test_writer.add_array( + test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride) + ) if self.max_stride is not None: - test_writer.add_array(test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride)) + test_writer.add_array( + test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride) + ) if self.y_shape is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.y_shape) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.y_stride 
if self.y_stride is not None else contiguous_gguf_strides(self.y_shape)) + gguf_strides( + *( + self.y_stride + if self.y_stride is not None + else contiguous_gguf_strides(self.y_shape) + ) + ), ) # Add tensors to the test test_writer.add_tensor( - test_writer.gguf_key("x"), - self.x, - raw_dtype=np_dtype_to_ggml(self.x.dtype) + test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) ) test_writer.add_tensor( test_writer.gguf_key("min_val"), self.min_val, - raw_dtype=np_dtype_to_ggml(self.min_val.dtype) + raw_dtype=np_dtype_to_ggml(self.min_val.dtype), ) test_writer.add_tensor( test_writer.gguf_key("max_val"), self.max_val, - raw_dtype=np_dtype_to_ggml(self.max_val.dtype) + raw_dtype=np_dtype_to_ggml(self.max_val.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("y"), - self.y, - raw_dtype=np_dtype_to_ggml(self.y.dtype) + test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype) ) # Calculate the expected result ans = clip( self.x.astype(np.float64), self.min_val.astype(np.float64), - self.max_val.astype(np.float64) + self.max_val.astype(np.float64), ) # Add the expected result to the test test_writer.add_tensor( - test_writer.gguf_key("ans"), - ans, - raw_dtype=gguf.GGMLQuantizationType.F64 + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 ) + if __name__ == "__main__": test_writer = InfiniopTestWriter("clip.gguf") @@ -130,23 +143,23 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Test case shapes shapes = [ - (10,), # 1D tensor - (5, 10), # 2D tensor - (2, 3, 4), # 3D tensor - (7, 13), # Prime dimensions - (1, 1), # Minimum shape - (100, 100), # Large shape - (16, 16, 16), # Large 3D + (10,), # 1D tensor + (5, 10), # 2D tensor + (2, 3, 4), # 3D tensor + (7, 13), # Prime dimensions + (1, 1), # Minimum shape + (100, 100), # Large shape + (16, 16, 16), # Large 3D ] # Test case min/max values min_max_values = [ - (-1.0, 1.0), # Standard range - (0.0, 2.0), # Positive range - (-2.0, 0.0), # Negative range - (-1000.0, 1000.0), # Large range - (-0.001, 0.001), # Small range - (0.0, 0.0), # min=max + (-1.0, 1.0), # Standard range + (0.0, 2.0), # Positive range + (-2.0, 0.0), # Negative range + (-1000.0, 1000.0), # Large range + (-0.001, 0.001), # Small range + (0.0, 0.0), # min=max ] # Data types to test @@ -171,7 +184,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=None, y=y, y_shape=shape, - y_stride=None + y_stride=None, ) ) @@ -199,7 +212,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=row_stride + y_stride=row_stride, ) ) @@ -219,7 +232,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=col_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) @@ -239,7 +252,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) diff --git a/test/infiniop-test/test_generate/testcases/mul.py b/test/infiniop-test/test_generate/testcases/mul.py index 00c427bcb..ad4f6b806 100644 --- a/test/infiniop-test/test_generate/testcases/mul.py +++ b/test/infiniop-test/test_generate/testcases/mul.py @@ -2,30 +2,36 @@ import gguf from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. 
import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
 
-def mul(
-    a: np.ndarray,
-    b: np.ndarray
-):
+
+def mul(a: np.ndarray, b: np.ndarray):
     return np.multiply(a, b)
 
+
 def random_tensor(shape, dtype):
     rate = 1e-3
     var = 0.5 * rate  # values fall in [-5e-4, 5e-4]
     return rate * np.random.rand(*shape).astype(dtype) - var
 
+
 class MulTestCase(InfiniopTestCase):
     def __init__(
         self,
         a: np.ndarray,
-        shape_a: List[int] | None, 
+        shape_a: List[int] | None,
         stride_a: List[int] | None,
         b: np.ndarray,
-        shape_b: List[int] | None, 
+        shape_b: List[int] | None,
         stride_b: List[int] | None,
         c: np.ndarray,
-        shape_c: List[int] | None, 
+        shape_c: List[int] | None,
         stride_c: List[int] | None,
     ):
         super().__init__("mul")
@@ -39,7 +45,6 @@ def __init__(
         self.shape_c = shape_c
         self.stride_c = stride_c
 
-
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
         if self.shape_a is not None:
@@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         if self.shape_c is not None:
             test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
         if self.stride_a is not None:
-            test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
+            test_writer.add_array(
+                test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)
+            )
         if self.stride_b is not None:
-            test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
+            test_writer.add_array(
+                test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)
+            )
         test_writer.add_array(
             test_writer.gguf_key("c.strides"),
-            gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
+            gguf_strides(
+                *(
+                    self.stride_c
+                    if self.stride_c is not None
+                    else contiguous_gguf_strides(self.shape_c)
+                )
+            ),
         )
 
         test_writer.add_tensor(
@@ -68,7 +83,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         )
         a_fp64 = self.a.astype(np.float64)
         b_fp64 = self.b.astype(np.float64)
-        
+
         ans_fp64 = np.multiply(a_fp64, b_fp64)
         ans = mul(self.a, self.b)
         test_writer.add_tensor(
@@ -80,7 +95,8 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             raw_dtype=np_dtype_to_ggml(ans_fp64.dtype),
         )
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     test_writer = InfiniopTestWriter("mul.gguf")
     test_cases = []
 
@@ -96,16 +112,15 @@ if __name__ == "__main__":
         ((2048, 2560), (2560, 1), (1, 2048), (2560, 1)),
         ((4, 48, 64), (64 * 48, 64, 1), (1, 4, 192), None),
         ((4, 48, 64), None, (1, 4, 192), (48 * 64, 64, 1)),
-    ] 
+    ]
 
     _TENSOR_DTYPES_ = [np.float32, np.float16]
-    
+
     for dtype in _TENSOR_DTYPES_:
         for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
             a = random_tensor(shape, dtype)
            b = random_tensor(shape, dtype)
             c = np.empty(tuple(0 for _ in shape), dtype=dtype)
-
             test_cases.append(
                 MulTestCase(
                     a=a,
@@ -118,7 +133,7 @@ if __name__ == "__main__":
                     stride_c=stride_c,
                 )
-            ) 
-        
+            )
+
     test_writer.add_tests(test_cases)
     test_writer.save()
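The rearrange generator that follows derives its test strides from two small Python helpers. For cross-checking, the same computations in C++ (hypothetical helper names mirroring the Python ones; InfiniOp itself does not ship these):

#include <cstddef>
#include <vector>

// Row-major: the innermost dimension has stride 1.
std::vector<std::ptrdiff_t> row_major_strides(const std::vector<size_t> &shape) {
    std::vector<std::ptrdiff_t> strides(shape.size());
    std::ptrdiff_t stride = 1;
    for (size_t i = shape.size(); i-- > 0;) {
        strides[i] = stride;
        stride *= static_cast<std::ptrdiff_t>(shape[i]);
    }
    return strides;
}

// Column-major: the outermost dimension has stride 1.
std::vector<std::ptrdiff_t> column_major_strides(const std::vector<size_t> &shape) {
    std::vector<std::ptrdiff_t> strides(shape.size());
    std::ptrdiff_t stride = 1;
    for (size_t i = 0; i < shape.size(); ++i) {
        strides[i] = stride;
        stride *= static_cast<std::ptrdiff_t>(shape[i]);
    }
    return strides;
}

For shape (4, 6, 64) these yield (384, 64, 1) and (1, 4, 24) respectively, matching the Python helpers below.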
diff --git a/test/infiniop-test/test_generate/testcases/rearrange.py b/test/infiniop-test/test_generate/testcases/rearrange.py
index 9617a1fc0..3d3a0e73b 100644
--- a/test/infiniop-test/test_generate/testcases/rearrange.py
+++ b/test/infiniop-test/test_generate/testcases/rearrange.py
@@ -1,14 +1,21 @@
 import torch
 from typing import List
-from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+from .. import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
+
 
 def row_major_strides(shape):
     """Generate row-major strides for a tensor
-    
+
     Args:
         shape: tensor shape
-    
+
     Returns:
         a list of row-major strides
     """
@@ -19,12 +26,13 @@ def row_major_strides(shape):
         strides.insert(0, stride)
     return strides
 
+
 def column_major_strides(shape):
     """Generate column-major strides for a tensor
-    
+
     Args:
         shape: tensor shape
-    
+
     Returns:
         a list of column-major strides
     """
@@ -35,6 +43,7 @@ def column_major_strides(shape):
         strides.append(stride)
     return strides
 
+
 def rearrange_using_torch(src: torch.Tensor, dst_strides: List[int]) -> torch.Tensor:
     """
     Compute the rearrange result using torch
@@ -66,27 +75,35 @@ def __init__(
         self.shape = shape
         self.src_strides = src_strides
         self.dst_strides = dst_strides
-    
+
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
-        
+
         # write the shape info
         if self.shape is not None:
             test_writer.add_array(test_writer.gguf_key("src.shape"), self.shape)
             test_writer.add_array(test_writer.gguf_key("dst.shape"), self.shape)
-        
+
         # write the stride info
         if self.src_strides is not None:
-            test_writer.add_array(test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides))
+            test_writer.add_array(
+                test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides)
+            )
         test_writer.add_array(
             test_writer.gguf_key("dst.strides"),
-            gguf_strides(*self.dst_strides if self.dst_strides is not None else contiguous_gguf_strides(self.shape))
+            gguf_strides(
+                *(
+                    self.dst_strides
+                    if self.dst_strides is not None
+                    else contiguous_gguf_strides(self.shape)
+                )
+            ),
         )
-        
+
         # convert the torch tensors to numpy for writing to file
         src_numpy = self.src.detach().cpu().numpy()
         dst_numpy = self.dst.detach().cpu().numpy()
-        
+
         # write the tensor data
         test_writer.add_tensor(
             test_writer.gguf_key("src"),
@@ -98,9 +115,13 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             dst_numpy,
             raw_dtype=np_dtype_to_ggml(dst_numpy.dtype),
         )
-        
+
         # compute and write the reference answer
-        dst_strides_for_ans = self.dst_strides if self.dst_strides is not None else list(contiguous_gguf_strides(self.shape))
+        dst_strides_for_ans = (
+            self.dst_strides
+            if self.dst_strides is not None
+            else list(contiguous_gguf_strides(self.shape))
+        )
         ans_torch = rearrange_using_torch(self.src, dst_strides_for_ans)
         ans_numpy = ans_torch.detach().cpu().numpy()
         test_writer.add_tensor(
@@ -109,6 +130,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         raw_dtype=np_dtype_to_ggml(src_numpy.dtype),
     )
 
+
 if __name__ == "__main__":
     test_writer = InfiniopTestWriter("rearrange.gguf")
     test_cases = []
@@ -117,12 +139,20 @@ if __name__ == "__main__":
         # (shape, src_stride, dst_stride)
         ((100, 100), (1, 100), (100, 1)),
         ((4, 4), (1, 4), (4, 1)),
-        ((4, 6, 64), (64, 4*64, 1), (6*64, 64, 1)),
+        ((4, 6, 64), (64, 4 * 64, 1), (6 * 64, 64, 1)),
         ((2000, 2000), (1, 2000), (2000, 1)),
         ((2001, 2001), (1, 2001), (2001, 1)),
         ((2, 2, 2, 4), (16, 8, 4, 1), (16, 8, 1, 2)),
-        ((3, 4, 7, 53, 9), row_major_strides((3, 4, 7, 53, 9)), column_major_strides((3, 4, 7, 53, 9))),
-        ((3, 4, 50, 50, 5, 7), row_major_strides((3, 4, 50, 50, 5, 7)), column_major_strides((3, 4, 50, 50, 5, 7))),
+        (
+            (3, 4, 7, 53, 9),
+            row_major_strides((3, 4, 7, 53, 9)),
+            column_major_strides((3, 4, 7, 53, 9)),
+        ),
+        (
+            (3, 4, 50, 50, 5, 7),
+            row_major_strides((3, 4, 50, 50, 5, 7)),
+            column_major_strides((3, 4, 50, 50, 5, 7)),
+        ),
     ]
 
     _TENSOR_DTYPES_ = [torch.float32, torch.float16]
@@ -132,7 +162,7 @@ if __name__ == "__main__":
         src = torch.rand(*shape, dtype=dtype)
         # allocate the destination tensor with the correct shape
         dst = torch.empty(shape, dtype=dtype)
-        
+
         test_case = RearrangeTestCase(
             src=src,
             dst=dst,
@@ -140,7 +170,7 @@ if __name__ == "__main__":
             src_strides=src_strides,
             dst_strides=dst_strides,
         )
-        test_cases.append(test_case) 
+        test_cases.append(test_case)
 
     test_writer.add_tests(test_cases)
-    test_writer.save() 
+    test_writer.save()
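The rms_norm generator below checks the library against the textbook definition. With x a row of the input along the last axis of size d, w the weight vector and epsilon the stabilizer, each output element is, in LaTeX:

y_i = \frac{x_i}{\sqrt{\tfrac{1}{d}\sum_{j=1}^{d} x_j^{2} + \epsilon}} \, w_i

which is exactly what the numpy reference in the next file computes in float64 before comparison.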
diff --git a/test/infiniop-test/test_generate/testcases/rms_norm.py b/test/infiniop-test/test_generate/testcases/rms_norm.py
index 681ebafc4..9332c090a 100644
--- a/test/infiniop-test/test_generate/testcases/rms_norm.py
+++ b/test/infiniop-test/test_generate/testcases/rms_norm.py
@@ -1,11 +1,19 @@
 import numpy as np
 from typing import List
-from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+from .. import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
+
 
 def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
     return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001
 
+
 def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray:
     """
     Compute the rms_norm result with numpy
@@ -16,13 +24,14 @@ def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray:
     Returns:
         the output tensor, with the same shape as the input
     """
-    squared = x ** 2
+    squared = x**2
     mean = np.mean(squared, axis=-1, keepdims=True)
     rms = np.sqrt(mean + epsilon)
-    
+
     normalized = x / rms
     return normalized * w
 
+
 class RMSNormTestCase(InfiniopTestCase):
     def __init__(
         self,
@@ -40,9 +49,9 @@ def __init__(
         self.y = y
         self.shape = shape
         self.epsilon = epsilon
-        self.x_strides=x_strides
-        self.y_strides=y_strides
-        
+        self.x_strides = x_strides
+        self.y_strides = y_strides
+
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
         test_writer.add_float32(test_writer.gguf_key("epsilon"), self.epsilon)
@@ -50,10 +59,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape)
         test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape)
         if self.x_strides is not None:
-            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides))
+            test_writer.add_array(
+                test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)
+            )
         test_writer.add_array(
             test_writer.gguf_key("y.strides"),
-            gguf_strides(*self.y_strides if self.y_strides is not None else contiguous_gguf_strides(self.shape))
+            gguf_strides(
+                *(
+                    self.y_strides
+                    if self.y_strides is not None
+                    else contiguous_gguf_strides(self.shape)
+                )
+            ),
         )
         test_writer.add_tensor(
             test_writer.gguf_key("x"),
@@ -70,13 +87,16 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             self.y,
             raw_dtype=np_dtype_to_ggml(self.y.dtype),
         )
-        ans = rms_norm(self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon)
+        ans = rms_norm(
+            self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon
+        )
         test_writer.add_tensor(
             test_writer.gguf_key("ans"),
             ans,
             raw_dtype=np_dtype_to_ggml(np.float64),
         )
 
+
 if __name__ == "__main__":
     test_writer = InfiniopTestWriter("rms_norm.gguf")
     test_cases = []
@@ -112,9 +132,9 @@ if __name__ == "__main__":
                 shape=shape,
                 x_strides=x_strides,
                 y_strides=y_strides,
-                epsilon=epsilon
+                epsilon=epsilon,
             )
-            test_cases.append(test_case) 
+            test_cases.append(test_case)
 
     test_writer.add_tests(test_cases)
     test_writer.save()
diff --git a/test/infiniop-test/test_generate/testcases/rope.py b/test/infiniop-test/test_generate/testcases/rope.py
index 85d9685dd..27f5a06db 100644
--- 
a/test/infiniop-test/test_generate/testcases/rope.py +++ b/test/infiniop-test/test_generate/testcases/rope.py @@ -4,11 +4,17 @@ from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def rotary_embedding(t, sin, cos): - dh = t.shape[2] + dh = t.shape[2] assert dh % 2 == 0, "Embedding dimension must be even." t_even = t[..., 0::2] # [seq_len, n_head, dh // 2] @@ -30,7 +36,9 @@ def rotary_embedding(t, sin, cos): def sin_cos_table(pos, dim, theta, dtype): assert dim % 2 == 0, "Embedding dimension must be even." - freqs = 1.0 / (theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim)) + freqs = 1.0 / ( + theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim) + ) angles = np.outer(pos, freqs) @@ -79,19 +87,33 @@ def write_test(self, test_writer: "InfiniopTestWriter"): test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_tensor( - test_writer.gguf_key("pos_ids"), self.pos_ids, raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype) + test_writer.gguf_key("pos_ids"), + self.pos_ids, + raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("sin_table"), self.sin_table, raw_dtype=np_dtype_to_ggml(self.sin_table.dtype) + test_writer.gguf_key("sin_table"), + self.sin_table, + raw_dtype=np_dtype_to_ggml(self.sin_table.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("cos_table"), self.cos_table, raw_dtype=np_dtype_to_ggml(self.cos_table.dtype) + test_writer.gguf_key("cos_table"), + self.cos_table, + raw_dtype=np_dtype_to_ggml(self.cos_table.dtype), ) ans = rotary_embedding( self.x.astype(np.float64), @@ -103,8 +125,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): ) - - if __name__ == "__main__": # ============================================================================== # Configuration (Internal Use Only) @@ -130,7 +150,9 @@ def write_test(self, test_writer: "InfiniopTestWriter"): x = np.random.rand(*shape).astype(dtype) y = np.empty(tuple(0 for _ in shape), dtype=dtype) pos_ids = np.arange(0, x.shape[0], dtype=np.int32) - sin_table, cos_table = sin_cos_table(pos_ids, x.shape[2], theta=1e5, dtype=dtype) + sin_table, cos_table = sin_cos_table( + pos_ids, x.shape[2], theta=1e5, dtype=dtype + ) test_case = RoPETestCase( y=y, x=x, diff --git a/test/infiniop-test/test_generate/testcases/swiglu.py b/test/infiniop-test/test_generate/testcases/swiglu.py index cb692b613..aa3450fed 100644 --- a/test/infiniop-test/test_generate/testcases/swiglu.py +++ b/test/infiniop-test/test_generate/testcases/swiglu.py @@ -2,7 +2,14 @@ import gguf from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def swiglu( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("swiglu") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -47,14 +52,24 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_b is not None: test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b) if self.shape_c is not None: - test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) + test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) diff --git a/test/infiniop/cast.py b/test/infiniop/cast.py new file mode 100644 index 000000000..3065021de --- /dev/null +++ b/test/infiniop/cast.py @@ -0,0 +1,242 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_SHAPES_ = [ + (13, 4), + (13, 4, 4), + (16, 5632), + (4, 4, 5632), + (1024,), + (32, 32), +] + +_TEST_STRIDES_ = [ + None, # Contiguous + # Add some non-contiguous strides for specific shapes +] + +# Define type conversion test matrix +_TYPE_CONVERSIONS_ = [ + # Integer to integer conversions + (InfiniDtype.I32, InfiniDtype.I64), + (InfiniDtype.I64, InfiniDtype.I32), + (InfiniDtype.U32, InfiniDtype.U64), + (InfiniDtype.U64, InfiniDtype.U32), + (InfiniDtype.I32, InfiniDtype.U32), + (InfiniDtype.U32, InfiniDtype.I32), + # Integer to float conversions + (InfiniDtype.I32, InfiniDtype.F32), + (InfiniDtype.I32, InfiniDtype.F64), + (InfiniDtype.I64, InfiniDtype.F32), + (InfiniDtype.I64, InfiniDtype.F64), + (InfiniDtype.U32, InfiniDtype.F32), + (InfiniDtype.U32, InfiniDtype.F64), + (InfiniDtype.U64, InfiniDtype.F32), + (InfiniDtype.U64, InfiniDtype.F64), + # Float to integer conversions + (InfiniDtype.F32, InfiniDtype.I32), + (InfiniDtype.F32, InfiniDtype.I64), + (InfiniDtype.F64, InfiniDtype.I32), + (InfiniDtype.F64, InfiniDtype.I64), + (InfiniDtype.F32, InfiniDtype.U32), + (InfiniDtype.F32, InfiniDtype.U64), + 
(InfiniDtype.F64, InfiniDtype.U32), + (InfiniDtype.F64, InfiniDtype.U64), + # Float to float conversions + (InfiniDtype.F32, InfiniDtype.F64), + (InfiniDtype.F64, InfiniDtype.F32), + (InfiniDtype.F16, InfiniDtype.F32), + (InfiniDtype.F32, InfiniDtype.F16), + (InfiniDtype.F16, InfiniDtype.F64), + (InfiniDtype.F64, InfiniDtype.F16), + (InfiniDtype.BF16, InfiniDtype.F32), + (InfiniDtype.F32, InfiniDtype.BF16), +] + +# Form the test cases +_TEST_CASES = [] +for input_dtype, output_dtype in _TYPE_CONVERSIONS_: + for shape in _TEST_SHAPES_: + for stride in _TEST_STRIDES_: + _TEST_CASES.append((shape, stride, stride, input_dtype, output_dtype)) + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 1e-15, "rtol": 1e-15}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U32: {"atol": 0, "rtol": 0}, + InfiniDtype.U64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cast_pytorch(output, input_tensor): + """Cast using PyTorch""" + output.copy_(input_tensor) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + input_dtype=InfiniDtype.F32, + output_dtype=InfiniDtype.F16, + sync=None, +): + # Create input tensor with appropriate data based on type + if input_dtype in [InfiniDtype.I32, InfiniDtype.I64]: + # Signed integer: use both positive and negative values + input_tensor = TestTensor( + shape, input_stride, input_dtype, device, mode="randint", low=-50, high=50 + ) + elif input_dtype in [InfiniDtype.U32, InfiniDtype.U64]: + # Unsigned integer: use positive values + input_tensor = TestTensor( + shape, input_stride, input_dtype, device, mode="randint", low=0, high=100 + ) + else: + # Float: use random values + input_tensor = TestTensor(shape, input_stride, input_dtype, device) + + output_tensor = TestTensor(shape, output_stride, output_dtype, device, mode="zeros") + + print( + f"Testing Cast on {InfiniDeviceNames[device]} with shape:{shape} " + f"input_stride:{input_stride} output_stride:{output_stride} " + f"input_dtype:{InfiniDtypeNames[input_dtype]} output_dtype:{InfiniDtypeNames[output_dtype]}" + ) + + # Perform PyTorch cast for reference + cast_pytorch(output_tensor.torch_tensor(), input_tensor.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCastDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCastWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_cast(): + check_error( + LIBINFINIOP.infiniopCast( + descriptor, + workspace.data(), + workspace.size(), + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + lib_cast() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, output_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + # For integer types, use exact comparison + if output_dtype in [ 
+ InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, + ]: + assert torch.equal(output_tensor.actual_tensor(), output_tensor.torch_tensor()) + else: + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cast_pytorch(output_tensor.torch_tensor(), input_tensor.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cast(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCastDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + print(f"\033[94mRunning Cast operator tests...\033[0m") + print(f"Total test cases: {len(_TEST_CASES)}") + print(f"Type conversions tested: {len(_TYPE_CONVERSIONS_)}") + print("\nType conversion matrix:") + for i, (input_dtype, output_dtype) in enumerate(_TYPE_CONVERSIONS_): + print( + f" {i+1:2d}. {InfiniDtypeNames[input_dtype]:>6} -> {InfiniDtypeNames[output_dtype]:<6}" + ) + print() + + for device in get_test_devices(args): + print(f"\033[93mTesting on device: {InfiniDeviceNames[device]}\033[0m") + test_operator( + device, test, _TEST_CASES, [] + ) # Empty dtype list since we handle dtypes in test cases + + print("\033[92mAll Cast tests passed!\033[0m") diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py new file mode 100644 index 000000000..8393eb418 --- /dev/null +++ b/test/infiniop/cos.py @@ -0,0 +1,162 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. 
+ # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cos(x): + return torch.cos(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None +): + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = cos(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_cos(): + LIBINFINIOP.infiniopCos( + descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None + ) + + lib_cos() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cos(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mCos Test passed!\033[0m") diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py new file mode 100644 index 000000000..47849d8b5 --- /dev/null +++ b/test/infiniop/exp.py @@ -0,0 +1,162 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + 
get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. + # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(x): + return torch.exp(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None +): + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = exp(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_exp(): + LIBINFINIOP.infiniopExp( + descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None + ) + + lib_exp() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: exp(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mExp Test passed!\033[0m") diff 
--git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py new file mode 100644 index 000000000..05afc92e9 --- /dev/null +++ b/test/infiniop/hardswish.py @@ -0,0 +1,190 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. + # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# HardSwish is piecewise linear (with one division) and numerically stable; the +# tolerances match those used for GeLU and could even be tightened. +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_hswish = torch.nn.Hardswish(inplace=False) + + +def hardswish(x): + """ + Reference HardSwish using PyTorch: + hswish(x) = x * clamp(x + 3, 0, 6) / 6 + """ + return _hswish(x).to(x.dtype) +
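The closed form in the docstring can be checked directly against PyTorch's built-in. A minimal standalone sketch (the helper name manual_hardswish is ours; everything else is public PyTorch API) of the identity the tolerances rely on:

    import torch

    def manual_hardswish(x: torch.Tensor) -> torch.Tensor:
        # hswish(x) = x * clamp(x + 3, 0, 6) / 6
        return x * torch.clamp(x + 3.0, 0.0, 6.0) / 6.0

    x = torch.linspace(-5.0, 5.0, steps=101)
    assert torch.allclose(manual_hardswish(x), torch.nn.functional.hardswish(x))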
+ +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None +): + # Sample inputs from [-2, 2], covering the linear segments and the smooth middle region; widen the range if needed + input_torch_tensor = torch.rand(shape) * 4 - 2 + + input_tensor = TestTensor( + shape, + input_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=input_torch_tensor, + ) + + if inplace == Inplace.INPLACE_INPUT: + output_tensor = input_tensor + else: + output_tensor = TestTensor(shape, None, dtype, device) + + if output_tensor.is_broadcast(): + return + + print( + f"Testing HardSwish on {InfiniDeviceNames[device]} with shape:{shape} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # PyTorch reference + ans = hardswish(input_tensor.torch_tensor()) + + if sync is not None: + sync() + + # Create HardSwish descriptor + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateHardSwishDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + ) + ) + + # Invalidate desc shapes/strides to ensure kernel uses runtime args + for tensor in [input_tensor, output_tensor]: + tensor.destroy_desc() + + # Workspace + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardSwishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_hardswish(): + check_error( + LIBINFINIOP.infiniopHardSwish( + descriptor, + workspace.data(), + workspace.size(), + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Run lib op + lib_hardswish() + + # Verify + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output_tensor.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(output_tensor.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling (optional) + if PROFILE: + profile_operation( + "PyTorch", + lambda: hardswish(input_tensor.torch_tensor()), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyHardSwishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mHardSwish test passed!\033[0m")
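The next file adds the LeakyReLU test. As a reminder of the function under test, a small sketch of the identity its reference wrapper relies on (plain PyTorch, no assumptions beyond public APIs):

    import torch

    x = torch.randn(8)
    slope = 0.1
    # LeakyReLU: x where x >= 0, negative_slope * x otherwise
    manual = torch.where(x >= 0, x, slope * x)
    assert torch.allclose(manual, torch.nn.functional.leaky_relu(x, negative_slope=slope))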
diff --git a/test/infiniop/leaky_relu.py b/test/infiniop/leaky_relu.py new file mode 100644 index 000000000..57de370b9 --- /dev/null +++ b/test/infiniop/leaky_relu.py @@ -0,0 +1,340 @@ +import ctypes +from ctypes import c_uint64, c_float +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, negative_slope, inplace + # TODO: Uncomment the following line.
+ # ((), 0.01), + ((1, 3), 0.01), + ((3, 3), 0.1), + ((32, 20, 512), 0.2), + ((33, 333, 333), 0.01), + ((32, 256, 112, 112), 0.1), + ((3, 3, 13, 9, 17), 0.02), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def leaky_relu(x, negative_slope=0.01): + return torch.nn.functional.leaky_relu(x, negative_slope=negative_slope).to(x.dtype) + + +def test( + handle, + device, + shape, + negative_slope, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + # Generate test data with both positive and negative values to exercise both branches of LeakyReLU + input_torch_tensor = torch.randn(shape) * 2 # standard normal scaled by 2, so values mostly fall in [-4, 4] + + input_tensor = TestTensor( + shape, + input_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=input_torch_tensor, + ) + + if inplace == Inplace.INPLACE_INPUT: + output = input_tensor + else: + output = TestTensor(shape, None, dtype, device) + + if output.is_broadcast(): + return + + print( + f"Testing LeakyReLU on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} negative_slope:{negative_slope} inplace: {inplace}" + ) + + ans = leaky_relu(input_tensor.torch_tensor(), negative_slope) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLeakyReLUDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input_tensor.descriptor, + c_float(negative_slope), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLeakyReLUWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_leaky_relu(): + check_error( + LIBINFINIOP.infiniopLeakyReLU( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input_tensor.data(), + None, + ) + ) + + lib_leaky_relu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: leaky_relu(input_tensor.torch_tensor(), negative_slope), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_leaky_relu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyLeakyReLUDescriptor(descriptor))
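Note that negative_slope crosses the C boundary as a 32-bit float (see the c_float argtype registered in op_register.py below), so the kernel sees the float32 rounding of the Python double. A quick sketch of how small that rounding is, using only standard ctypes:

    from ctypes import c_float

    slope = 0.01
    rounded = c_float(slope).value      # what the C API actually receives
    assert rounded != slope             # 0.01 is not exactly representable in float32
    assert abs(rounded - slope) < 1e-9  # but the error is far below the test tolerances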
+ + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mLeakyReLU Test passed!\033[0m")
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..363d7a6d7 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -454,6 +454,7 @@ def swiglu_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def conv_(lib): lib.infiniopCreateConvDescriptor.restype = c_int32 @@ -489,3 +490,216 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def cast_(lib): + lib.infiniopCreateCastDescriptor.restype = c_int32 + lib.infiniopCreateCastDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCastWorkspaceSize.restype = c_int32 + lib.infiniopGetCastWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCast.restype = c_int32 + lib.infiniopCast.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCastDescriptor.restype = c_int32 + lib.infiniopDestroyCastDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def exp_(lib): + lib.infiniopCreateExpDescriptor.restype = c_int32 + lib.infiniopCreateExpDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetExpWorkspaceSize.restype = c_int32 + lib.infiniopGetExpWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopExp.restype = c_int32 + lib.infiniopExp.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyExpDescriptor.restype = c_int32 + lib.infiniopDestroyExpDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def sin_(lib): + lib.infiniopCreateSinDescriptor.restype = c_int32 + lib.infiniopCreateSinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSinWorkspaceSize.restype = c_int32 + lib.infiniopGetSinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSin.restype = c_int32 + lib.infiniopSin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySinDescriptor.restype = c_int32 + lib.infiniopDestroySinDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def cos_(lib): + lib.infiniopCreateCosDescriptor.restype = c_int32 + lib.infiniopCreateCosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCosWorkspaceSize.restype = c_int32 + lib.infiniopGetCosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCos.restype = c_int32 + lib.infiniopCos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCosDescriptor.restype = c_int32 + lib.infiniopDestroyCosDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def tanh_(lib): + lib.infiniopCreateTanhDescriptor.restype = c_int32 + lib.infiniopCreateTanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetTanhWorkspaceSize.restype = c_int32 + lib.infiniopGetTanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopTanh.restype = c_int32 + lib.infiniopTanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyTanhDescriptor.restype = c_int32 + lib.infiniopDestroyTanhDescriptor.argtypes = [infiniopOperatorDescriptor_t]
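These registrations matter because ctypes defaults every foreign function to an int return type and performs no argument conversion checks; declaring restype and argtypes keeps 64-bit pointers and size_t values from being silently truncated. A minimal illustration of the failure mode this guards against, using only standard ctypes against the C runtime (assumes a POSIX system; not InfiniOp-specific):

    import ctypes

    libc = ctypes.CDLL(None)  # handle to the already-loaded C runtime
    # Declare the prototype, exactly as op_register.py does for each operator;
    # without restype = c_void_p, ctypes would truncate the returned pointer to a C int.
    libc.strdup.argtypes = [ctypes.c_char_p]
    libc.strdup.restype = ctypes.c_void_p
    libc.free.argtypes = [ctypes.c_void_p]

    p = libc.strdup(b"infiniop")
    assert ctypes.cast(p, ctypes.c_char_p).value == b"infiniop"
    libc.free(p)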
+ + +@OpRegister.operator +def sigmoid_backward_(lib): + lib.infiniopCreateSigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopCreateSigmoidBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSigmoidBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetSigmoidBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSigmoidBackward.restype = c_int32 + lib.infiniopSigmoidBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopDestroySigmoidBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def hardswish_(lib): + lib.infiniopCreateHardSwishDescriptor.restype = c_int32 + lib.infiniopCreateHardSwishDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetHardSwishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardSwishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHardSwish.restype = c_int32 + lib.infiniopHardSwish.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyHardSwishDescriptor.restype = c_int32 + lib.infiniopDestroyHardSwishDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def where_(lib): + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetWhereWorkspaceSize.restype = c_int32 + lib.infiniopGetWhereWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def leaky_relu_(lib): + lib.infiniopCreateLeakyReLUDescriptor.restype = c_int32 + lib.infiniopCreateLeakyReLUDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + ] + + lib.infiniopGetLeakyReLUWorkspaceSize.restype = c_int32 + lib.infiniopGetLeakyReLUWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLeakyReLU.restype = c_int32 + lib.infiniopLeakyReLU.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLeakyReLUDescriptor.restype = c_int32 + lib.infiniopDestroyLeakyReLUDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ]
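The utils.py change below generates BOOL tensors by filling an F32 buffer and thresholding it at 0.5, since torch.rand cannot produce torch.bool directly. A standalone sketch of that trick (plain PyTorch; variable names are ours):

    import torch

    shape = (3, 4)
    probe = torch.rand(shape)   # uniform floats in [0, 1)
    flags = probe > 0.5         # roughly 50/50 random torch.bool tensor
    assert flags.dtype == torch.bool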
diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index de397a69e..1a8eaf505 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -66,10 +66,33 @@ def __init__( torch_strides.append(strides[i]) else: torch_shape.append(shape[i]) + + is_bool = dt == InfiniDtype.BOOL + if is_bool: + dt = InfiniDtype.F32 + + is_int = ( + dt == InfiniDtype.I8 + or dt == InfiniDtype.I16 + or dt == InfiniDtype.I32 + or dt == InfiniDtype.I64 + ) + if mode == "random": - self._torch_tensor = torch.rand( - torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] - ) + if is_int: + self._torch_tensor = torch.randint( + 0, + 100, + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) + else: + self._torch_tensor = torch.rand( + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) elif mode == "zeros": self._torch_tensor = torch.zeros( torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] @@ -88,6 +111,9 @@ def __init__( else: raise ValueError("Unsupported mode") + if is_bool: + self._torch_tensor = self._torch_tensor > 0.5 + if scale is not None: self._torch_tensor *= scale if bias is not None: @@ -103,6 +132,9 @@ def __init__( def torch_tensor(self): return self._torch_tensor + def update_torch_tensor(self, new_tensor: torch.Tensor): + self._torch_tensor = new_tensor + def actual_tensor(self): return self._data_tensor def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): if dt == InfiniDtype.I8: @@ -140,6 +175,8 @@ ... return torch.float32 elif dt == InfiniDtype.F64: return torch.float64 + elif dt == InfiniDtype.BOOL: + return torch.bool # TODO: These following types may not be supported by older # versions of PyTorch. Use compatability mode to convert them. elif dt == InfiniDtype.U16: @@ -330,6 +367,11 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True): actual = actual.to(torch.float32) desired = desired.to(torch.float32) + + # If either tensor is BOOL, convert both to FP32 before comparing + if actual.dtype == torch.bool or desired.dtype == torch.bool: + actual = actual.to(torch.float32) + desired = desired.to(torch.float32) + print_discrepancy(actual, desired, atol, rtol, equal_nan, verbose) np.testing.assert_allclose( actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True @@ -523,7 +565,7 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS): # Timed execution elapsed = timed_op(lambda: func(), NUM_ITERATIONS, torch_device) - print(f" {desc} time: {elapsed * 1000 :6f} ms") + print(f" {desc} time: {elapsed * 1000:6f} ms") def test_operator(device, test_func, test_cases, tensor_dtypes): @@ -605,9 +647,11 @@ def get_test_devices(args): def get_sync_func(device): import torch - if device == InfiniDeviceEnum.CPU or device == InfiniDeviceEnum.CAMBRICON: + device_str = torch_device_map[device] + + if device == InfiniDeviceEnum.CPU: sync = None else: - sync = getattr(torch, torch_device_map[device]).synchronize + sync = getattr(torch, device_str).synchronize return sync
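get_sync_func now returns torch.<device>.synchronize for every non-CPU backend; timing asynchronous kernels without such a barrier measures only launch overhead. A hedged sketch of the timing pattern this enables (CUDA shown as the example; other backends expose the same attribute through torch_device_map):

    import time
    import torch

    def timed(fn, iters, sync):
        # Synchronize before starting and after finishing so the wall clock
        # covers the kernels themselves, not just their launches.
        if sync is not None:
            sync()
        start = time.perf_counter()
        for _ in range(iters):
            fn()
        if sync is not None:
            sync()
        return (time.perf_counter() - start) / iters

    # Example wiring on CUDA: sync = torch.cuda.synchronize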
diff --git a/test/infiniop/sigmoid_backward.py b/test/infiniop/sigmoid_backward.py new file mode 100644 index 000000000..d387fe1f4 --- /dev/null +++ b/test/infiniop/sigmoid_backward.py @@ -0,0 +1,227 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# Shape/stride cases kept consistent with the ReLU tests +_TEST_CASES_ = [ + # shape, input_stride, grad_output_stride, grad_input_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + INPLACE_GRAD_OUTPUT = auto() + + +# Every case is tested with all three inplace modes +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, + Inplace.INPLACE_GRAD_OUTPUT, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerances for sigmoid backward (slightly looser than ReLU) +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 2e-3, "rtol": 2e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 3e-2, "rtol": 3e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sigmoid_backward(grad_input, input_tensor, grad_output): + """ + PyTorch reference implementation of Sigmoid backward. + + Given: + y = sigmoid(x) = 1 / (1 + exp(-x)) + Then: + dL/dx = dL/dy * y * (1 - y) + """ + s = torch.sigmoid(input_tensor) + result = grad_output * s * (1.0 - s) + + # Copy safely to avoid in-place side effects + with torch.no_grad(): + grad_input.copy_(result)
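The closed form grad_output * s * (1 - s) can be cross-checked against autograd. A minimal sketch, independent of the harness (pure PyTorch):

    import torch

    x = torch.randn(5, requires_grad=True)
    g = torch.randn(5)                  # incoming gradient dL/dy
    torch.sigmoid(x).backward(g)        # autograd computes dL/dx into x.grad
    s = torch.sigmoid(x.detach())
    assert torch.allclose(x.grad, g * s * (1 - s))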
+ +def test( + handle, + device, + shape, + input_stride=None, + grad_output_stride=None, + grad_input_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + # Inputs mix positive and negative values to cover different regions of the sigmoid + input_tensor = TestTensor( + shape, input_stride, dtype, device, mode="random", scale=4.0, bias=-2.0 + ) + grad_output = TestTensor(shape, grad_output_stride, dtype, device, mode="random") + + if inplace == Inplace.INPLACE_INPUT: + if input_stride != grad_input_stride: + return + grad_input = input_tensor + elif inplace == Inplace.INPLACE_GRAD_OUTPUT: + if grad_input_stride != grad_output_stride: + return + grad_input = grad_output + else: + grad_input = TestTensor(shape, grad_input_stride, dtype, device, mode="zeros") + + if grad_input.is_broadcast(): + return + + print( + f"Testing Sigmoid Backward on {InfiniDeviceNames[device]} with shape:{shape} " + f"input_stride:{input_stride} grad_output_stride:{grad_output_stride} grad_input_stride:{grad_input_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # Compute the PyTorch reference result (written into grad_input.torch_tensor()) + sigmoid_backward( + grad_input.torch_tensor(), + input_tensor.torch_tensor(), + grad_output.torch_tensor(), + ) + + if sync is not None: + sync() + + # Create the operator descriptor + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSigmoidBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input.descriptor, + input_tensor.descriptor, + grad_output.descriptor, + ) + ) + + # Invalidate the shape/strides in the descriptors so the kernel relies on runtime arguments + for tensor in [input_tensor, grad_output, grad_input]: + tensor.destroy_desc() + + # workspace + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSigmoidBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input.device) + + def lib_sigmoid_backward(): + check_error( + LIBINFINIOP.infiniopSigmoidBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_input.data(), + input_tensor.data(), + grad_output.data(), + None, + ) + ) + + # Run the library implementation; the result lands in grad_input.actual_tensor() + lib_sigmoid_backward() + + # Verify + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + assert torch.allclose( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling (optional) + if PROFILE: + profile_operation( + "PyTorch", + lambda: sigmoid_backward( + grad_input.torch_tensor(), + input_tensor.torch_tensor(), + grad_output.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", + lambda: lib_sigmoid_backward(), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + + check_error(LIBINFINIOP.infiniopDestroySigmoidBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mSigmoid Backward test passed!\033[0m")
diff --git a/test/infiniop/sin.py b/test/infiniop/sin.py new file mode 100644 index 000000000..9089f1403 --- /dev/null +++ b/test/infiniop/sin.py @@ -0,0 +1,162 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. + # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sin(x): + return torch.sin(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None +): + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sin on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sin(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them
from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sin(): + LIBINFINIOP.infiniopSin( + descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None + ) + + lib_sin() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sin(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mSin Test passed!\033[0m") diff --git a/test/infiniop/tanh.py b/test/infiniop/tanh.py new file mode 100644 index 000000000..67d1a6c01 --- /dev/null +++ b/test/infiniop/tanh.py @@ -0,0 +1,162 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. 
+ # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def tanh(x): + return torch.tanh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None +): + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Tanh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = tanh(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_tanh(): + LIBINFINIOP.infiniopTanh( + descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None + ) + + lib_tanh() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tanh(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tanh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyTanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTanh Test passed!\033[0m") diff --git a/test/infiniop/where.py b/test/infiniop/where.py new file mode 100644 index 000000000..1ed944cff --- /dev/null +++ b/test/infiniop/where.py @@ -0,0 +1,312 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, 
+ InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, condition_stride, a_stride, b_stride, c_stride + # Basic shapes + ((10,), None, None, None, None), + ((5, 10), None, None, None, None), + ((2, 3, 4), None, None, None, None), + ((4, 5, 6), None, None, None, None), + # Different strides + ((10, 10), (10, 1), None, None, None), + ((10, 10), None, (10, 1), None, None), + ((10, 10), None, None, (10, 1), None), + ((10, 10), None, None, None, (10, 1)), + # Odd shapes + ((7, 13), None, None, None, None), # prime dimensions + ((3, 5, 7), None, None, None, None), # 3-D primes + ((11, 17, 23), None, None, None, None), # larger primes + # Non-standard shapes + ((1, 1), None, None, None, None), # smallest shape + ((1, 100), None, None, None, None), # single row + ((100, 1), None, None, None, None), # single column + ((64, 64), None, None, None, None), # powers of two + ((16, 16, 16), None, None, None, None), # 3-D powers of two + # Large shapes + ((100, 100), None, None, None, None), + ((32, 32, 32), None, None, None, None), + # Broadcast cases - currently skipped, kept as potential extensions + ((10,), (0,), None, None, None), # broadcast condition + ((5, 10), None, (0, 1), None, None), # broadcast a + ((5, 10), None, None, (0, 1), None), # broadcast b +] + + +# Float, signed integer, and bool types are all exercised; unsigned integer +# types stay commented out for now. +_TENSOR_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, + InfiniDtype.BF16, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + # InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64, + InfiniDtype.BOOL, +] + + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-6}, + InfiniDtype.F64: {"atol": 1e-15, "rtol": 1e-14}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + InfiniDtype.U16: {"atol": 0, "rtol": 0}, + InfiniDtype.U32: {"atol": 0, "rtol": 0}, + InfiniDtype.U64: {"atol": 0, "rtol": 0}, + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, +} + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +_INPLACE = [ + Inplace.INPLACE_A, + Inplace.INPLACE_B, + Inplace.OUT_OF_PLACE, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def where(c, condition, a, b): + """Where operation: c[i] = condition[i] ? a[i] : b[i]""" + result = torch.where(condition.to(torch.bool), a, b) + c.copy_(result)
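torch.where requires a boolean condition, which is why the reference casts first; any nonzero value counts as true. A small sketch of that semantics (plain PyTorch):

    import torch

    cond = torch.tensor([0.0, 2.5, -1.0])   # nonzero means "take a"
    a = torch.tensor([1, 2, 3])
    b = torch.tensor([10, 20, 30])
    out = torch.where(cond.to(torch.bool), a, b)
    assert out.tolist() == [10, 2, 3]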
a[i] : b[i]""" + result = torch.where(condition.to(torch.bool), a, b) + c.copy_(result) + + +def test( + handle, + device, + shape, + condition_stride=None, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F32, + sync=None, +): + # Create input tensors a and b with specified dtype + # For unsigned integer types, we need to be careful about random generation + if dtype in [InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64]: + # Use a smaller range for unsigned types to avoid overflow + a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=10, bias=0) + b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=10, bias=0) + condition = TestTensor( + shape, condition_stride, dtype, device, mode="random", scale=10, bias=0 + ) + elif dtype in [InfiniDtype.I8, InfiniDtype.I16, InfiniDtype.I32, InfiniDtype.I64]: + # Use a reasonable range for signed integer types + a = TestTensor( + shape, a_stride, dtype, device, mode="random", scale=100, bias=-50 + ) + b = TestTensor( + shape, b_stride, dtype, device, mode="random", scale=100, bias=-50 + ) + condition = TestTensor( + shape, condition_stride, dtype, device, mode="random", scale=100, bias=-50 + ) + else: + # For floating point and bool types, use default random generation + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + condition = TestTensor(shape, condition_stride, dtype, device) + # Handle inplace operations + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if b_stride != c_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + # Skip broadcast cases for now + if ( + c.is_broadcast() + or condition.is_broadcast() + or a.is_broadcast() + or b.is_broadcast() + ): + return + + print( + f"Testing Where on {InfiniDeviceNames[device]} with shape:{shape} " + f"condition_stride:{condition_stride} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # Compute reference result using PyTorch + where( + c.torch_tensor(), condition.torch_tensor(), a.torch_tensor(), b.torch_tensor() + ) + + if sync is not None: + sync() + + # Store expected result before library operation + expected_result = c.torch_tensor().clone() + + # Create descriptor + descriptor = infiniopOperatorDescriptor_t() + print( + a.torch_tensor().dtype, + b.torch_tensor().dtype, + condition.torch_tensor().dtype, + c.torch_tensor().dtype, + ) + check_error( + LIBINFINIOP.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + condition.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Get workspace size + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetWhereWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_where(): + check_error( + LIBINFINIOP.infiniopWhere( + descriptor, + workspace.data() if workspace is not None else None, + workspace_size.value, + c.data(), + condition.data(), + a.data(), + b.data(), + None, + ) + ) + + # Execute library operation + lib_where() + + # Destroy the tensor descriptors + for tensor in [condition, a, b, c]: + tensor.destroy_desc() + + # Check results with better error reporting + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + + # # Always print debug info for failed cases + # print(f"Condition values: 
+ + # Use torch.equal for exact comparison for integer and boolean types + if dtype in [ + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U8, + InfiniDtype.U16, + InfiniDtype.U32, + InfiniDtype.U64, + InfiniDtype.BOOL, + ]: + if not torch.equal(c.actual_tensor(), expected_result): + print(f"Exact comparison failed for {InfiniDtypeNames[dtype]}") + print( + f"Max absolute difference: {torch.max(torch.abs(c.actual_tensor() - expected_result))}" + ) + assert False, f"Results don't match exactly for {InfiniDtypeNames[dtype]}" + else: + if not torch.allclose(c.actual_tensor(), expected_result, atol=atol, rtol=rtol): + print(f"Tolerance comparison failed for {InfiniDtypeNames[dtype]}") + print( + f"Max absolute difference: {torch.max(torch.abs(c.actual_tensor() - expected_result))}" + ) + print(f"Tolerance: atol={atol}, rtol={rtol}") + assert ( + False + ), f"Results don't match within tolerance for {InfiniDtypeNames[dtype]}" + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: where(c.torch_tensor(), condition.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_where(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyWhereDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mWhere test passed!\033[0m")
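All of the new tests follow the same descriptor lifecycle against the C API: create the descriptor, query the workspace size, launch the kernel, destroy the descriptor. As a compact summary of that calling convention, a hedged sketch in ctypes (names mirror the Exp operator added above; run_unary and workspace_factory are hypothetical helpers, not part of the library):

    import ctypes
    from ctypes import c_uint64

    from libinfiniop import LIBINFINIOP, check_error, infiniopOperatorDescriptor_t

    def run_unary(handle, out, inp, workspace_factory):
        """Sketch of the shared lifecycle used by the unary-op tests above."""
        desc = infiniopOperatorDescriptor_t()
        check_error(LIBINFINIOP.infiniopCreateExpDescriptor(
            handle, ctypes.byref(desc), out.descriptor, inp.descriptor))
        size = c_uint64(0)
        check_error(LIBINFINIOP.infiniopGetExpWorkspaceSize(desc, ctypes.byref(size)))
        ws = workspace_factory(size.value)  # allocate a device workspace buffer
        check_error(LIBINFINIOP.infiniopExp(
            desc, ws.data(), ws.size(), out.data(), inp.data(), None))  # None = default stream
        check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(desc))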