diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..572546cf7 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -17,5 +17,14 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/exp.h" +#include "infiniop/ops/sin.h" +#include "infiniop/ops/cos.h" +#include "infiniop/ops/tanh.h" +#include "infiniop/ops/sigmoid_backward.h" +#include "infiniop/ops/hardswish.h" +#include "infiniop/ops/where.h" +#include "infiniop/ops/leaky_relu.h" +#include "infiniop/ops/cast.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/cast.h b/include/infiniop/ops/cast.h new file mode 100644 index 000000000..a3a84c00b --- /dev/null +++ b/include/infiniop/ops/cast.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_CAST_API_H__ +#define __INFINIOP_CAST_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCastDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCastDescriptor(infiniopHandle_t handle, + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCast(infiniopCastDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h new file mode 100644 index 000000000..098c0d7e1 --- /dev/null +++ b/include/infiniop/ops/cos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COS_API_H__ +#define __INFINIOP_COS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h new file mode 100644 index 000000000..1b7defcc5 --- /dev/null +++ b/include/infiniop/ops/exp.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_EXP_API_H__ +#define __INFINIOP_EXP_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopExpDescriptor_t; + +__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopExp(infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 
index 000000000..8b54b207b --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardSwishDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHardSwishDescriptor(infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHardSwish(infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/leaky_relu.h b/include/infiniop/ops/leaky_relu.h new file mode 100644 index 000000000..143ff28b6 --- /dev/null +++ b/include/infiniop/ops/leaky_relu.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_LEAKY_RELU_API_H__ +#define __INFINIOP_LEAKY_RELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLeakyReLUDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLeakyReLUDescriptor(infiniopHandle_t handle, + infiniopLeakyReLUDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float negative_slope); + +__C __export infiniStatus_t infiniopGetLeakyReLUWorkspaceSize(infiniopLeakyReLUDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLeakyReLU(infiniopLeakyReLUDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLeakyReLUDescriptor(infiniopLeakyReLUDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/sigmoid_backward.h b/include/infiniop/ops/sigmoid_backward.h new file mode 100644 index 000000000..82b55e9dd --- /dev/null +++ b/include/infiniop/ops/sigmoid_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_SIGMOID_BACKWARD_API_H__ +#define __INFINIOP_SIGMOID_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSigmoidBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSigmoidBackwardDescriptor(infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t grad_output); + +__C __export infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSigmoidBackward(infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream); + +__C __export infiniStatus_t infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/sin.h b/include/infiniop/ops/sin.h new file mode 100644 index 000000000..dba8683e5 --- /dev/null +++ b/include/infiniop/ops/sin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIN_API_H__ +#define __INFINIOP_SIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinDescriptor_t; + +__C __export 
infiniStatus_t infiniopCreateSinDescriptor(infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSin(infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h new file mode 100644 index 000000000..b6531e391 --- /dev/null +++ b/include/infiniop/ops/tanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANH_API_H__ +#define __INFINIOP_TANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h new file mode 100644 index 000000000..d38c753df --- /dev/null +++ b/include/infiniop/ops/where.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_WHERE_API_H__ +#define __INFINIOP_WHERE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t condition, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *condition, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..69b26ed8b 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -24,6 +24,15 @@ def run_tests(args): "rope.py", "sub.py", "swiglu.py", + "exp.py", + "sin.py", + "cos.py", + "tanh.py", + "sigmoid_backward.py", + "hardswish.py", + "where.py", + "leaky_relu.py", + "cast.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..aa2ef9389 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,15 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(exp) +DECLARE_INFINIOP_TEST(sin) +DECLARE_INFINIOP_TEST(cos) +DECLARE_INFINIOP_TEST(tanh) 
+DECLARE_INFINIOP_TEST(sigmoid_backward) +DECLARE_INFINIOP_TEST(hardswish) +DECLARE_INFINIOP_TEST(where) +DECLARE_INFINIOP_TEST(leakyrelu) +DECLARE_INFINIOP_TEST(cast) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -30,19 +39,28 @@ DECLARE_INFINIOP_TEST(sub) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(exp) \ + REGISTER_INFINIOP_TEST(sin) \ + REGISTER_INFINIOP_TEST(cos) \ + REGISTER_INFINIOP_TEST(tanh) \ + REGISTER_INFINIOP_TEST(sigmoid_backward) \ + REGISTER_INFINIOP_TEST(hardswish) \ + REGISTER_INFINIOP_TEST(where) \ + REGISTER_INFINIOP_TEST(leakyrelu) \ + REGISTER_INFINIOP_TEST(cast) \ } namespace infiniop_test { @@ -64,4 +82,4 @@ bool check_names( } // namespace infiniop_test -#endif +#endif \ No newline at end of file diff --git a/src/infiniop-test/include/test.hpp b/src/infiniop-test/include/test.hpp index e2dd45f9f..277061029 100644 --- a/src/infiniop-test/include/test.hpp +++ b/src/infiniop-test/include/test.hpp @@ -47,7 +47,7 @@ std::vector> runAllTests( const GGUFFileReader &, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol); + double rtol, double atol, bool equal_nan = false); // Run a single test read from a GGUF file std::shared_ptr runTest( @@ -55,10 +55,11 @@ std::shared_ptr runTest( infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, double rtol, double atol, - size_t test_id); + size_t test_id, + bool equal_nan = false); // Check if two tensors are close within given tolerance -void allClose(std::shared_ptr actual, std::shared_ptr expected, double rtol = 1e-3, double atol = 1e-3); +void allClose(std::shared_ptr actual, std::shared_ptr expected, double rtol = 1e-3, double atol = 1e-3, bool equal_nan = false); // Check if two tensors are equal void allEqual(std::shared_ptr actual, std::shared_ptr expected); @@ -85,13 +86,14 @@ class Test { namespace infiniop_test::name { \ class Test : public infiniop_test::base::Test { \ double _rtol, _atol; \ + bool _equal_nan; \ \ public: \ static std::string op_name() { return #name; } \ static std::shared_ptr build( \ std::unordered_map> attributes, \ std::unordered_map> tensors, \ - double, double); \ + double, double, bool); \ \ static std::vector attribute_names(); \ static std::vector tensor_names(); \ @@ -109,7 +111,8 @@ class Test { struct Attributes; \ Attributes *_attributes; \ Test() = delete; \ - Test(double rtol, double atol) : _rtol(rtol), _atol(atol) {} \ + Test(double rtol, double atol, bool equal_nan = false) \ + : _rtol(rtol), _atol(atol), _equal_nan(equal_nan) {} \ }; \ } @@ -117,7 +120,7 @@ namespace infiniop_test { using BuilderFunc = std::function( std::unordered_map>, 
std::unordered_map>, - double, double)>; + double, double, bool)>; // Testcase Registry // Each testcase should provide a formatted builder, attribute names, and tensor names diff --git a/src/infiniop-test/src/gguf.cpp b/src/infiniop-test/src/gguf.cpp index a4b200033..aee5b39a8 100644 --- a/src/infiniop-test/src/gguf.cpp +++ b/src/infiniop-test/src/gguf.cpp @@ -53,7 +53,9 @@ GGUFFileReader::GGUFFileReader(const std::string &filepath) { try { _file = std::make_shared(filepath); } catch (const std::exception &e) { - throw e; + // Log, then rethrow: _file is still null here, and the lines below would dereference it. + std::cerr << "Error: " << e.what() << std::endl; + throw; } _data = _file->ptr(); _cursor = reinterpret_cast(_data); diff --git a/src/infiniop-test/src/main.cpp b/src/infiniop-test/src/main.cpp index 4863c8172..6805bd7f8 100644 --- a/src/infiniop-test/src/main.cpp +++ b/src/infiniop-test/src/main.cpp @@ -1,8 +1,8 @@ #include "gguf.hpp" #include "test.hpp" +#include <cstring> // for strcmp #include <iostream> #include <string> - struct ParsedArgs { std::string file_path; // Mandatory argument: test.gguf file path infiniDevice_t device_type = INFINI_DEVICE_CPU; // Default to CPU @@ -11,12 +11,13 @@ struct ParsedArgs { int iterations = 0; // Default to 0 if not given double atol = 0.001; // Default absolute tolerance double rtol = 0.001; // Default relative tolerance + bool equal_nan = false; // Default: NaNs are not considered equal }; void printUsage() { std::cout << "Usage:" << std::endl << std::endl; - std::cout << "infiniop-test <test_file> [--<device>[:id]] [--warmup <count>] [--run <count>] [--atol <atol>] [--rtol <rtol>]" << std::endl + std::cout << "infiniop-test <test_file> [--<device>[:id]] [--warmup <count>] [--run <count>] [--atol <atol>] [--rtol <rtol>] [--equal-nan <bool>]" << std::endl << std::endl; std::cout << " <test_file>" << std::endl; std::cout << " Path to the test gguf file" << std::endl @@ -36,6 +37,9 @@ void printUsage() { std::cout << " --rtol <rtol>" << std::endl; std::cout << " (Optional) Relative tolerance for correctness check. Default to 0.001" << std::endl << std::endl; + std::cout << " --equal-nan <bool>" << std::endl; + std::cout << " (Optional) If true, two NaNs are considered equal. Default to false" << std::endl + << std::endl; exit(-1); } @@ -91,6 +95,10 @@ ParsedArgs parseArgs(int argc, char *argv[]) { else if (arg == "--rtol" && i + 1 < argc) { args.rtol = std::stod(argv[++i]); } + else if (arg == "--equal-nan" && i + 1 < argc) { + args.equal_nan = (strcmp(argv[++i], "True") == 0 + || strcmp(argv[i], "true") == 0); + } else { printUsage(); } @@ -119,7 +128,7 @@ int main(int argc, char *argv[]) { reader, (infiniDevice_t)args.device_type, args.device_id, args.warmups, args.iterations, - args.rtol, args.atol); + args.rtol, args.atol, args.equal_nan); std::cout << "=====================================" << std::endl; for (auto result : results) {
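Usage note on the new flag: `--equal-nan` only changes how `allClose` treats NaNs; `allEqual` is unaffected. A typical invocation might look like the following (the `.gguf` file name and the `--cpu` device-flag spelling are illustrative, not taken from this diff):

```
infiniop-test exp.gguf --cpu --warmup 5 --run 100 --equal-nan true
```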
diff --git a/src/infiniop-test/src/ops/add.cpp b/src/infiniop-test/src/ops/add.cpp index 27f69d687..e90290d55 100644 --- a/src/infiniop-test/src/ops/add.cpp +++ b/src/infiniop-test/src/ops/add.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr<base::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/cast.cpp b/src/infiniop-test/src/ops/cast.cpp new file mode 100644 index 000000000..258f74654 --- /dev/null +++ b/src/infiniop-test/src/ops/cast.cpp @@ -0,0 +1,111 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::cast { + +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> output; + std::shared_ptr<Tensor> ans; +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr<base::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopCastDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + if (input->data() == output->data()) { + return TEST_FAILED(OP_CREATION_FAILED, + "Cast does not support inplace: input and output alias."); + } + + CHECK_OR(infiniopCreateCastDescriptor(handle, &op_desc, + /*dst*/ output->desc(), + /*src*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cast descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetCastWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + 
CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopCast(op_desc, workspace, workspace_size, + /*dst*/ output->data(), + /*src*/ input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyCastDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopCast(op_desc, workspace, workspace_size, + /*dst*/ output->data(), + /*src*/ input->data(), + /*stream*/ nullptr); + }, + warm_ups, iterations); + + infiniopDestroyCastDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { return {}; } + +std::vector Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + oss << "- inplace: false" << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::cast diff --git a/src/infiniop-test/src/ops/causal_softmax.cpp b/src/infiniop-test/src/ops/causal_softmax.cpp index 29612960a..97c65ef8c 100644 --- a/src/infiniop-test/src/ops/causal_softmax.cpp +++ b/src/infiniop-test/src/ops/causal_softmax.cpp @@ -14,8 +14,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("y") == tensors.end() @@ -53,7 +53,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -92,7 +92,7 @@ std::string Test::toString() const { oss << "- y: " << _attributes->y->info() << std::endl; oss << "- ans: " << _attributes->ans->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/clip.cpp b/src/infiniop-test/src/ops/clip.cpp index 82a0e9b10..a01c18a4d 100644 --- a/src/infiniop-test/src/ops/clip.cpp +++ b/src/infiniop-test/src/ops/clip.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new 
Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("min_val") == tensors.end() @@ -64,7 +64,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -109,7 +109,7 @@ std::string Test::toString() const { oss << "- max_val: " << _attributes->max_val->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/cos.cpp b/src/infiniop-test/src/ops/cos.cpp new file mode 100644 index 000000000..d1d99a0a3 --- /dev/null +++ b/src/infiniop-test/src/ops/cos.cpp @@ -0,0 +1,105 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cos { + +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopCosDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateCosDescriptor(handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cos descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetCosWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopCos(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyCosDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopCos(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyCosDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { return {}; } + +std::vector Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string 
Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::cos diff --git a/src/infiniop-test/src/ops/exp.cpp b/src/infiniop-test/src/ops/exp.cpp new file mode 100644 index 000000000..23dde66d0 --- /dev/null +++ b/src/infiniop-test/src/ops/exp.cpp @@ -0,0 +1,103 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::exp { + +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; // out + std::shared_ptr ans; // reference +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopExpDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateExpDescriptor(handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create exp descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetExpWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopExp(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyExpDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopExp(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyExpDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { return {}; } + +std::vector Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << 
_equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::exp diff --git a/src/infiniop-test/src/ops/gemm.cpp b/src/infiniop-test/src/ops/gemm.cpp index 37c8ed6fe..664288d73 100644 --- a/src/infiniop-test/src/ops/gemm.cpp +++ b/src/infiniop-test/src/ops/gemm.cpp @@ -18,8 +18,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); @@ -65,7 +65,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -120,7 +120,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/hardswish.cpp b/src/infiniop-test/src/ops/hardswish.cpp new file mode 100644 index 000000000..e47943bdb --- /dev/null +++ b/src/infiniop-test/src/ops/hardswish.cpp @@ -0,0 +1,105 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::hardswish { + +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopHardSwishDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateHardSwishDescriptor(handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create HardSwish descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetHardSwishWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopHardSwish(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr), + return 
TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyHardSwishDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopHardSwish(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyHardSwishDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector<std::string> Test::attribute_names() { return {}; } + +std::vector<std::string> Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector<std::string> Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::hardswish diff --git a/src/infiniop-test/src/ops/leakyrelu.cpp b/src/infiniop-test/src/ops/leakyrelu.cpp new file mode 100644 index 000000000..d26924ca2 --- /dev/null +++ b/src/infiniop-test/src/ops/leakyrelu.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <cstring> +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::leakyrelu { + +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> output; + std::shared_ptr<Tensor> ans; + float negative_slope = 0.01f; // default matches PyTorch +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end() + || attributes.find("negative_slope") == attributes.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->negative_slope = *reinterpret_cast<const float *>(attributes["negative_slope"].data()); + return test; +} + +std::shared_ptr<base::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopLeakyReLUDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateLeakyReLUDescriptor( + handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc(), + /*negative_slope*/ _attributes->negative_slope), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create LeakyReLU descriptor.")); + + size_t workspace_size = 0; + CHECK_OR(infiniopGetLeakyReLUWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace = nullptr; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopLeakyReLU(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + 
/*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyLeakyReLUDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopLeakyReLU(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr); + }, + warm_ups, iterations); + + infiniopDestroyLeakyReLUDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { return {"negative_slope"}; } + +std::vector Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- negative_slope: " << _attributes->negative_slope << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + oss << "- inplace: true" << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::leakyrelu diff --git a/src/infiniop-test/src/ops/mul.cpp b/src/infiniop-test/src/ops/mul.cpp index 8ebfc426b..cb0b639bf 100644 --- a/src/infiniop-test/src/ops/mul.cpp +++ b/src/infiniop-test/src/ops/mul.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/random_sample.cpp b/src/infiniop-test/src/ops/random_sample.cpp index a11e0f446..75ee07b44 100644 --- a/src/infiniop-test/src/ops/random_sample.cpp +++ b/src/infiniop-test/src/ops/random_sample.cpp @@ -20,8 +20,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); @@ 
-70,7 +70,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(result, _attributes->ans, _rtol, _atol); + allClose(result, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- data: " << _attributes->data->info() << std::endl; oss << "- result: " << _attributes->result->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rearrange.cpp b/src/infiniop-test/src/ops/rearrange.cpp index 9fbf6f2cb..bdf162ce2 100644 --- a/src/infiniop-test/src/ops/rearrange.cpp +++ b/src/infiniop-test/src/ops/rearrange.cpp @@ -12,9 +12,9 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { + double rtol, double atol, bool equal_nan) { - auto test = std::shared_ptr(new Test(rtol, atol)); + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); diff --git a/src/infiniop-test/src/ops/rms_norm.cpp b/src/infiniop-test/src/ops/rms_norm.cpp index 8359a4536..786ce8470 100644 --- a/src/infiniop-test/src/ops/rms_norm.cpp +++ b/src/infiniop-test/src/ops/rms_norm.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (attributes.find("epsilon") == attributes.end() @@ -72,7 +72,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "RMSNorm execution failed")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- w: " << _attributes->w->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rope.cpp b/src/infiniop-test/src/ops/rope.cpp index 636f565af..94010a122 100644 --- a/src/infiniop-test/src/ops/rope.cpp +++ b/src/infiniop-test/src/ops/rope.cpp @@ -17,8 +17,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("y") == tensors.end() @@ -77,7 +77,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, 
_attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -121,7 +121,7 @@ std::string Test::toString() const { oss << "- sin_table: " << _attributes->sin_table->info() << std::endl; oss << "- cos_table: " << _attributes->cos_table->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); }
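The next file adds the `sigmoid_backward` testcase. For orientation, a minimal sketch of the quantity the `ans` tensor is expected to hold is below; it assumes `input` is the pre-activation x, so the sigmoid is recomputed (if the operator were instead handed the saved sigmoid output y, the formula reduces to dy * y * (1 - y)):

```cpp
#include <cmath>
#include <cstddef>

// Hedged reference: dx = dy * s * (1 - s), with s = sigmoid(x).
// Assumes "input" holds the pre-activation x; this diff does not pin that down.
void sigmoid_backward_ref(float *grad_input, const float *input,
                          const float *grad_output, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        float s = 1.0f / (1.0f + std::exp(-input[i]));
        grad_input[i] = grad_output[i] * s * (1.0f - s);
    }
}
```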
diff --git a/src/infiniop-test/src/ops/sigmoid_backward.cpp b/src/infiniop-test/src/ops/sigmoid_backward.cpp new file mode 100644 index 000000000..0248d6a47 --- /dev/null +++ b/src/infiniop-test/src/ops/sigmoid_backward.cpp @@ -0,0 +1,115 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::sigmoid_backward { + +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> grad_output; + std::shared_ptr<Tensor> grad_input; // output + std::shared_ptr<Tensor> ans; // reference +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr<base::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopSigmoidBackwardDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + + CHECK_OR(infiniopCreateSigmoidBackwardDescriptor(handle, &op_desc, + /*dst*/ grad_input->desc(), + /*input*/ input->desc(), + /*dy*/ grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create sigmoid_backward descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetSigmoidBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopSigmoidBackward(op_desc, workspace, workspace_size, + /*dst*/ grad_input->data(), + /*input*/ input->data(), + /*dy*/ grad_output->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + // Floating-point comparison; consider loosening rtol/atol for mixed precision. + allClose(grad_input, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroySigmoidBackwardDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopSigmoidBackward(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroySigmoidBackwardDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector<std::string> Test::attribute_names() { return {}; } + +std::vector<std::string> Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector<std::string> Test::output_names() { return {"grad_input"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::sigmoid_backward diff --git a/src/infiniop-test/src/ops/sin.cpp b/src/infiniop-test/src/ops/sin.cpp new file mode 100644 index 000000000..daa10d1f8 --- /dev/null +++ b/src/infiniop-test/src/ops/sin.cpp @@ -0,0 +1,105 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::sin { + +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> output; + std::shared_ptr<Tensor> ans; +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr<base::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopSinDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateSinDescriptor(handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create sin descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetSinWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopSin(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroySinDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopSin(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroySinDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector<std::string> Test::attribute_names() { return {}; } + +std::vector<std::string> Test::tensor_names() { 
return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::sin diff --git a/src/infiniop-test/src/ops/sub.cpp b/src/infiniop-test/src/ops/sub.cpp index 6bb1fd1eb..bb3adc350 100644 --- a/src/infiniop-test/src/ops/sub.cpp +++ b/src/infiniop-test/src/ops/sub.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/swiglu.cpp b/src/infiniop-test/src/ops/swiglu.cpp index 96b75efc5..f86dfadc6 100644 --- a/src/infiniop-test/src/ops/swiglu.cpp +++ b/src/infiniop-test/src/ops/swiglu.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() @@ -54,7 +54,7 @@ std::shared_ptr Test::run( CHECK_OR(infiniopSwiGLU(op_desc, workspace, workspace_size, c->data(), a->data(), b->data(), nullptr), return TEST_FAILED(OP_CREATION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -93,7 +93,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/tanh.cpp b/src/infiniop-test/src/ops/tanh.cpp new file mode 100644 index 000000000..4ccc2aa7b --- /dev/null +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -0,0 +1,105 @@ 
+#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::tanh { + +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; // out + std::shared_ptr ans; // reference +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopTanhDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateTanhDescriptor(handle, &op_desc, + /*y*/ output->desc(), + /*x*/ input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create tanh descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetTanhWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopTanh(op_desc, workspace, workspace_size, + /*y*/ output->data(), + /*x*/ input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + infiniopDestroyTanhDescriptor(op_desc); + infinirtFree(workspace); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = benchmark( + [=]() { + infiniopTanh(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyTanhDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { return {}; } + +std::vector Test::tensor_names() { return {"input", "output", "ans"}; } + +std::vector Test::output_names() { return {"output"}; } + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol + << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { delete _attributes; } + +} // namespace infiniop_test::tanh diff --git a/src/infiniop-test/src/ops/where.cpp b/src/infiniop-test/src/ops/where.cpp new file mode 100644 index 000000000..c9bf8379f --- /dev/null +++ b/src/infiniop-test/src/ops/where.cpp @@ -0,0 +1,130 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::where { + +struct Test::Attributes { + std::shared_ptr cond; + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr out; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + 
diff --git a/src/infiniop-test/src/ops/where.cpp b/src/infiniop-test/src/ops/where.cpp
new file mode 100644
index 000000000..c9bf8379f
--- /dev/null
+++ b/src/infiniop-test/src/ops/where.cpp
@@ -0,0 +1,130 @@
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <sstream>
+
+namespace infiniop_test::where {
+
+struct Test::Attributes {
+    std::shared_ptr<Tensor> cond;
+    std::shared_ptr<Tensor> a;
+    std::shared_ptr<Tensor> b;
+    std::shared_ptr<Tensor> out;
+    std::shared_ptr<Tensor> ans;
+};
+
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol, bool equal_nan) {
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan));
+    test->_attributes = new Attributes();
+
+    if (tensors.find("condition") == tensors.end()
+        || tensors.find("a") == tensors.end()
+        || tensors.find("b") == tensors.end()
+        || tensors.find("c") == tensors.end()
+        || tensors.find("ans") == tensors.end()) {
+        throw std::runtime_error("Invalid Test: missing condition, a, b, c, or ans tensor");
+    }
+
+    test->_attributes->cond = tensors["condition"];
+    test->_attributes->a = tensors["a"];
+    test->_attributes->b = tensors["b"];
+    test->_attributes->out = tensors["c"];
+    test->_attributes->ans = tensors["ans"];
+
+    return test;
+}
+
+std::shared_ptr<Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id,
+    size_t warm_ups, size_t iterations) {
+
+    infiniopWhereDescriptor_t op_desc;
+
+    auto cond = _attributes->cond->to(device, device_id);
+    auto a = _attributes->a->to(device, device_id);
+    auto b = _attributes->b->to(device, device_id);
+    auto out = _attributes->out->to(device, device_id);
+
+    CHECK_OR(infiniopCreateWhereDescriptor(handle, &op_desc,
+                                           out->desc(),
+                                           cond->desc(),
+                                           a->desc(),
+                                           b->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create where descriptor."));
+
+    size_t workspace_size;
+    CHECK_OR(infiniopGetWhereWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+
+    CHECK_OR(infiniopWhere(op_desc, workspace, workspace_size,
+                           out->data(),
+                           cond->data(),
+                           a->data(),
+                           b->data(),
+                           nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    try {
+        // The where output normally shares the dtype of a/b; for integer or
+        // boolean outputs, rtol = 0 and atol = 0 are recommended.
+        allClose(out, _attributes->ans, _rtol, _atol, _equal_nan);
+    } catch (const std::exception &e) {
+        infiniopDestroyWhereDescriptor(op_desc);
+        infinirtFree(workspace);
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    double elapsed_time = benchmark(
+        [=]() {
+            infiniopWhere(op_desc, workspace, workspace_size,
+                          out->data(),
+                          cond->data(),
+                          a->data(),
+                          b->data(),
+                          nullptr);
+        },
+        warm_ups, iterations);
+
+    infiniopDestroyWhereDescriptor(op_desc);
+    infinirtFree(workspace);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"condition", "a", "b", "c", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"c"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- condition: " << _attributes->cond->info() << std::endl;
+    oss << "- a: " << _attributes->a->info() << std::endl;
+    oss << "- b: " << _attributes->b->info() << std::endl;
+    oss << "- out: " << _attributes->out->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::where
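
(Editor's sketch, not part of the patch.) The reference semantics this test exercises, written out in plain C++: each output element selects from a or b according to condition.

#include <array>
#include <cstddef>
#include <cstdio>

int main() {
    std::array<bool, 4> cond{true, false, true, false};
    std::array<float, 4> a{1, 2, 3, 4}, b{10, 20, 30, 40}, c{};
    for (std::size_t i = 0; i < c.size(); ++i) {
        c[i] = cond[i] ? a[i] : b[i]; // c = {1, 20, 3, 40}
    }
    for (float v : c) {
        std::printf("%g ", v);
    }
    return 0;
}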
diff --git a/src/infiniop-test/src/test.cpp b/src/infiniop-test/src/test.cpp
index e312ac5f5..0cbfe067a 100644
--- a/src/infiniop-test/src/test.cpp
+++ b/src/infiniop-test/src/test.cpp
@@ -49,7 +49,7 @@ std::string Result::toString() const {
 std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_reader,
                                                  infiniDevice_t device, int device_id,
                                                  size_t warm_ups, size_t iterations,
-                                                 double rtol, double atol) {
+                                                 double rtol, double atol, bool equal_nan) {
     auto meta = gguf_reader.getAttributeMap();
     auto count_meta = meta.find("test_count");
     if (count_meta == meta.end()) {
@@ -60,7 +60,7 @@ std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_read
     auto results = std::vector<std::shared_ptr<Result>>(count);
     try {
         for (size_t i = 0; i < count; i++) {
-            results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i);
+            results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i, equal_nan);
         }
     } catch (const std::exception &e) {
         std::cerr << "Error: " << e.what() << std::endl;
@@ -72,7 +72,7 @@ std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_read
 std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader,
                                 infiniDevice_t device, int device_id,
                                 size_t warm_ups, size_t iterations,
-                                double rtol, double atol, size_t test_id) {
+                                double rtol, double atol, size_t test_id, bool equal_nan) {
     auto meta = gguf_reader.getAttributeMap();
     auto tensor_info = gguf_reader.getTensorInfoMap();
     auto name_meta = meta.find("test." + std::to_string(test_id) + ".op_name");
@@ -107,7 +107,7 @@ std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader,
     }
     std::shared_ptr<Test> test;
     try {
-        test = builder.build(attrs, tensors, rtol, atol);
+        test = builder.build(attrs, tensors, rtol, atol, equal_nan);
     } catch (const std::exception &e) {
         return TEST_INIT_FAILED(op_name + "\n" + e.what());
     }
@@ -141,7 +141,7 @@ void incrementOffset(ptrdiff_t &offset_1, const std::vector<ptrdiff_t> &strides_
     }
 }
 
-void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_, double rtol, double atol) {
+void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_, double rtol, double atol, bool equal_nan) {
     auto actual = actual_->to(INFINI_DEVICE_CPU);
     auto expected = expected_->to(INFINI_DEVICE_CPU);
     auto shape = actual->shape();
@@ -158,12 +158,22 @@
     for (size_t i = 0; i < total; i++) {
         double a_ = getVal((char *)actual->data() + actual_offset, actual->ggml_type());
         double e_ = getVal((char *)expected->data() + expected_offset, expected->ggml_type());
-        if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) {
-            if (num_failed == 0) {
-                first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
-            }
-            num_failed++;
-        }
+        if (std::isnan(a_) || std::isnan(e_)) {
+            // With equal_nan, NaN matches NaN, so only a NaN paired with a
+            // non-NaN fails; without it, any NaN fails. The failure count is
+            // checked before it is incremented so that the first failing
+            // index is actually recorded.
+            if (!equal_nan || (std::isnan(a_) != std::isnan(e_))) {
+                if (num_failed == 0) {
+                    first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
+                }
+                num_failed++;
+            }
+        } else if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) {
+            if (num_failed == 0) {
+                first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
+            }
+            num_failed++;
+        }
         incrementOffset(actual_offset, actual->strides(), ggmlTypeSize(actual->ggml_type()), expected_offset, expected->strides(), ggmlTypeSize(expected->ggml_type()), counter, shape);
diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.cc b/src/infiniop/ops/cast/cpu/cast_cpu.cc
new file mode 100644
index 000000000..3c5336161
--- /dev/null
+++ b/src/infiniop/ops/cast/cpu/cast_cpu.cc
@@ -0,0 +1,86 @@
+#include "cast_cpu.h"
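// (Editor's note, not part of the patch.) The equal_nan flag threaded through
// allClose() above mirrors numpy.allclose: with equal_nan == true, a NaN in
// `actual` only fails when the matching `expected` element is not NaN; with
// equal_nan == false, any NaN on either side is a mismatch. For example,
// actual = {1.0, NaN} vs expected = {1.0, NaN} passes with equal_nan = true
// and fails at index 1 with equal_nan = false.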
+
+namespace op::cast::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto input_dtype = input_desc_vec.at(0)->dtype();
+    auto output_dtype = out_desc->dtype();
+
+    CHECK_SAME_SHAPE(out_desc->shape(), input_desc_vec.at(0)->shape());
+    auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
+    CHECK_RESULT(info_result);
+    // create CPU elementwise descriptor
+    *desc_ptr = new Descriptor(
+        input_dtype,
+        output_dtype,
+        info_result.take(),
+        nullptr,
+        0,
+        handle->device,
+        handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+#define SWITCH_IN_TYPE(OUT_TYPE, IN_TYPE)                                                           \
+    switch (IN_TYPE) {                                                                              \
+    case INFINI_DTYPE_I32:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, int32_t>(_info, output, inputs, stream);   \
+    case INFINI_DTYPE_I64:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, int64_t>(_info, output, inputs, stream);   \
+    case INFINI_DTYPE_U32:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, uint32_t>(_info, output, inputs, stream);  \
+    case INFINI_DTYPE_U64:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, uint64_t>(_info, output, inputs, stream);  \
+    case INFINI_DTYPE_F16:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, fp16_t>(_info, output, inputs, stream);    \
+    case INFINI_DTYPE_F32:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, float>(_info, output, inputs, stream);     \
+    case INFINI_DTYPE_F64:                                                                          \
+        return _device_info->calculate<CastOp, OUT_TYPE, double>(_info, output, inputs, stream);    \
+    case INFINI_DTYPE_BF16:                                                                         \
+        return _device_info->calculate<CastOp, OUT_TYPE, bf16_t>(_info, output, inputs, stream);    \
+    default:                                                                                        \
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;                                                      \
+    }
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // Handle type conversions based on input and output types
+    switch (_output_dtype) {
+    case INFINI_DTYPE_I32:
+        SWITCH_IN_TYPE(int32_t, _input_dtype)
+    case INFINI_DTYPE_I64:
+        SWITCH_IN_TYPE(int64_t, _input_dtype)
+    case INFINI_DTYPE_U32:
+        SWITCH_IN_TYPE(uint32_t, _input_dtype)
+    case INFINI_DTYPE_U64:
+        SWITCH_IN_TYPE(uint64_t, _input_dtype)
+    case INFINI_DTYPE_F16:
+        SWITCH_IN_TYPE(fp16_t, _input_dtype)
+    case INFINI_DTYPE_F32:
+        SWITCH_IN_TYPE(float, _input_dtype)
+    case INFINI_DTYPE_F64:
+        SWITCH_IN_TYPE(double, _input_dtype)
+    case INFINI_DTYPE_BF16:
+        SWITCH_IN_TYPE(bf16_t, _input_dtype)
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::cast::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.h b/src/infiniop/ops/cast/cpu/cast_cpu.h
new file mode 100644
index 000000000..47485e883
--- /dev/null
+++ b/src/infiniop/ops/cast/cpu/cast_cpu.h
@@ -0,0 +1,58 @@
+#ifndef CAST_CPU_H
+#define CAST_CPU_H
+
+#include "../../../../utils/custom_types.h"
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <memory>
+
+namespace op::cast::cpu {
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _input_dtype, _output_dtype;
+    op::elementwise::ElementwiseInfo _info;
+    std::unique_ptr<op::elementwise::cpu::DeviceImpl> _device_info;
+    size_t _workspace_size;
+
+    Descriptor(
+        infiniDtype_t input_dtype,
+        infiniDtype_t output_dtype,
+        op::elementwise::ElementwiseInfo info,
+        op::elementwise::cpu::DeviceImpl *device_info,
+        size_t workspace_size,
+        infiniDevice_t device_type,
+        int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _input_dtype(input_dtype),
+          _output_dtype(output_dtype),
+          _info(std::move(info)),
+          _device_info(std::move(device_info)),
+          _workspace_size(workspace_size) {}
+
+public:
+    ~Descriptor();
+
+    size_t workspaceSize() const { return _workspace_size; }
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle_,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t output_desc,
+        std::vector<infiniopTensorDescriptor_t> input_desc);
+
+    infiniStatus_t calculate(
+        void *workspace, size_t workspace_size,
+        void *output,
+        std::vector<const void *> inputs,
+        void *stream) const;
+};
+struct CastOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename OUT_TYPE, typename IN_TYPE>
+    OUT_TYPE operator()(const IN_TYPE &x) const {
+        return utils::cast<OUT_TYPE>(x);
+    }
+};
+
+} // namespace op::cast::cpu
+
+#endif // CAST_CPU_H
\ No newline at end of file
diff --git a/src/infiniop/ops/cast/cuda/kernel.cuh b/src/infiniop/ops/cast/cuda/kernel.cuh
new file mode 100644
index 000000000..98556fd71
--- /dev/null
+++ b/src/infiniop/ops/cast/cuda/kernel.cuh
@@ -0,0 +1,77 @@
+namespace op::cast::cuda {
+
+typedef struct CastOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+private:
+    template <typename T_dst, typename T_src>
+    __device__ __forceinline__ T_dst cast_impl(const T_src &x) const {
+        if constexpr (std::is_same_v<T_src, T_dst>) {
+            return x;
+        } else if constexpr (std::is_same_v<T_src, half>) {
+            // From half
+            if constexpr (std::is_same_v<T_dst, float>) {
+                return __half2float(x);
+            } else if constexpr (std::is_same_v<T_dst, double>) {
+                return static_cast<double>(__half2float(x));
+            } else if constexpr (std::is_same_v<T_dst, cuda_bfloat16>) {
+                return __float2bfloat16(__half2float(x));
+            } else if constexpr (std::is_integral_v<T_dst>) {
+                return static_cast<T_dst>(__half2float(x));
+            } else {
+                return static_cast<T_dst>(__half2float(x));
+            }
+        } else if constexpr (std::is_same_v<T_src, cuda_bfloat16>) {
+            // From bfloat16
+            if constexpr (std::is_same_v<T_dst, float>) {
+                return __bfloat162float(x);
+            } else if constexpr (std::is_same_v<T_dst, double>) {
+                return static_cast<double>(__bfloat162float(x));
+            } else if constexpr (std::is_same_v<T_dst, half>) {
+                return __float2half(__bfloat162float(x));
+            } else if constexpr (std::is_integral_v<T_dst>) {
+                return static_cast<T_dst>(__bfloat162float(x));
+            } else {
+                return static_cast<T_dst>(__bfloat162float(x));
+            }
+        } else if constexpr (std::is_same_v<T_dst, half>) {
+            // To half
+            if constexpr (std::is_same_v<T_src, float>) {
+                return __float2half(x);
+            } else if constexpr (std::is_same_v<T_src, double>) {
+                return __float2half(static_cast<float>(x));
+            } else {
+                return __float2half(static_cast<float>(x));
+            }
+        } else if constexpr (std::is_same_v<T_dst, cuda_bfloat16>) {
+            // To bfloat16
+            if constexpr (std::is_same_v<T_src, float>) {
+                return __float2bfloat16(x);
+            } else if constexpr (std::is_same_v<T_src, double>) {
+                return __float2bfloat16(static_cast<float>(x));
+            } else {
+                return __float2bfloat16(static_cast<float>(x));
+            }
+        } else if constexpr (std::is_same_v<T_src, half2>) {
+            // Handle half2 special case
+            if constexpr (std::is_same_v<T_dst, float>) {
+                return __half2float(__low2half(x));
+            } else {
+                return static_cast<T_dst>(__half2float(__low2half(x)));
+            }
+        } else {
+            // Direct cast for other cases
+            return static_cast<T_dst>(x);
+        }
+    }
+
+public:
+    template <typename T_dst, typename T_src>
+    __device__ __forceinline__ T_dst operator()(const T_src &x) const {
+        return cast_impl<T_dst, T_src>(x);
+    }
+
+} CastOp;
+
+} // namespace op::cast::cuda
diff --git a/src/infiniop/ops/cast/metax/cast_metax.h b/src/infiniop/ops/cast/metax/cast_metax.h
new file mode 100644
index 000000000..0ae57feb2
--- /dev/null
+++ b/src/infiniop/ops/cast/metax/cast_metax.h
@@ -0,0 +1,48 @@
+#ifndef CAST_METAX_API_H
+#define CAST_METAX_API_H
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+#include <memory>
+
+namespace op::cast::metax {
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _input_dtype, _output_dtype;
+    op::elementwise::ElementwiseInfo _info;
+    std::unique_ptr<op::elementwise::metax::DeviceImpl> _device_info;
+    size_t _workspace_size;
+
+    Descriptor(
+        infiniDtype_t
input_dtype, + infiniDtype_t output_dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _input_dtype(input_dtype), + _output_dtype(output_dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_desc); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; +} // namespace op::cast::metax + +#endif // CAST_METAX_API_H \ No newline at end of file diff --git a/src/infiniop/ops/cast/metax/cast_metax.maca b/src/infiniop/ops/cast/metax/cast_metax.maca new file mode 100644 index 000000000..5e2c73022 --- /dev/null +++ b/src/infiniop/ops/cast/metax/cast_metax.maca @@ -0,0 +1,101 @@ +#include "cast_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cast::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto output_dtype = out_desc->dtype(); + auto input_dtype = input_desc_vec.at(0)->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_SAME_SHAPE(out_desc->shape(), input_desc_vec.at(0)->shape()); + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + // create metax elementwise descriptor + *desc_ptr = new Descriptor( + input_dtype, + output_dtype, + std::move(info), + std::move(device_impl_result.take()), + workspace_size, + handle->device, + handle->device_id); + return INFINI_STATUS_SUCCESS; +} + + +#define SWITCH_IN_TYPE_METAX(OUT_TYPE, IN_TYPE) \ + switch(IN_TYPE){ \ + case INFINI_DTYPE_I32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, int32_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_I64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, int64_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_U32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, uint32_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_U64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, uint64_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, double>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ 
+ return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // Handle type conversions based on input and output types + switch (_output_dtype) { + case INFINI_DTYPE_I32: + SWITCH_IN_TYPE_METAX(int32_t,_input_dtype) + case INFINI_DTYPE_I64: + SWITCH_IN_TYPE_METAX(int64_t,_input_dtype) + case INFINI_DTYPE_U32: + SWITCH_IN_TYPE_METAX(uint32_t,_input_dtype) + case INFINI_DTYPE_U64: + SWITCH_IN_TYPE_METAX(uint64_t,_input_dtype) + case INFINI_DTYPE_F16: + SWITCH_IN_TYPE_METAX(half,_input_dtype) + case INFINI_DTYPE_F32: + SWITCH_IN_TYPE_METAX(float,_input_dtype) + case INFINI_DTYPE_F64: + SWITCH_IN_TYPE_METAX(double,_input_dtype) + case INFINI_DTYPE_BF16: + SWITCH_IN_TYPE_METAX(cuda_bfloat16,_input_dtype) + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cast::metax diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cu b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu new file mode 100644 index 000000000..238af6857 --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu @@ -0,0 +1,98 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cast_nvidia.cuh" + +namespace op::cast::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto output_dtype = out_desc->dtype(); + auto input_dtype = input_desc_vec.at(0)->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_SAME_SHAPE(out_desc->shape(), input_desc_vec.at(0)->shape()); + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + // Create DeviceImpl using the correct pattern from the macro + auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + // Create nvidia elementwise descriptor + *desc_ptr = new Descriptor( + input_dtype, + output_dtype, + std::move(info), + std::move(device_impl_result.take()), + workspace_size, + handle->device, + handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +#define SWITCH_IN_TYPE_NVIDIA(OUT_TYPE, IN_TYPE) \ + switch (IN_TYPE) { \ + case INFINI_DTYPE_I32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, int32_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_I64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, int64_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_U32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, uint32_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_U64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, uint64_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, half>(_info, workspace, output, inputs, stream); \ + case 
INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, double>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ + return _device_info->calculate<256, cuda::CastOp, OUT_TYPE, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // Handle type conversions based on input and output types + switch (_output_dtype) { + case INFINI_DTYPE_I32: + SWITCH_IN_TYPE_NVIDIA(int32_t, _input_dtype) + case INFINI_DTYPE_I64: + SWITCH_IN_TYPE_NVIDIA(int64_t, _input_dtype) + case INFINI_DTYPE_U32: + SWITCH_IN_TYPE_NVIDIA(uint32_t, _input_dtype) + case INFINI_DTYPE_U64: + SWITCH_IN_TYPE_NVIDIA(uint64_t, _input_dtype) + case INFINI_DTYPE_F16: + SWITCH_IN_TYPE_NVIDIA(half, _input_dtype) + case INFINI_DTYPE_F32: + SWITCH_IN_TYPE_NVIDIA(float, _input_dtype) + case INFINI_DTYPE_F64: + SWITCH_IN_TYPE_NVIDIA(double, _input_dtype) + case INFINI_DTYPE_BF16: + SWITCH_IN_TYPE_NVIDIA(cuda_bfloat16, _input_dtype) + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cast::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh new file mode 100644 index 000000000..09418f0db --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh @@ -0,0 +1,47 @@ +#ifndef CAST_NVIDIA_API_H +#define CAST_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" +#include + +namespace op::cast::nvidia { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _input_dtype, _output_dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + + Descriptor( + infiniDtype_t input_dtype, + infiniDtype_t output_dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _input_dtype(input_dtype), + _output_dtype(output_dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_desc); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; +} // namespace op::cast::nvidia +#endif // CAST_NVIDIA_API_H diff --git a/src/infiniop/ops/cast/operator.cc b/src/infiniop/ops/cast/operator.cc new file mode 100644 index 000000000..dc2589741 --- /dev/null +++ b/src/infiniop/ops/cast/operator.cc @@ -0,0 +1,141 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cast.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cast_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cast_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cast_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCastDescriptor( + infiniopHandle_t handle, + 
infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cast::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCast( + infiniopCastDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc new file mode 100644 index 000000000..578b55281 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -0,0 +1,52 @@ +#include "cos_cpu.h" + +namespace op::cos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + 
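    // (Editor's sketch, not part of the patch.) For orientation, the
    // host-side lifecycle this descriptor takes part in, assuming a valid
    // handle and device buffers d_y / d_x matching the descriptors validated
    // here:
    //
    //     infiniopCosDescriptor_t desc;
    //     infiniopCreateCosDescriptor(handle, &desc, y_desc, x_desc);
    //     size_t ws;
    //     infiniopGetCosWorkspaceSize(desc, &ws);
    //     void *workspace;
    //     infinirtMalloc(&workspace, ws);
    //     infiniopCos(desc, workspace, ws, d_y, d_x, /*stream=*/nullptr);
    //     infinirtFree(workspace);
    //     infiniopDestroyCosDescriptor(desc);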
CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::cpu \ No newline at end of file diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h new file mode 100644 index 000000000..45dbba919 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.h @@ -0,0 +1,33 @@ +#ifndef COS_CPU_H +#define COS_CPU_H + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(cos, cpu) + +namespace op::cos::cpu { +typedef struct CosOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + // cos(x) = cosine of x + if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::cos(x_f)); + } else if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::cos(x_f)); + } else if constexpr (std::is_same_v) { + return std::cos(x); + } else if constexpr (std::is_same_v) { + return std::cos(x); + } else { + return std::cos(x); + } + } +} CosOp; +} // namespace op::cos::cpu + +#endif // COS_CPU_H \ No newline at end of file diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh new file mode 100644 index 000000000..3d909ce4e --- /dev/null +++ b/src/infiniop/ops/cos/cuda/kernel.cuh @@ -0,0 +1,46 @@ +#ifndef COS_CUDA_H +#define COS_CUDA_H + +namespace op::cos::cuda { +typedef struct CosOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // For half2, process each half separately using CUDA intrinsics + half x_low = __low2half(x); + half x_high = __high2half(x); + + float x_low_f = __half2float(x_low); + float x_high_f = __half2float(x_high); + + half cos_low = __float2half(cosf(x_low_f)); + half cos_high = __float2half(cosf(x_high_f)); + + return __halves2half2(cos_low, cos_high); + } else if constexpr (std::is_same_v) { + // Convert to float for computation to maintain precision + float x_f = __half2float(x); + float result = cosf(x_f); + return __float2half(result); + } else if constexpr (std::is_same_v) { + // Convert to float for computation to maintain precision + float x_f = __bfloat162float(x); + float result = cosf(x_f); + return __float2bfloat16(result); + } else if constexpr (std::is_same_v) { + // Use fast math functions for float + return cosf(x); + } else if constexpr (std::is_same_v) { + return ::cos(x); + } else { + // Fallback + return cosf(x); + } + } +} CosOp; +} // namespace op::cos::cuda + +#endif // COS_CUDA_H \ No newline at end of file diff --git a/src/infiniop/ops/cos/metax/cos_metax.h b/src/infiniop/ops/cos/metax/cos_metax.h new file mode 100644 index 000000000..9c43dfd5f --- /dev/null +++ 
b/src/infiniop/ops/cos/metax/cos_metax.h @@ -0,0 +1,8 @@ +#ifndef COS_METAX_API_H +#define COS_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(cos, metax) + +#endif // COS_METAX_API_H diff --git a/src/infiniop/ops/cos/metax/cos_metax.maca b/src/infiniop/ops/cos/metax/cos_metax.maca new file mode 100644 index 000000000..894c8ca9f --- /dev/null +++ b/src/infiniop/ops/cos/metax/cos_metax.maca @@ -0,0 +1,60 @@ +#include "cos_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cos::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::metax \ No newline at end of file diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu new file mode 100644 index 000000000..bee985672 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cos_nvidia.cuh" + +namespace op::cos::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < 
_workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh new file mode 100644 index 000000000..7849028e9 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef COS_NVIDIA_API_H +#define COS_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cos, nvidia) + +#endif // COS_NVIDIA_API_H diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc new file mode 100644 index 000000000..e8ddeddf5 --- /dev/null +++ b/src/infiniop/ops/cos/operator.cc @@ -0,0 +1,141 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cos_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cos_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCosDescriptor( + infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCos( + infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + 
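    // (Editor's note.) Each CALCULATE(CASE, NAMESPACE) case below expands to
    // a cast plus a dispatch into the backend descriptor; the CPU branch is
    // equivalent to:
    //
    //     case INFINI_DEVICE_CPU:
    //         return reinterpret_cast<op::cos::cpu::Descriptor *>(desc)
    //             ->calculate(workspace, workspace_size, output, {input}, stream);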
CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc new file mode 100644 index 000000000..61456efad --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc @@ -0,0 +1,52 @@ +#include "exp_cpu.h" + +namespace op::exp::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::cpu \ No newline at end of file diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h new file mode 100644 index 000000000..47eb5e7a3 --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -0,0 +1,27 @@ +#ifndef EXP_CPU_H +#define EXP_CPU_H + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(exp, cpu) + +namespace op::exp::cpu { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &a) const { + if constexpr (std::is_same_v) { + return std::exp(a); + } else if constexpr (std::is_same_v) { + return std::exp(a); + } else { + // For fp16_t and bf16_t, convert to float, compute exp, then convert back + return static_cast(std::exp(static_cast(a))); + } + } +} ExpOp; +} // namespace op::exp::cpu + +#endif // EXP_CPU_H diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh 
b/src/infiniop/ops/exp/cuda/kernel.cuh new file mode 100644 index 000000000..2eafe9566 --- /dev/null +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -0,0 +1,37 @@ +#ifndef EXP_CUDA_H +#define EXP_CUDA_H + +namespace op::exp::cuda { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &a) const { + if constexpr (std::is_same_v) { + // For half2, split into two halves, compute exp, then combine + half2 result; + result.x = __float2half(expf(__half2float(a.x))); + result.y = __float2half(expf(__half2float(a.y))); + return result; + } else if constexpr (std::is_same_v) { + // Convert half to float, compute exp, convert back + float fa = __half2float(a); + float result = expf(fa); + return __float2half(result); + } else if constexpr (std::is_same_v) { + // Convert bf16 to float, compute exp, then convert back + float fa = __bfloat162float(a); + float result = expf(fa); + return __float2bfloat16(result); + } else if constexpr (std::is_same_v) { + return expf(a); + } else if constexpr (std::is_same_v) { + return ::exp(a); + } else { + return ::exp(a); + } + } +} ExpOp; +} // namespace op::exp::cuda + +#endif // EXP_CUDA_H \ No newline at end of file diff --git a/src/infiniop/ops/exp/metax/exp_metax.h b/src/infiniop/ops/exp/metax/exp_metax.h new file mode 100644 index 000000000..64fa186cf --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.h @@ -0,0 +1,8 @@ +#ifndef EXP_METAX_API_H +#define EXP_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(exp, metax) + +#endif // EXP_METAX_API_H diff --git a/src/infiniop/ops/exp/metax/exp_metax.maca b/src/infiniop/ops/exp/metax/exp_metax.maca new file mode 100644 index 000000000..a214634b9 --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.maca @@ -0,0 +1,60 @@ +#include "exp_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::exp::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create MetaX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return 
INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::metax \ No newline at end of file diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu new file mode 100644 index 000000000..f79846145 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "exp_nvidia.cuh" + +namespace op::exp::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh new file mode 100644 index 000000000..596d88d62 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef EXP_NVIDIA_API_H +#define EXP_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp, nvidia) + +#endif // EXP_NVIDIA_API_H diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc new file mode 100644 index 000000000..674d8dbfc --- /dev/null +++ b/src/infiniop/ops/exp/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/exp.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/exp_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/exp_metax.h" +#endif + +__C infiniStatus_t infiniopCreateExpDescriptor( + infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + 
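    // (Editor's note.) ExpOp computes e^x elementwise; for f32 inputs
    // {0, 1, -1} the expected outputs are {1, 2.7182817, 0.36787944}. The
    // F16 and BF16 paths round-trip through float, so results are these
    // values rounded to the narrower type.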
CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopExp( + infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc new file mode 100644 index 000000000..606db3a1f --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -0,0 +1,52 @@ +#include "hardswish_cpu.h" + +namespace op::hardswish::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector 
inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::cpu \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h new file mode 100644 index 000000000..54ce51b18 --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -0,0 +1,41 @@ + +#ifndef HARDSWISH_CPU_H +#define HARDSWISH_CPU_H + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include +#include + +ELEMENTWISE_DESCRIPTOR(hardswish, cpu) + +namespace op::hardswish::cpu { +typedef struct HardSwishOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + // HardSwish(x) = x * HardSigmoid(x) = x * max(0, min(1, (x + 3) / 6)) + if constexpr (std::is_same_v) { + float x_f = static_cast(x); + float hard_sigmoid = std::max(0.0f, std::min(1.0f, (x_f + 3.0f) / 6.0f)); + return static_cast(x_f * hard_sigmoid); + } else if constexpr (std::is_same_v) { + float x_f = static_cast(x); + float hard_sigmoid = std::max(0.0f, std::min(1.0f, (x_f + 3.0f) / 6.0f)); + return static_cast(x_f * hard_sigmoid); + } else if constexpr (std::is_same_v) { + float hard_sigmoid = std::max(0.0f, std::min(1.0f, (x + 3.0f) / 6.0f)); + return x * hard_sigmoid; + } else if constexpr (std::is_same_v) { + double hard_sigmoid = std::max(0.0, std::min(1.0, (x + 3.0) / 6.0)); + return x * hard_sigmoid; + } else { + float x_f = static_cast(x); + float hard_sigmoid = std::max(0.0f, std::min(1.0f, (x_f + 3.0f) / 6.0f)); + return static_cast(x_f * hard_sigmoid); + } + } +} HardSwishOp; +} // namespace op::hardswish::cpu + +#endif // HARDSWISH_CPU_H \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh new file mode 100644 index 000000000..cd55aa874 --- /dev/null +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -0,0 +1,36 @@ +#ifndef HARDSWISH_CUDA_H +#define HARDSWISH_CUDA_H + +namespace op::hardswish::cuda { + +typedef struct HardSwishOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + float xf = __half2float(x); + float t = fminf(6.0f, fmaxf(0.0f, xf + 3.0f)) * (1.0f / 6.0f); + return __float2half(xf * t); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float t = fminf(6.0f, fmaxf(0.0f, xf + 3.0f)) * (1.0f / 6.0f); + return __float2bfloat16(xf * t); + } else if constexpr (std::is_same_v) { + float t = fminf(6.0f, fmaxf(0.0f, x + 3.0f)) * (1.0f / 6.0f); + return x * t; + } else if constexpr (std::is_same_v) { + double t = fmin(6.0, fmax(0.0, x + 3.0)) * (1.0 / 6.0); + return x * t; + } else { + float xf = static_cast(x); + float t = fminf(6.0f, fmaxf(0.0f, xf + 3.0f)) * (1.0f / 6.0f); + return static_cast(xf * t); + } + } +} HardSwishOp; + +} // namespace op::hardswish::cuda + +#endif // HARDSWISH_CUDA_H diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h new file mode 100644 
index 000000000..cfde66aa0 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.h @@ -0,0 +1,9 @@ + +#ifndef HARDSWISH_METAX_API_H +#define HARDSWISH_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, metax) + +#endif // HARDSWISH_METAX_API_H diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca new file mode 100644 index 000000000..308f4c493 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca @@ -0,0 +1,60 @@ +#include "hardswish_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardswish::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::metax \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..b3e9f7d5a --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +namespace op::hardswish::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + 
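    // (Editor's note.) HardSwishOp computes x * clamp(x + 3, 0, 6) / 6, so
    // hardswish(-4) = 0, hardswish(-1) = -1/3, hardswish(0) = 0, and
    // hardswish(x) = x for x >= 3 (e.g. hardswish(3) = 3).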
return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..8fd92a3a6 --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,9 @@ + +#ifndef HARDSWISH_NVIDIA_API_H +#define HARDSWISH_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif // HARDSWISH_NVIDIA_API_H diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..0a807a022 --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,141 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardswish_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardswish_metax.h" +#endif + +__C infiniStatus_t infiniopCreateHardSwishDescriptor( + infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopHardSwish( + infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.cc b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.cc new file mode 100644 index 000000000..b8d09a1af --- /dev/null +++ b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.cc @@ -0,0 +1,64 @@ +#include "leaky_relu_cpu.h" + +namespace op::leaky_relu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float negative_slope) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor( + dtype, + info_result.take(), + nullptr, + 0, + handle->device, + handle->device_id, + negative_slope); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream, _negative_slope); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream, _negative_slope); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream, _negative_slope); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream, _negative_slope); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::leaky_relu::cpu \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.h b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.h new file mode 100644 index 000000000..7cd46289c --- /dev/null +++ 
b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.h
@@ -0,0 +1,78 @@
+#ifndef LEAKY_RELU_CPU_H
+#define LEAKY_RELU_CPU_H
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <type_traits>
+
+namespace op::leaky_relu::cpu {
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _dtype;
+    op::elementwise::ElementwiseInfo _info;
+    std::unique_ptr<op::elementwise::cpu::DeviceImpl> _device_info;
+    size_t _workspace_size;
+    float _negative_slope;
+
+    Descriptor(
+        infiniDtype_t dtype,
+        op::elementwise::ElementwiseInfo info,
+        op::elementwise::cpu::DeviceImpl *device_info,
+        size_t workspace_size,
+        infiniDevice_t device_type,
+        int device_id,
+        float negative_slope)
+        : InfiniopDescriptor{device_type, device_id},
+          _dtype(dtype),
+          _info(std::move(info)),
+          _device_info(std::move(device_info)),
+          _workspace_size(workspace_size),
+          _negative_slope(negative_slope) {}
+
+public:
+    ~Descriptor();
+
+    size_t workspaceSize() const { return _workspace_size; }
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t output_desc,
+        std::vector<infiniopTensorDescriptor_t> input_desc,
+        float negative_slope);
+
+    infiniStatus_t calculate(
+        void *workspace, size_t workspace_size,
+        void *output,
+        std::vector<const void *> inputs,
+        void *stream) const;
+};
+
+typedef struct LeakyReLUOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x, float negative_slope) const {
+        // LeakyReLU(x) = max(0, x) + negative_slope * min(0, x)
+        // Equivalent to: x >= 0 ? x : negative_slope * x
+        if constexpr (std::is_same_v<T, fp16_t>) {
+            float x_f = static_cast<float>(x);
+            float result = x_f >= 0.0f ? x_f : negative_slope * x_f;
+            return static_cast<fp16_t>(result);
+        } else if constexpr (std::is_same_v<T, bf16_t>) {
+            float x_f = static_cast<float>(x);
+            float result = x_f >= 0.0f ? x_f : negative_slope * x_f;
+            return static_cast<bf16_t>(result);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return x >= 0.0f ? x : negative_slope * x;
+        } else if constexpr (std::is_same_v<T, double>) {
+            return x >= 0.0 ? x : static_cast<double>(negative_slope) * x;
+        } else {
+            return x >= T(0) ? x : static_cast<T>(negative_slope) * x;
+        }
+    }
+
+} LeakyReLUOp;
+
+} // namespace op::leaky_relu::cpu
+
+#endif // LEAKY_RELU_CPU_H
\ No newline at end of file
diff --git a/src/infiniop/ops/leaky_relu/cuda/kernel.cuh b/src/infiniop/ops/leaky_relu/cuda/kernel.cuh
new file mode 100644
index 000000000..460538443
--- /dev/null
+++ b/src/infiniop/ops/leaky_relu/cuda/kernel.cuh
@@ -0,0 +1,42 @@
+#ifndef LEAKY_RELU_CUDA_H
+#define LEAKY_RELU_CUDA_H
+
+namespace op::leaky_relu::cuda {
+typedef struct LeakyReLUOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    // __host__ __device__ LeakyReLUOp() = default;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x, const float *negative_slope) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // For half2, process each half separately
+            half x_low = __low2half(x);
+            half x_high = __high2half(x);
+            half result_low = x_low >= __float2half(0.0f) ? x_low : __float2half(*negative_slope) * x_low;
+            half result_high = x_high >= __float2half(0.0f) ? x_high : __float2half(*negative_slope) * x_high;
+            return __halves2half2(result_low, result_high);
+        } else if constexpr (std::is_same_v<T, half>) {
+            // Use CUDA half operations
+            half zero = __float2half(0.0f);
+            half neg_slope = __float2half(*negative_slope);
+            return x >= zero ? x : neg_slope * x;
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            // Convert to float for computation to maintain precision
+            float x_f = __bfloat162float(x);
+            float result = x_f >= 0.0f ?
x_f : *negative_slope * x_f; + return __float2bfloat16(result); + } else if constexpr (std::is_same_v) { + return x >= 0.0f ? x : *negative_slope * x; + } else if constexpr (std::is_same_v) { + return x >= 0.0 ? x : static_cast(*negative_slope) * x; + } else { + // Fallback + return x >= T(0) ? x : static_cast(*negative_slope) * x; + } + } +} LeakyReLUOp; +} // namespace op::leaky_relu::cuda + +#endif // LEAKY_RELU_CUDA_H \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.h b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.h new file mode 100644 index 000000000..3feb273a0 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.h @@ -0,0 +1,49 @@ +#ifndef LEAKY_RELU_METAX_API_H +#define LEAKY_RELU_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +namespace op::leaky_relu::metax { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _negative_slope; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float negative_slope) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size), + _negative_slope(negative_slope) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_desc, + float negative_slope); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; +} // namespace op::leaky_relu::metax + +#endif // LEAKY_RELU_METAX_API_H \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.maca b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.maca new file mode 100644 index 000000000..441f89e59 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.maca @@ -0,0 +1,77 @@ +#include "leaky_relu_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::leaky_relu::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float negative_slope) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *) + sizeof(float);//device negative_slope + auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + *desc_ptr = new Descriptor( + dtype, + std::move(info), + std::move(device_impl_result.take()), + 
workspace_size, + handle->device, + handle->device_id, + negative_slope); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + const int8_t *d_negative_slope_start = reinterpret_cast(workspace) + workspace_size - sizeof(_negative_slope); + CHECK_METAX(hcMemcpyAsync((void *)d_negative_slope_start, + &_negative_slope, + sizeof(_negative_slope), + hcMemcpyHostToDevice, + reinterpret_cast(stream))); + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LeakyReLUOp, half>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LeakyReLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LeakyReLUOp, float>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LeakyReLUOp, double>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::leaky_relu::metax \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cu b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cu new file mode 100644 index 000000000..01ad07734 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cu @@ -0,0 +1,77 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "leaky_relu_nvidia.cuh" + +namespace op::leaky_relu::nvidia { + +Descriptor::~Descriptor() = default; +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float negative_slope) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create NVIDIA elementwise descriptor + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *) + sizeof(float); // device negative_slope + auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + dtype, + std::move(info), + std::move(device_impl_result.take()), + workspace_size, + handle->device, + handle->device_id, + negative_slope); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + const int8_t *d_negative_slope_start = reinterpret_cast(workspace) + workspace_size - sizeof(_negative_slope); + 
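+    // Workspace layout, as sized in create() above:
+    //     [ elementwise meta | input pointer array | negative_slope (one float) ]
+    // The slope is staged into the last sizeof(float) bytes of the caller-provided
+    // workspace; the async H2D copy below is issued on the same stream as the
+    // kernel launch, so the kernel is ordered after the copy and reads a valid value.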
CHECK_CUDA(cudaMemcpyAsync((void *)d_negative_slope_start, + &_negative_slope, + sizeof(_negative_slope), + cudaMemcpyHostToDevice, + reinterpret_cast(stream))); + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LeakyReLUOp, half>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LeakyReLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LeakyReLUOp, float>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LeakyReLUOp, double>(_info, workspace, output, inputs, stream, reinterpret_cast(d_negative_slope_start)); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::leaky_relu::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cuh b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cuh new file mode 100644 index 000000000..bb3bf3c54 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cuh @@ -0,0 +1,49 @@ +#ifndef LEAKY_RELU_NVIDIA_API_H +#define LEAKY_RELU_NVIDIA_API_H + +#include "../../../../utils/custom_types.h" +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +namespace op::leaky_relu::nvidia { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _negative_slope; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float negative_slope) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size), + _negative_slope(negative_slope) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_desc, + float negative_slope); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; +} // namespace op::leaky_relu::nvidia +#endif // LEAKY_RELU_NVIDIA_API_H \ No newline at end of file diff --git a/src/infiniop/ops/leaky_relu/operator.cc b/src/infiniop/ops/leaky_relu/operator.cc new file mode 100644 index 000000000..0f4ce436e --- /dev/null +++ b/src/infiniop/ops/leaky_relu/operator.cc @@ -0,0 +1,143 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/leaky_relu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/leaky_relu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/leaky_relu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/leaky_relu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLeakyReLUDescriptor( + infiniopHandle_t handle, + infiniopLeakyReLUDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + float negative_slope) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::leaky_relu::NAMESPACE::Descriptor::create( \ 
+ infiniopHandle_t(handle), \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + std::vector{input_desc}, \ + negative_slope) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLeakyReLUWorkspaceSize(infiniopLeakyReLUDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLeakyReLU( + infiniopLeakyReLUDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyLeakyReLUDescriptor(infiniopLeakyReLUDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc new file mode 100644 index 000000000..30dc3bf56 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc @@ -0,0 +1,54 @@ +#include "sigmoid_backward_cpu.h" + +namespace op::sigmoid_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + 
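+    // The public API (operator.cc below) packs the inputs as {input, grad_output},
+    // and the output tensor is grad_input. Since d/dx sigmoid(x) =
+    // sigmoid(x) * (1 - sigmoid(x)), the functor computes
+    //     grad_input = grad_output * sigmoid(input) * (1 - sigmoid(input)),
+    // so all three tensors must share one shape, which is checked below.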
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SigmoidBackwardOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SigmoidBackwardOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<SigmoidBackwardOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<SigmoidBackwardOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sigmoid_backward::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h
new file mode 100644
index 000000000..a581874e9
--- /dev/null
+++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h
@@ -0,0 +1,29 @@
+#ifndef SIGMOID_BACKWARD_CPU_H
+#define SIGMOID_BACKWARD_CPU_H
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+
+ELEMENTWISE_DESCRIPTOR(sigmoid_backward, cpu)
+
+namespace op::sigmoid_backward::cpu {
+typedef struct SigmoidBackwardOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    T operator()(const T &input, const T &grad_output) const {
+        // sigmoid_backward: grad_input = grad_output * sigmoid(input) * (1 - sigmoid(input))
+        T sigmoid_val;
+        if constexpr (std::is_same_v<T, fp16_t>) {
+            sigmoid_val = static_cast<fp16_t>(1.0f / (1.0f + std::exp(-static_cast<float>(input))));
+        } else if constexpr (std::is_same_v<T, bf16_t>) {
+            sigmoid_val = static_cast<bf16_t>(1.0f / (1.0f + std::exp(-static_cast<float>(input))));
+        } else {
+            sigmoid_val = static_cast<T>(1.0) / (static_cast<T>(1.0) + std::exp(-input));
+        }
+        return grad_output * sigmoid_val * (static_cast<T>(1.0) - sigmoid_val);
+    }
+} SigmoidBackwardOp;
+} // namespace op::sigmoid_backward::cpu
+
+#endif // SIGMOID_BACKWARD_CPU_H
diff --git a/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh
new file mode 100644
index 000000000..38ac9607d
--- /dev/null
+++ b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh
@@ -0,0 +1,56 @@
+#ifndef SIGMOID_BACKWARD_CUDA_H
+#define SIGMOID_BACKWARD_CUDA_H
+
+namespace op::sigmoid_backward::cuda {
+typedef struct SigmoidBackwardOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input, const T &grad_output) const {
+        T sigmoid_val;
+
+        if constexpr (std::is_same_v<T, half2>) {
+            // For half2, process each component
+            half2 one = __float2half2_rn(1.0f);
+            half2 neg_input = __hneg2(input);
+            half2 exp_neg_input = h2exp(neg_input);
+            sigmoid_val = __h2div(one, __hadd2(one, exp_neg_input));
+            return __hmul2(__hmul2(grad_output, sigmoid_val), __hsub2(one, sigmoid_val));
+        } else if constexpr (std::is_same_v<T, half>) {
+            half one = __float2half(1.0f);
+            // half neg_input = __hneg(input);
+            // half exp_neg_input = hexp(neg_input);
+            // sigmoid_val = __hdiv(one, __hadd(one, exp_neg_input));
+            sigmoid_val = sigmoid(input);
+            return __hmul(__hmul(grad_output, sigmoid_val), __hsub(one, sigmoid_val));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            cuda_bfloat16 one =
__float2bfloat16(1.0f); + // cuda_bfloat16 neg_input = __hneg(input); + // cuda_bfloat16 exp_neg_input = hexp(neg_input); + // sigmoid_val = __hdiv(one, __hadd(one, exp_neg_input)); + sigmoid_val = sigmoid(input); + return __hmul(__hmul(grad_output, sigmoid_val), __hsub(one, sigmoid_val)); + } else if constexpr (std::is_same_v) { + sigmoid_val = __fdiv_rn(1.0f, __fadd_rn(1.0f, expf(-input))); + return __fmul_rn(__fmul_rn(grad_output, sigmoid_val), __fsub_rn(1.0f, sigmoid_val)); + } else if constexpr (std::is_same_v) { + sigmoid_val = 1.0 / (1.0 + exp(-input)); + return grad_output * sigmoid_val * (1.0 - sigmoid_val); + } else { + // Fallback for other types + sigmoid_val = static_cast(1.0) / (static_cast(1.0) + exp(-input)); + return grad_output * sigmoid_val * (static_cast(1.0) - sigmoid_val); + } + } + +private: + __device__ __forceinline__ half sigmoid(const half &x) const { + return __float2half(__fdiv_rn(1.0f, __fadd_rn(1.0f, expf(__half2float(-x))))); + } + __device__ __forceinline__ cuda_bfloat16 sigmoid(const cuda_bfloat16 &x) const { + return __float2bfloat16(__fdiv_rn(1.0f, __fadd_rn(1.0f, expf(__bfloat162float(-x))))); + } +} SigmoidBackwardOp; +} // namespace op::sigmoid_backward::cuda + +#endif // SIGMOID_BACKWARD_CUDA_H diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h new file mode 100644 index 000000000..412e91e55 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef SIGMOID_BACKWARD_METAX_API_H +#define SIGMOID_BACKWARD_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, metax) + +#endif // SIGMOID_BACKWARD_METAX_API_H diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca new file mode 100644 index 000000000..3f3cf5382 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca @@ -0,0 +1,62 @@ +#include "sigmoid_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sigmoid_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::metax diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu new file mode 100644 index 000000000..6cab4d3f1 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sigmoid_backward_nvidia.cuh" + +namespace op::sigmoid_backward::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::nvidia diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh new file mode 100644 index 000000000..2dd69a167 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef SIGMOID_BACKWARD_NVIDIA_API_H +#define SIGMOID_BACKWARD_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, nvidia) + +#endif // SIGMOID_BACKWARD_NVIDIA_API_H diff --git a/src/infiniop/ops/sigmoid_backward/operator.cc b/src/infiniop/ops/sigmoid_backward/operator.cc new file mode 100644 
index 000000000..ce4258ae6 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sigmoid_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sigmoid_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sigmoid_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sigmoid_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSigmoidBackwardDescriptor( + infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sigmoid_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSigmoidBackward( + infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file 
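The sigmoid-backward entry points above follow the same create / query-workspace / run / destroy lifecycle as every other operator in this patch. A minimal host-side sketch (assuming a valid handle, three matching tensor descriptors, and device buffers already exist; device_alloc and the d_* buffer names are placeholders, not part of this patch):

    infiniopSigmoidBackwardDescriptor_t desc;
    infiniopCreateSigmoidBackwardDescriptor(handle, &desc,
                                            grad_input_desc, input_desc, grad_output_desc);

    size_t workspace_size = 0;
    infiniopGetSigmoidBackwardWorkspaceSize(desc, &workspace_size);
    void *workspace = device_alloc(workspace_size); // placeholder allocator

    infiniopSigmoidBackward(desc, workspace, workspace_size,
                            d_grad_input, d_input, d_grad_output, stream);
    infiniopDestroySigmoidBackwardDescriptor(desc);

Error handling is elided; each call returns an infiniStatus_t that should be checked against INFINI_STATUS_SUCCESS.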
diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.cc b/src/infiniop/ops/sin/cpu/sin_cpu.cc new file mode 100644 index 000000000..3a96798fb --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.cc @@ -0,0 +1,52 @@ +#include "sin_cpu.h" + +namespace op::sin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::cpu \ No newline at end of file diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h new file mode 100644 index 000000000..838fd97ec --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.h @@ -0,0 +1,33 @@ +#ifndef SIN_CPU_H +#define SIN_CPU_H + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(sin, cpu) + +namespace op::sin::cpu { +typedef struct SinOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + // sin(x) + if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::sin(x_f)); + } else if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::sin(x_f)); + } else if constexpr (std::is_same_v) { + return std::sin(x); + } else if constexpr (std::is_same_v) { + return std::sin(x); + } else { + return std::sin(x); + } + } +} SinOp; +} // namespace op::sin::cpu + +#endif // SIN_CPU_H \ No newline at end of file diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh new file mode 100644 index 000000000..d052e7d2b --- /dev/null +++ b/src/infiniop/ops/sin/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef SIN_CUDA_H +#define SIN_CUDA_H + +namespace op::sin::cuda { +typedef struct SinOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // For half2, process each half separately + half x_low = __low2half(x); + half x_high = __high2half(x); + half sin_low = hsin(x_low); + half sin_high = hsin(x_high); + return __halves2half2(sin_low, sin_high); + } else if constexpr (std::is_same_v) { + // Use CUDA half sin function + return hsin(x); + } else if constexpr (std::is_same_v) { + // Convert to float for computation to maintain precision + float x_f = __bfloat162float(x); + float result 
= sinf(x_f); + return __float2bfloat16(result); + } else if constexpr (std::is_same_v) { + // Use fast math functions for float + return sinf(x); + } else if constexpr (std::is_same_v) { + return ::sin(x); + } else { + // Fallback + return sinf(x); + } + } +} SinOp; +} // namespace op::sin::cuda + +#endif // SIN_CUDA_H \ No newline at end of file diff --git a/src/infiniop/ops/sin/metax/sin_metax.h b/src/infiniop/ops/sin/metax/sin_metax.h new file mode 100644 index 000000000..2b744fc60 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.h @@ -0,0 +1,8 @@ +#ifndef SIN_METAX_API_H +#define SIN_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sin, metax) + +#endif // SIN_METAX_API_H diff --git a/src/infiniop/ops/sin/metax/sin_metax.maca b/src/infiniop/ops/sin/metax/sin_metax.maca new file mode 100644 index 000000000..5700c791b --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.maca @@ -0,0 +1,60 @@ +#include "sin_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sin::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::metax diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu new file mode 100644 index 000000000..4676e3290 --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sin_nvidia.cuh" + +namespace op::sin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = 
input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::nvidia diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh new file mode 100644 index 000000000..93debcaf6 --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef SIN_NVIDIA_API_H +#define SIN_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sin, nvidia) + +#endif // SIN_NVIDIA_API_H diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc new file mode 100644 index 000000000..5f1873f3d --- /dev/null +++ b/src/infiniop/ops/sin/operator.cc @@ -0,0 +1,141 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sin_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSinDescriptor( + infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef 
GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSin( + infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.cc b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc new file mode 100644 index 000000000..4ce419477 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc @@ -0,0 +1,52 @@ +#include "tanh_cpu.h" + +namespace op::tanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::cpu \ No newline at end of file diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.h b/src/infiniop/ops/tanh/cpu/tanh_cpu.h new file mode 100644 index 000000000..86592b666 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.h @@ -0,0 +1,34 @@ +#ifndef TANH_CPU_H +#define TANH_CPU_H + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(tanh, cpu) + +namespace op::tanh::cpu { +typedef struct TanhOp { +public: + static constexpr 
size_t num_inputs = 1; + template + T operator()(const T &x) const { + // tanh(x) = (exp(2*x) - 1) / (exp(2*x) + 1) + // or more stable: tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) + if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::tanh(x_f)); + } else if constexpr (std::is_same_v) { + float x_f = static_cast(x); + return static_cast(std::tanh(x_f)); + } else if constexpr (std::is_same_v) { + return std::tanh(x); + } else if constexpr (std::is_same_v) { + return std::tanh(x); + } else { + return std::tanh(x); + } + } +} TanhOp; +} // namespace op::tanh::cpu + +#endif // TANH_CPU_H \ No newline at end of file diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh new file mode 100644 index 000000000..433ae7d68 --- /dev/null +++ b/src/infiniop/ops/tanh/cuda/kernel.cuh @@ -0,0 +1,44 @@ +#ifndef TANH_CUDA_H +#define TANH_CUDA_H + +namespace op::tanh::cuda { +typedef struct TanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // Use CUDA intrinsic for half precision + return htanh(x); + } else if constexpr (std::is_same_v) { + // Convert to float for computation to maintain precision + float x_f = __bfloat162float(x); + float result = tanhf(x_f); + return __float2bfloat16(result); + } else if constexpr (std::is_same_v) { + // Use fast math functions for float + return tanhf(x); + // return static_cast(::tanhf(static_cast(x))); + } else if constexpr (std::is_same_v) { + return ::tanh(x); + } else { + // Fallback + return tanhf(x); + } + } + +private: + // Helper function for half precision tanh (assuming it exists or can be approximated) + __device__ __forceinline__ half htanh(const half &x) const { + return __float2half(tanhf(__half2float(x))); + } + + // Helper function for bfloat16 precision tanh (assuming it exists or can be approximated) + __device__ __forceinline__ cuda_bfloat16 htanh(const cuda_bfloat16 &x) const { + return __float2bfloat16(tanhf(__bfloat162float(x))); + } +} TanhOp; +} // namespace op::tanh::cuda + +#endif // TANH_CUDA_H \ No newline at end of file diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.h b/src/infiniop/ops/tanh/metax/tanh_metax.h new file mode 100644 index 000000000..c05b4ec26 --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.h @@ -0,0 +1,8 @@ +#ifndef TANH_METAX_API_H +#define TANH_METAX_API_H + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tanh, metax) + +#endif // TANH_METAX_API_H diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.maca b/src/infiniop/ops/tanh/metax/tanh_metax.maca new file mode 100644 index 000000000..cf0756f6b --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.maca @@ -0,0 +1,60 @@ +#include "tanh_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::tanh::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, 
input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::metax \ No newline at end of file diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu new file mode 100644 index 000000000..b93164765 --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanh_nvidia.cuh" + +namespace op::tanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &out_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh new file mode 100644 index 000000000..8c1acb30b --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef TANH_NVIDIA_API_H +#define TANH_NVIDIA_API_H + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanh, nvidia) 
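+
+// Note: ELEMENTWISE_DESCRIPTOR(op, backend) declares the op::tanh::nvidia::Descriptor
+// boilerplate (create / workspaceSize / calculate) shared by parameter-free
+// elementwise ops; compare the hand-written Descriptor in leaky_relu_nvidia.cuh,
+// which skips the macro only because it must also carry negative_slope.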
diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh
new file mode 100644
index 000000000..8c1acb30b
--- /dev/null
+++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef TANH_NVIDIA_API_H
+#define TANH_NVIDIA_API_H
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(tanh, nvidia)
+
+#endif // TANH_NVIDIA_API_H
diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc
new file mode 100644
index 000000000..bbdb19c29
--- /dev/null
+++ b/src/infiniop/ops/tanh/operator.cc
@@ -0,0 +1,141 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/tanh.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/tanh_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/tanh_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/tanh_metax.h"
+#endif
+
+__C infiniStatus_t infiniopCreateTanhDescriptor(
+    infiniopHandle_t handle,
+    infiniopTanhDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output_desc,
+    infiniopTensorDescriptor_t input_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                              \
+    case CASE:                                                               \
+        return op::tanh::NAMESPACE::Descriptor::create(                      \
+            handle,                                                          \
+            reinterpret_cast<op::tanh::NAMESPACE::Descriptor **>(desc_ptr),  \
+            output_desc,                                                     \
+            {input_desc})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                  \
+    case CASE:                                                                                \
+        *size = reinterpret_cast<op::tanh::NAMESPACE::Descriptor *>(desc)->workspaceSize();   \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopTanh(
+    infiniopTanhDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                               \
+    case CASE:                                                                   \
+        return reinterpret_cast<const op::tanh::NAMESPACE::Descriptor *>(desc)   \
+            ->calculate(workspace, workspace_size, output, {input}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
+        delete reinterpret_cast<op::tanh::NAMESPACE::Descriptor *>(desc);   \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
\ No newline at end of file
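For reviewers, the full lifecycle of the API above is the same as for every other elementwise operator in this repository: create the descriptor, query the workspace, run, destroy. A minimal usage sketch, assuming the handle, the two tensor descriptors and the device buffers are created elsewhere (those steps are outside this diff):

infiniStatus_t run_tanh(infiniopHandle_t handle,
                        infiniopTensorDescriptor_t y_desc,
                        infiniopTensorDescriptor_t x_desc,
                        void *y, const void *x,
                        void *workspace, size_t workspace_cap,
                        void *stream) {
    infiniopTanhDescriptor_t desc;
    infiniStatus_t st = infiniopCreateTanhDescriptor(handle, &desc, y_desc, x_desc);
    if (st != INFINI_STATUS_SUCCESS) {
        return st;
    }
    size_t need = 0;
    st = infiniopGetTanhWorkspaceSize(desc, &need);
    if (st == INFINI_STATUS_SUCCESS) {
        // the descriptor re-checks the size internally and fails with
        // INFINI_STATUS_INSUFFICIENT_WORKSPACE if the buffer is too small
        st = (need <= workspace_cap)
                 ? infiniopTanh(desc, workspace, need, y, x, stream)
                 : INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    infiniopDestroyTanhDescriptor(desc);
    return st;
}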
diff --git a/src/infiniop/ops/where/cpu/where_cpu.cc b/src/infiniop/ops/where/cpu/where_cpu.cc
new file mode 100644
index 000000000..789397c5a
--- /dev/null
+++ b/src/infiniop/ops/where/cpu/where_cpu.cc
@@ -0,0 +1,83 @@
+#include "where_cpu.h"
+
+namespace op::where::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &condition_desc = input_desc_vec.at(0);
+    const auto &a_desc = input_desc_vec.at(1);
+    const auto &b_desc = input_desc_vec.at(2);
+    const auto &out_shape = out_desc->shape();
+    const auto &condition_shape = condition_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    // a, b and the output must share one dtype, drawn from the full set of supported types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16,
+                INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64,
+                INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64,
+                INFINI_DTYPE_BOOL);
+
+    if (a_desc->dtype() != dtype || b_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_SAME_SHAPE(out_shape, condition_shape);
+    CHECK_SAME_SHAPE(out_shape, a_shape);
+    CHECK_SAME_SHAPE(out_shape, b_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<WhereOp, fp16_t, fp16_t, fp16_t, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<WhereOp, float, float, float, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<WhereOp, double, double, double, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<WhereOp, bf16_t, bf16_t, bf16_t, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I8:
+        return _device_info->calculate<WhereOp, int8_t, int8_t, int8_t, int8_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I16:
+        return _device_info->calculate<WhereOp, int16_t, int16_t, int16_t, int16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<WhereOp, int32_t, int32_t, int32_t, int32_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<WhereOp, int64_t, int64_t, int64_t, int64_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_U8:
+        return _device_info->calculate<WhereOp, uint8_t, uint8_t, uint8_t, uint8_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_U16:
+        return _device_info->calculate<WhereOp, uint16_t, uint16_t, uint16_t, uint16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_U32:
+        return _device_info->calculate<WhereOp, uint32_t, uint32_t, uint32_t, uint32_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_U64:
+        return _device_info->calculate<WhereOp, uint64_t, uint64_t, uint64_t, uint64_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BOOL:
+        return _device_info->calculate<WhereOp, bool, bool, bool, bool>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::where::cpu
\ No newline at end of file
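One subtlety in the CPU path that follows: fp16_t and bf16_t are raw bit-pattern structs, so a condition of those types is first reduced to bool by the helpers added in custom_types.cc. A value counts as false only when its exponent and mantissa bits are all zero, which makes both +0.0 and -0.0 false while NaNs and subnormals are truthy. A standalone C++ illustration of that bit test (f16_nonzero is a hypothetical name; the real helpers read the bits from the struct's _v field):

#include <cassert>
#include <cstdint>

// Mirrors _f16_to_bool: drop the sign bit, then test exponent | mantissa.
static bool f16_nonzero(uint16_t bits) {
    return (bits & 0x7FFF) != 0; // 0x7C00 (exponent) | 0x03FF (mantissa)
}

int main() {
    assert(!f16_nonzero(0x0000)); // +0.0 -> false
    assert(!f16_nonzero(0x8000)); // -0.0 -> false
    assert(f16_nonzero(0x3C00));  // 1.0 -> true
    assert(f16_nonzero(0x0001));  // smallest subnormal -> true
    assert(f16_nonzero(0x7E00));  // NaN -> true
    return 0;
}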
diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h
new file mode 100644
index 000000000..c294751a4
--- /dev/null
+++ b/src/infiniop/ops/where/cpu/where_cpu.h
@@ -0,0 +1,31 @@
+#ifndef __WHERE_CPU_H__
+#define __WHERE_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "infiniop/ops/where.h"
+
+ELEMENTWISE_DESCRIPTOR(where, cpu)
+
+namespace op::where::cpu {
+
+typedef struct WhereOp {
+public:
+    static constexpr size_t num_inputs = 3;
+
+    template <typename T, typename Tcond, typename Ta, typename Tb>
+    T operator()(const Tcond &cond, const Ta &a, const Tb &b) const {
+        if constexpr (std::is_same_v<Tcond, fp16_t>) {
+            // fp16 condition: any nonzero exponent/mantissa bits count as true
+            return static_cast<T>(_f16_to_bool(cond) ? a : b);
+        } else if constexpr (std::is_same_v<Tcond, bf16_t>) {
+            // bf16 condition: the same zero test on the bf16 bit pattern
+            return static_cast<T>(_bf16_to_bool(cond) ? a : b);
+        } else {
+            return static_cast<T>(static_cast<bool>(cond) ? a : b);
+        }
+    }
+} WhereOp;
+
+} // namespace op::where::cpu
+
+#endif // __WHERE_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/where/cuda/kernel.cuh b/src/infiniop/ops/where/cuda/kernel.cuh
new file mode 100644
index 000000000..d9744dad2
--- /dev/null
+++ b/src/infiniop/ops/where/cuda/kernel.cuh
@@ -0,0 +1,20 @@
+#ifndef __WHERE_CUDA_H__
+#define __WHERE_CUDA_H__
+
+namespace op::where::cuda {
+
+typedef struct WhereOp {
+public:
+    static constexpr size_t num_inputs = 3;
+
+    template <typename T, typename Tcond, typename Ta, typename Tb>
+    __device__ __forceinline__ T operator()(const Tcond &condition, const Ta &a, const Tb &b) const {
+        // half, cuda_bfloat16 and the scalar types all support an explicit
+        // conversion to bool, so a single cast covers every condition type
+        return static_cast<T>(static_cast<bool>(condition) ? a : b);
+    }
+} WhereOp;
+
+} // namespace op::where::cuda
+
+#endif // __WHERE_CUDA_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/where/metax/where_metax.h b/src/infiniop/ops/where/metax/where_metax.h
new file mode 100644
index 000000000..43bb1a945
--- /dev/null
+++ b/src/infiniop/ops/where/metax/where_metax.h
@@ -0,0 +1,8 @@
+#ifndef __WHERE_METAX_API_H__
+#define __WHERE_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(where, metax)
+
+#endif // __WHERE_METAX_API_H__
diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca
new file mode 100644
index 000000000..b97f1fdc5
--- /dev/null
+++ b/src/infiniop/ops/where/metax/where_metax.maca
@@ -0,0 +1,90 @@
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+#include "where_metax.h"
+
+namespace op::where::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &condition_desc = input_desc_vec.at(0);
+    const auto &a_desc = input_desc_vec.at(1);
+    const auto &b_desc = input_desc_vec.at(2);
+    const auto &out_shape = out_desc->shape();
+    const auto &condition_shape = condition_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    // a, b and the output must share one dtype, drawn from the full set of supported types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16,
+                INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64,
+                INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64,
+                INFINI_DTYPE_BOOL);
+
+    if (a_desc->dtype() != dtype || b_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_SAME_SHAPE(out_shape, condition_shape);
+    CHECK_SAME_SHAPE(out_shape, a_shape);
+    CHECK_SAME_SHAPE(out_shape, b_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::WhereOp, half, half, half, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16, cuda_bfloat16, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::WhereOp, float, float, float, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::WhereOp, double, double, double, double>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I8:
+        return _device_info->calculate<256, cuda::WhereOp, int8_t, int8_t, int8_t, int8_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I16:
+        return _device_info->calculate<256, cuda::WhereOp, int16_t, int16_t, int16_t, int16_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, cuda::WhereOp, int32_t, int32_t, int32_t, int32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, cuda::WhereOp, int64_t, int64_t, int64_t, int64_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U8:
+        return _device_info->calculate<256, cuda::WhereOp, uint8_t, uint8_t, uint8_t, uint8_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U16:
+        return _device_info->calculate<256, cuda::WhereOp, uint16_t, uint16_t, uint16_t, uint16_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U32:
+        return _device_info->calculate<256, cuda::WhereOp, uint32_t, uint32_t, uint32_t, uint32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U64:
+        return _device_info->calculate<256, cuda::WhereOp, uint64_t, uint64_t, uint64_t, uint64_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BOOL:
+        return _device_info->calculate<256, cuda::WhereOp, bool, bool, bool, bool>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::where::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cu b/src/infiniop/ops/where/nvidia/where_nvidia.cu
new file mode 100644
index 000000000..c992663dc
--- /dev/null
+++ b/src/infiniop/ops/where/nvidia/where_nvidia.cu
@@ -0,0 +1,90 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "where_nvidia.cuh"
+
+namespace op::where::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &condition_desc = input_desc_vec.at(0);
+    const auto &a_desc = input_desc_vec.at(1);
+    const auto &b_desc = input_desc_vec.at(2);
+    const auto &out_shape = out_desc->shape();
+    const auto &condition_shape = condition_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    // a, b and the output must share one dtype, drawn from the full set of supported types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16,
+                INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64,
+                INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64,
+                INFINI_DTYPE_BOOL);
+
+    if (a_desc->dtype() != dtype || b_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_SAME_SHAPE(out_shape, condition_shape);
+    CHECK_SAME_SHAPE(out_shape, a_shape);
+    CHECK_SAME_SHAPE(out_shape, b_shape);
+
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::WhereOp, half, half, half, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16, cuda_bfloat16, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::WhereOp, float, float, float, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::WhereOp, double, double, double, double>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I8:
+        return _device_info->calculate<256, cuda::WhereOp, int8_t, int8_t, int8_t, int8_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I16:
+        return _device_info->calculate<256, cuda::WhereOp, int16_t, int16_t, int16_t, int16_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, cuda::WhereOp, int32_t, int32_t, int32_t, int32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, cuda::WhereOp, int64_t, int64_t, int64_t, int64_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U8:
+        return _device_info->calculate<256, cuda::WhereOp, uint8_t, uint8_t, uint8_t, uint8_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U16:
+        return _device_info->calculate<256, cuda::WhereOp, uint16_t, uint16_t, uint16_t, uint16_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U32:
+        return _device_info->calculate<256, cuda::WhereOp, uint32_t, uint32_t, uint32_t, uint32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U64:
+        return _device_info->calculate<256, cuda::WhereOp, uint64_t, uint64_t, uint64_t, uint64_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BOOL:
+        return _device_info->calculate<256, cuda::WhereOp, bool, bool, bool, bool>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::where::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cuh b/src/infiniop/ops/where/nvidia/where_nvidia.cuh
new file mode 100644
index 000000000..c168364a8
--- /dev/null
+++ b/src/infiniop/ops/where/nvidia/where_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __WHERE_CUDA_API_H__
+#define __WHERE_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(where, nvidia)
+
+#endif // __WHERE_CUDA_API_H__
diff --git a/src/infiniop/ops/where/operator.cc b/src/infiniop/ops/where/operator.cc
new file mode 100644
index 000000000..2a8a66923
--- /dev/null
+++ b/src/infiniop/ops/where/operator.cc
@@ -0,0 +1,147 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/where.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/where_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/where_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/where_metax.h"
+#endif
+
+__C infiniStatus_t infiniopCreateWhereDescriptor(
+    infiniopHandle_t handle,
+    infiniopWhereDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t c,
+    infiniopTensorDescriptor_t condition,
+    infiniopTensorDescriptor_t a,
+    infiniopTensorDescriptor_t b) {
+
+#define CREATE(CASE, NAMESPACE)                                               \
+    case CASE:                                                                \
+        return op::where::NAMESPACE::Descriptor::create(                      \
+            handle,                                                           \
+            reinterpret_cast<op::where::NAMESPACE::Descriptor **>(desc_ptr),  \
+            c,                                                                \
+            {condition, a, b})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                   \
+    case CASE:                                                                                 \
+        *size = reinterpret_cast<op::where::NAMESPACE::Descriptor *>(desc)->workspaceSize();   \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopWhere(
+    infiniopWhereDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *condition,
+    const void *a,
+    const void *b,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                    \
+        return reinterpret_cast<const op::where::NAMESPACE::Descriptor *>(desc)   \
+            ->calculate(workspace, workspace_size, c, {condition, a, b}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t
+infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                              \
+    case CASE:                                                               \
+        delete reinterpret_cast<op::where::NAMESPACE::Descriptor *>(desc);   \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
\ No newline at end of file
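For readers new to these dispatch macros: after preprocessing, each branch of the switch is an ordinary case label. The CPU branch of infiniopCreateWhereDescriptor, for example, expands to roughly the following, and the GET, CALCULATE and DELETE macros expand the same way:

case INFINI_DEVICE_CPU:
    return op::where::cpu::Descriptor::create(
        handle,
        reinterpret_cast<op::where::cpu::Descriptor **>(desc_ptr),
        c,
        {condition, a, b});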
diff --git a/src/infinirt/infinirt_impl.h b/src/infinirt/infinirt_impl.h
index 0d6f8cf05..4c41a1198 100644
--- a/src/infinirt/infinirt_impl.h
+++ b/src/infinirt/infinirt_impl.h
@@ -30,7 +30,6 @@ infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) IMPL;
 #define INFINIRT_DEVICE_API_IMPL INFINIRT_DEVICE_API(, )
-#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, \
-                                                     {*count = 0; return INFINI_STATUS_SUCCESS; })
+#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, {*count = 0; return INFINI_STATUS_SUCCESS; })
 
 #endif // __INFINIRT_IMPL_H__
diff --git a/src/utils.h b/src/utils.h
index f4e63be25..e721f05a6 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -108,4 +108,4 @@ inline size_t align(size_t size, size_t alignment) {
 
 } // namespace utils
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/utils/custom_types.cc b/src/utils/custom_types.cc
index 1a6da3c70..a2c1ec538 100644
--- a/src/utils/custom_types.cc
+++ b/src/utils/custom_types.cc
@@ -2,6 +2,14 @@
 #include
 #include
 
+bool _f16_to_bool(fp16_t val) {
+    uint16_t h = val._v;
+    const uint16_t exponent_mask = 0x7C00; // exponent mask (5 bits)
+    const uint16_t mantissa_mask = 0x03FF; // mantissa mask (10 bits)
+    // a value is floating-point zero (of either sign) iff exponent and mantissa are all zero
+    return (h & (exponent_mask | mantissa_mask)) != 0;
+}
+
 float _f16_to_f32(fp16_t val) {
     uint16_t h = val._v;
     uint32_t sign = (h & 0x8000) << 16;
@@ -62,6 +70,13 @@ fp16_t _f32_to_f16(float val) {
     }
 }
 
+bool _bf16_to_bool(bf16_t val) {
+    // extract the exponent and mantissa, ignoring the sign bit
+    const uint16_t exponent_and_mantissa = val._v & 0x7FFF;
+    // all-zero exponent and mantissa means floating-point zero
+    return exponent_and_mantissa != 0;
+}
+
 float _bf16_to_f32(bf16_t val) {
     // place the bf16 bits in the upper 16 bits of the float32 and zero the remaining 16 bits
     uint32_t bits32 = static_cast<uint32_t>(val._v) << 16;
diff --git a/src/utils/custom_types.h b/src/utils/custom_types.h
index 05a5c2fca..1dfe8ebc2 100644
--- a/src/utils/custom_types.h
+++ b/src/utils/custom_types.h
@@ -13,9 +13,11 @@ struct CustomBFloat16 {
 };
 typedef struct CustomBFloat16 bf16_t;
 
+bool _f16_to_bool(fp16_t val);
 float _f16_to_f32(fp16_t val);
 fp16_t _f32_to_f16(float val);
 
+bool _bf16_to_bool(bf16_t val);
 float _bf16_to_f32(bf16_t val);
 bf16_t _f32_to_bf16(float val);
 
@@ -25,6 +27,14 @@ template <typename TypeTo, typename TypeFrom>
 TypeTo cast(TypeFrom val) {
     if constexpr (std::is_same<TypeTo, TypeFrom>::value) {
         return val;
+    } else if constexpr (std::is_same<TypeFrom, bf16_t>::value && std::is_same<TypeTo, fp16_t>::value) {
+        return _f32_to_f16(_bf16_to_f32(val));
+    } else if constexpr (std::is_same<TypeFrom, fp16_t>::value && std::is_same<TypeTo, bf16_t>::value) {
+        return _f32_to_bf16(_f16_to_f32(val));
+    } else if constexpr (std::is_same<TypeFrom, bf16_t>::value && std::is_same<TypeTo, bool>::value) {
+        return static_cast<TypeTo>(_bf16_to_bool(val));
+    } else if constexpr (std::is_same<TypeFrom, fp16_t>::value && std::is_same<TypeTo, bool>::value) {
+        return static_cast<TypeTo>(_f16_to_bool(val));
     } else if constexpr (std::is_same<TypeTo, fp16_t>::value && std::is_same<TypeFrom, float>::value) {
         return _f32_to_f16(val);
     } else if constexpr (std::is_same<TypeTo, fp16_t>::value && !std::is_same<TypeFrom, float>::value) {
diff --git a/test/infiniop-test/test_generate/__init__.py b/test/infiniop-test/test_generate/__init__.py
index a61f63f7c..8db1e6755 100644
--- a/test/infiniop-test/test_generate/__init__.py
+++ b/test/infiniop-test/test_generate/__init__.py
@@ -1 +1,8 @@
-from .infiniop_test import 
InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .infiniop_test import ( + InfiniopTestCase, + InfiniopTestWriter, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) diff --git a/test/infiniop-test/test_generate/testcases/add.py b/test/infiniop-test/test_generate/testcases/add.py index b04ba2042..052ef18a7 100644 --- a/test/infiniop-test/test_generate/testcases/add.py +++ b/test/infiniop-test/test_generate/testcases/add.py @@ -4,7 +4,14 @@ from typing import List from numpy.lib.stride_tricks import as_strided -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def add( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("add") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_c is not None: test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) @@ -114,7 +129,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_c=stride_c, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() - \ No newline at end of file diff --git a/test/infiniop-test/test_generate/testcases/causal_softmax.py b/test/infiniop-test/test_generate/testcases/causal_softmax.py index 74c3efcf0..037701865 100644 --- a/test/infiniop-test/test_generate/testcases/causal_softmax.py +++ b/test/infiniop-test/test_generate/testcases/causal_softmax.py @@ -4,7 +4,13 @@ from typing import List from enum import Enum, auto -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def causal_softmax(x): @@ -37,8 +43,8 @@ def __init__( super().__init__("causal_softmax") self.x = x self.y = y - self.shape_x=shape_x - self.shape_y=shape_y + self.shape_x = shape_x + self.shape_y = shape_y self.stride_x = stride_x self.stride_y = stride_y @@ -49,10 +55,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_y is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("x"), @@ -102,6 +116,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_y, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/clip.py b/test/infiniop-test/test_generate/testcases/clip.py index f08a59929..786153197 100644 --- a/test/infiniop-test/test_generate/testcases/clip.py +++ b/test/infiniop-test/test_generate/testcases/clip.py @@ -2,7 +2,13 @@ import gguf from typing import List, Optional, Tuple -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def clip( @@ -35,7 +41,7 @@ def random_tensor(shape, dtype): Returns: Random tensor with the specified shape and dtype """ - return (np.random.rand(*shape).astype(dtype) * 4.0 - 2.0) + return np.random.rand(*shape).astype(dtype) * 4.0 - 2.0 class ClipTestCase(InfiniopTestCase): @@ -52,7 +58,7 @@ def __init__( max_val: np.ndarray, max_stride: Optional[List[int]], y: np.ndarray, - y_shape: Optional[List[int]], + y_shape: Optional[List[int]], y_stride: Optional[List[int]], ): super().__init__("clip") @@ -63,7 +69,7 @@ def __init__( self.max_val = max_val self.max_stride = max_stride self.y = y - self.y_shape=y_shape + self.y_shape = y_shape self.y_stride = y_stride def write_test(self, test_writer: "InfiniopTestWriter"): @@ -71,57 +77,64 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Add strides as arrays if they exist if self.x_stride is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride) + ) if self.min_stride is not None: - test_writer.add_array(test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride)) + test_writer.add_array( + test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride) + ) if self.max_stride is not None: - test_writer.add_array(test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride)) + test_writer.add_array( + test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride) + ) if self.y_shape is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.y_shape) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.y_stride 
if self.y_stride is not None else contiguous_gguf_strides(self.y_shape)) + gguf_strides( + *( + self.y_stride + if self.y_stride is not None + else contiguous_gguf_strides(self.y_shape) + ) + ), ) # Add tensors to the test test_writer.add_tensor( - test_writer.gguf_key("x"), - self.x, - raw_dtype=np_dtype_to_ggml(self.x.dtype) + test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) ) test_writer.add_tensor( test_writer.gguf_key("min_val"), self.min_val, - raw_dtype=np_dtype_to_ggml(self.min_val.dtype) + raw_dtype=np_dtype_to_ggml(self.min_val.dtype), ) test_writer.add_tensor( test_writer.gguf_key("max_val"), self.max_val, - raw_dtype=np_dtype_to_ggml(self.max_val.dtype) + raw_dtype=np_dtype_to_ggml(self.max_val.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("y"), - self.y, - raw_dtype=np_dtype_to_ggml(self.y.dtype) + test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype) ) # Calculate the expected result ans = clip( self.x.astype(np.float64), self.min_val.astype(np.float64), - self.max_val.astype(np.float64) + self.max_val.astype(np.float64), ) # Add the expected result to the test test_writer.add_tensor( - test_writer.gguf_key("ans"), - ans, - raw_dtype=gguf.GGMLQuantizationType.F64 + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 ) + if __name__ == "__main__": test_writer = InfiniopTestWriter("clip.gguf") @@ -130,23 +143,23 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Test case shapes shapes = [ - (10,), # 1D tensor - (5, 10), # 2D tensor - (2, 3, 4), # 3D tensor - (7, 13), # Prime dimensions - (1, 1), # Minimum shape - (100, 100), # Large shape - (16, 16, 16), # Large 3D + (10,), # 1D tensor + (5, 10), # 2D tensor + (2, 3, 4), # 3D tensor + (7, 13), # Prime dimensions + (1, 1), # Minimum shape + (100, 100), # Large shape + (16, 16, 16), # Large 3D ] # Test case min/max values min_max_values = [ - (-1.0, 1.0), # Standard range - (0.0, 2.0), # Positive range - (-2.0, 0.0), # Negative range - (-1000.0, 1000.0), # Large range - (-0.001, 0.001), # Small range - (0.0, 0.0), # min=max + (-1.0, 1.0), # Standard range + (0.0, 2.0), # Positive range + (-2.0, 0.0), # Negative range + (-1000.0, 1000.0), # Large range + (-0.001, 0.001), # Small range + (0.0, 0.0), # min=max ] # Data types to test @@ -171,7 +184,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=None, y=y, y_shape=shape, - y_stride=None + y_stride=None, ) ) @@ -199,7 +212,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=row_stride + y_stride=row_stride, ) ) @@ -219,7 +232,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=col_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) @@ -239,7 +252,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) diff --git a/test/infiniop-test/test_generate/testcases/mul.py b/test/infiniop-test/test_generate/testcases/mul.py index 00c427bcb..ad4f6b806 100644 --- a/test/infiniop-test/test_generate/testcases/mul.py +++ b/test/infiniop-test/test_generate/testcases/mul.py @@ -2,30 +2,36 @@ import gguf from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. 
import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
 
-def mul(
-    a: np.ndarray,
-    b: np.ndarray
-):
+
+def mul(a: np.ndarray, b: np.ndarray):
     return np.multiply(a, b)
 
+
 def random_tensor(shape, dtype):
     rate = 1e-3
     var = 0.5 * rate  # values fall in [-5e-4, 5e-4]
     return rate * np.random.rand(*shape).astype(dtype) - var
 
+
 class MulTestCase(InfiniopTestCase):
     def __init__(
         self,
         a: np.ndarray,
-        shape_a: List[int] | None, 
+        shape_a: List[int] | None,
         stride_a: List[int] | None,
         b: np.ndarray,
-        shape_b: List[int] | None, 
+        shape_b: List[int] | None,
         stride_b: List[int] | None,
         c: np.ndarray,
-        shape_c: List[int] | None, 
+        shape_c: List[int] | None,
         stride_c: List[int] | None,
     ):
         super().__init__("mul")
@@ -39,7 +45,6 @@ def __init__(
         self.shape_c = shape_c
         self.stride_c = stride_c
 
-
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
         if self.shape_a is not None:
@@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         if self.shape_c is not None:
             test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
         if self.stride_a is not None:
-            test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
+            test_writer.add_array(
+                test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)
+            )
         if self.stride_b is not None:
-            test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
+            test_writer.add_array(
+                test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)
+            )
         test_writer.add_array(
             test_writer.gguf_key("c.strides"),
-            gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
+            gguf_strides(
+                *(
+                    self.stride_c
+                    if self.stride_c is not None
+                    else contiguous_gguf_strides(self.shape_c)
+                )
+            ),
         )
 
         test_writer.add_tensor(
@@ -68,7 +83,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         )
         a_fp64 = self.a.astype(np.float64)
         b_fp64 = self.b.astype(np.float64)
-        
+
         ans_fp64 = np.multiply(a_fp64, b_fp64)
         ans = mul(self.a, self.b)
         test_writer.add_tensor(
@@ -80,7 +95,8 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             raw_dtype=np_dtype_to_ggml(ans_fp64.dtype),
         )
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     test_writer = InfiniopTestWriter("mul.gguf")
     test_cases = []
 
@@ -96,16 +112,15 @@ if __name__ == "__main__":
         ((2048, 2560), (2560, 1), (1, 2048), (2560, 1)),
         ((4, 48, 64), (64 * 48, 64, 1), (1, 4, 192), None),
         ((4, 48, 64), None, (1, 4, 192), (48 * 64, 64, 1)),
-    ] 
+    ]
 
     _TENSOR_DTYPES_ = [np.float32, np.float16]
-    
+
     for dtype in _TENSOR_DTYPES_:
         for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
             a = random_tensor(shape, dtype)
            b = random_tensor(shape, dtype)
             c = np.empty(tuple(0 for _ in shape), dtype=dtype)
-
             test_cases.append(
                 MulTestCase(
                     a=a,
@@ -118,7 +133,7 @@ if __name__ == "__main__":
                     stride_c=stride_c,
                 )
-            ) 
-        
+            )
+
     test_writer.add_tests(test_cases)
     test_writer.save()
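The rearrange generator that follows derives its test strides from two small Python helpers. For cross-checking, the same computations in C++ (hypothetical helper names mirroring the Python ones; InfiniOp itself does not ship these):

#include <cstddef>
#include <vector>

// Row-major: the innermost dimension has stride 1.
std::vector<std::ptrdiff_t> row_major_strides(const std::vector<size_t> &shape) {
    std::vector<std::ptrdiff_t> strides(shape.size());
    std::ptrdiff_t stride = 1;
    for (size_t i = shape.size(); i-- > 0;) {
        strides[i] = stride;
        stride *= static_cast<std::ptrdiff_t>(shape[i]);
    }
    return strides;
}

// Column-major: the outermost dimension has stride 1.
std::vector<std::ptrdiff_t> column_major_strides(const std::vector<size_t> &shape) {
    std::vector<std::ptrdiff_t> strides(shape.size());
    std::ptrdiff_t stride = 1;
    for (size_t i = 0; i < shape.size(); ++i) {
        strides[i] = stride;
        stride *= static_cast<std::ptrdiff_t>(shape[i]);
    }
    return strides;
}

For shape (4, 6, 64) these yield (384, 64, 1) and (1, 4, 24) respectively, matching the Python helpers below.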
diff --git a/test/infiniop-test/test_generate/testcases/rearrange.py b/test/infiniop-test/test_generate/testcases/rearrange.py
index 9617a1fc0..3d3a0e73b 100644
--- a/test/infiniop-test/test_generate/testcases/rearrange.py
+++ b/test/infiniop-test/test_generate/testcases/rearrange.py
@@ -1,14 +1,21 @@
 import torch
 from typing import List
-from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+from .. import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
+
 
 def row_major_strides(shape):
     """Generate row-major strides for a tensor
-    
+
     Args:
         shape: tensor shape
-    
+
     Returns:
         a list of row-major strides
     """
@@ -19,12 +26,13 @@ def row_major_strides(shape):
         strides.insert(0, stride)
     return strides
 
+
 def column_major_strides(shape):
     """Generate column-major strides for a tensor
-    
+
     Args:
         shape: tensor shape
-    
+
     Returns:
         a list of column-major strides
     """
@@ -35,6 +43,7 @@ def column_major_strides(shape):
         strides.append(stride)
     return strides
 
+
 def rearrange_using_torch(src: torch.Tensor, dst_strides: List[int]) -> torch.Tensor:
     """
     Compute the rearrange result using torch
@@ -66,27 +75,35 @@ def __init__(
         self.shape = shape
         self.src_strides = src_strides
         self.dst_strides = dst_strides
-    
+
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
-        
+
         # write the shape info
         if self.shape is not None:
             test_writer.add_array(test_writer.gguf_key("src.shape"), self.shape)
             test_writer.add_array(test_writer.gguf_key("dst.shape"), self.shape)
-        
+
         # write the stride info
         if self.src_strides is not None:
-            test_writer.add_array(test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides))
+            test_writer.add_array(
+                test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides)
+            )
         test_writer.add_array(
             test_writer.gguf_key("dst.strides"),
-            gguf_strides(*self.dst_strides if self.dst_strides is not None else contiguous_gguf_strides(self.shape))
+            gguf_strides(
+                *(
+                    self.dst_strides
+                    if self.dst_strides is not None
+                    else contiguous_gguf_strides(self.shape)
+                )
+            ),
         )
-        
+
         # convert the torch tensors to numpy for writing to file
         src_numpy = self.src.detach().cpu().numpy()
         dst_numpy = self.dst.detach().cpu().numpy()
-        
+
         # write the tensor data
         test_writer.add_tensor(
             test_writer.gguf_key("src"),
@@ -98,9 +115,13 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             dst_numpy,
             raw_dtype=np_dtype_to_ggml(dst_numpy.dtype),
         )
-        
+
         # compute and write the reference answer
-        dst_strides_for_ans = self.dst_strides if self.dst_strides is not None else list(contiguous_gguf_strides(self.shape))
+        dst_strides_for_ans = (
+            self.dst_strides
+            if self.dst_strides is not None
+            else list(contiguous_gguf_strides(self.shape))
+        )
         ans_torch = rearrange_using_torch(self.src, dst_strides_for_ans)
         ans_numpy = ans_torch.detach().cpu().numpy()
         test_writer.add_tensor(
@@ -109,6 +130,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         raw_dtype=np_dtype_to_ggml(src_numpy.dtype),
     )
 
+
 if __name__ == "__main__":
     test_writer = InfiniopTestWriter("rearrange.gguf")
     test_cases = []
@@ -117,12 +139,20 @@ if __name__ == "__main__":
         # (shape, src_stride, dst_stride)
         ((100, 100), (1, 100), (100, 1)),
         ((4, 4), (1, 4), (4, 1)),
-        ((4, 6, 64), (64, 4*64, 1), (6*64, 64, 1)),
+        ((4, 6, 64), (64, 4 * 64, 1), (6 * 64, 64, 1)),
         ((2000, 2000), (1, 2000), (2000, 1)),
         ((2001, 2001), (1, 2001), (2001, 1)),
         ((2, 2, 2, 4), (16, 8, 4, 1), (16, 8, 1, 2)),
-        ((3, 4, 7, 53, 9), row_major_strides((3, 4, 7, 53, 9)), column_major_strides((3, 4, 7, 53, 9))),
-        ((3, 4, 50, 50, 5, 7), row_major_strides((3, 4, 50, 50, 5, 7)), column_major_strides((3, 4, 50, 50, 5, 7))),
+        (
+            (3, 4, 7, 53, 9),
+            row_major_strides((3, 4, 7, 53, 9)),
+            column_major_strides((3, 4, 7, 53, 9)),
+        ),
+        (
+            (3, 4, 50, 50, 5, 7),
+            row_major_strides((3, 4, 50, 50, 5, 7)),
+            column_major_strides((3, 4, 50, 50, 5, 7)),
+        ),
     ]
 
     _TENSOR_DTYPES_ = [torch.float32, torch.float16]
@@ -132,7 +162,7 @@ if __name__ == "__main__":
         src = torch.rand(*shape, dtype=dtype)
         # allocate the destination tensor with the correct shape
         dst = torch.empty(shape, dtype=dtype)
-        
+
         test_case = RearrangeTestCase(
             src=src,
             dst=dst,
@@ -140,7 +170,7 @@ if __name__ == "__main__":
             src_strides=src_strides,
             dst_strides=dst_strides,
         )
-        test_cases.append(test_case) 
+        test_cases.append(test_case)
 
     test_writer.add_tests(test_cases)
-    test_writer.save() 
+    test_writer.save()
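The rms_norm generator below checks the library against the textbook definition. With x a row of the input along the last axis of size d, w the weight vector and epsilon the stabilizer, each output element is, in LaTeX:

y_i = \frac{x_i}{\sqrt{\tfrac{1}{d}\sum_{j=1}^{d} x_j^{2} + \epsilon}} \, w_i

which is exactly what the numpy reference in the next file computes in float64 before comparison.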
diff --git a/test/infiniop-test/test_generate/testcases/rms_norm.py b/test/infiniop-test/test_generate/testcases/rms_norm.py
index 681ebafc4..9332c090a 100644
--- a/test/infiniop-test/test_generate/testcases/rms_norm.py
+++ b/test/infiniop-test/test_generate/testcases/rms_norm.py
@@ -1,11 +1,19 @@
 import numpy as np
 from typing import List
-from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+from .. import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
+
 
 def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
     return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001
 
+
 def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray:
     """
     Compute the rms_norm result with numpy
@@ -16,13 +24,14 @@ def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray:
     Returns:
         the output tensor, with the same shape as the input
     """
-    squared = x ** 2
+    squared = x**2
     mean = np.mean(squared, axis=-1, keepdims=True)
     rms = np.sqrt(mean + epsilon)
-    
+
     normalized = x / rms
     return normalized * w
 
+
 class RMSNormTestCase(InfiniopTestCase):
     def __init__(
         self,
@@ -40,9 +49,9 @@ def __init__(
         self.y = y
         self.shape = shape
         self.epsilon = epsilon
-        self.x_strides=x_strides
-        self.y_strides=y_strides
-        
+        self.x_strides = x_strides
+        self.y_strides = y_strides
+
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
         test_writer.add_float32(test_writer.gguf_key("epsilon"), self.epsilon)
@@ -50,10 +59,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape)
         test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape)
         if self.x_strides is not None:
-            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides))
+            test_writer.add_array(
+                test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)
+            )
         test_writer.add_array(
             test_writer.gguf_key("y.strides"),
-            gguf_strides(*self.y_strides if self.y_strides is not None else contiguous_gguf_strides(self.shape))
+            gguf_strides(
+                *(
+                    self.y_strides
+                    if self.y_strides is not None
+                    else contiguous_gguf_strides(self.shape)
+                )
+            ),
         )
         test_writer.add_tensor(
             test_writer.gguf_key("x"),
@@ -70,13 +87,16 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             self.y,
             raw_dtype=np_dtype_to_ggml(self.y.dtype),
         )
-        ans = rms_norm(self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon)
+        ans = rms_norm(
+            self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon
+        )
         test_writer.add_tensor(
             test_writer.gguf_key("ans"),
             ans,
             raw_dtype=np_dtype_to_ggml(np.float64),
         )
 
+
 if __name__ == "__main__":
     test_writer = InfiniopTestWriter("rms_norm.gguf")
     test_cases = []
@@ -112,9 +132,9 @@ if __name__ == "__main__":
                 shape=shape,
                 x_strides=x_strides,
                 y_strides=y_strides,
-                epsilon=epsilon
+                epsilon=epsilon,
             )
-            test_cases.append(test_case) 
+            test_cases.append(test_case)
 
     test_writer.add_tests(test_cases)
     test_writer.save()
diff --git a/test/infiniop-test/test_generate/testcases/rope.py b/test/infiniop-test/test_generate/testcases/rope.py
index 85d9685dd..27f5a06db 100644
--- 
a/test/infiniop-test/test_generate/testcases/rope.py +++ b/test/infiniop-test/test_generate/testcases/rope.py @@ -4,11 +4,17 @@ from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def rotary_embedding(t, sin, cos): - dh = t.shape[2] + dh = t.shape[2] assert dh % 2 == 0, "Embedding dimension must be even." t_even = t[..., 0::2] # [seq_len, n_head, dh // 2] @@ -30,7 +36,9 @@ def rotary_embedding(t, sin, cos): def sin_cos_table(pos, dim, theta, dtype): assert dim % 2 == 0, "Embedding dimension must be even." - freqs = 1.0 / (theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim)) + freqs = 1.0 / ( + theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim) + ) angles = np.outer(pos, freqs) @@ -79,19 +87,33 @@ def write_test(self, test_writer: "InfiniopTestWriter"): test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_tensor( - test_writer.gguf_key("pos_ids"), self.pos_ids, raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype) + test_writer.gguf_key("pos_ids"), + self.pos_ids, + raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("sin_table"), self.sin_table, raw_dtype=np_dtype_to_ggml(self.sin_table.dtype) + test_writer.gguf_key("sin_table"), + self.sin_table, + raw_dtype=np_dtype_to_ggml(self.sin_table.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("cos_table"), self.cos_table, raw_dtype=np_dtype_to_ggml(self.cos_table.dtype) + test_writer.gguf_key("cos_table"), + self.cos_table, + raw_dtype=np_dtype_to_ggml(self.cos_table.dtype), ) ans = rotary_embedding( self.x.astype(np.float64), @@ -103,8 +125,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): ) - - if __name__ == "__main__": # ============================================================================== # Configuration (Internal Use Only) @@ -130,7 +150,9 @@ def write_test(self, test_writer: "InfiniopTestWriter"): x = np.random.rand(*shape).astype(dtype) y = np.empty(tuple(0 for _ in shape), dtype=dtype) pos_ids = np.arange(0, x.shape[0], dtype=np.int32) - sin_table, cos_table = sin_cos_table(pos_ids, x.shape[2], theta=1e5, dtype=dtype) + sin_table, cos_table = sin_cos_table( + pos_ids, x.shape[2], theta=1e5, dtype=dtype + ) test_case = RoPETestCase( y=y, x=x, diff --git a/test/infiniop-test/test_generate/testcases/swiglu.py b/test/infiniop-test/test_generate/testcases/swiglu.py index cb692b613..aa3450fed 100644 --- a/test/infiniop-test/test_generate/testcases/swiglu.py +++ b/test/infiniop-test/test_generate/testcases/swiglu.py @@ -2,7 +2,14 @@ import gguf from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def swiglu( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("swiglu") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -47,14 +52,24 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_b is not None: test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b) if self.shape_c is not None: - test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) + test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) diff --git a/test/infiniop/cast.py b/test/infiniop/cast.py new file mode 100644 index 000000000..3065021de --- /dev/null +++ b/test/infiniop/cast.py @@ -0,0 +1,242 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_SHAPES_ = [ + (13, 4), + (13, 4, 4), + (16, 5632), + (4, 4, 5632), + (1024,), + (32, 32), +] + +_TEST_STRIDES_ = [ + None, # Contiguous + # Add some non-contiguous strides for specific shapes +] + +# Define type conversion test matrix +_TYPE_CONVERSIONS_ = [ + # Integer to integer conversions + (InfiniDtype.I32, InfiniDtype.I64), + (InfiniDtype.I64, InfiniDtype.I32), + (InfiniDtype.U32, InfiniDtype.U64), + (InfiniDtype.U64, InfiniDtype.U32), + (InfiniDtype.I32, InfiniDtype.U32), + (InfiniDtype.U32, InfiniDtype.I32), + # Integer to float conversions + (InfiniDtype.I32, InfiniDtype.F32), + (InfiniDtype.I32, InfiniDtype.F64), + (InfiniDtype.I64, InfiniDtype.F32), + (InfiniDtype.I64, InfiniDtype.F64), + (InfiniDtype.U32, InfiniDtype.F32), + (InfiniDtype.U32, InfiniDtype.F64), + (InfiniDtype.U64, InfiniDtype.F32), + (InfiniDtype.U64, InfiniDtype.F64), + # Float to integer conversions + (InfiniDtype.F32, InfiniDtype.I32), + (InfiniDtype.F32, InfiniDtype.I64), + (InfiniDtype.F64, InfiniDtype.I32), + (InfiniDtype.F64, InfiniDtype.I64), + (InfiniDtype.F32, InfiniDtype.U32), + (InfiniDtype.F32, InfiniDtype.U64), + 
(InfiniDtype.F64, InfiniDtype.U32), + (InfiniDtype.F64, InfiniDtype.U64), + # Float to float conversions + (InfiniDtype.F32, InfiniDtype.F64), + (InfiniDtype.F64, InfiniDtype.F32), + (InfiniDtype.F16, InfiniDtype.F32), + (InfiniDtype.F32, InfiniDtype.F16), + (InfiniDtype.F16, InfiniDtype.F64), + (InfiniDtype.F64, InfiniDtype.F16), + (InfiniDtype.BF16, InfiniDtype.F32), + (InfiniDtype.F32, InfiniDtype.BF16), +] + +# Form the test cases +_TEST_CASES = [] +for input_dtype, output_dtype in _TYPE_CONVERSIONS_: + for shape in _TEST_SHAPES_: + for stride in _TEST_STRIDES_: + _TEST_CASES.append((shape, stride, stride, input_dtype, output_dtype)) + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 1e-15, "rtol": 1e-15}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U32: {"atol": 0, "rtol": 0}, + InfiniDtype.U64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cast_pytorch(output, input_tensor): + """Cast using PyTorch""" + output.copy_(input_tensor) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + input_dtype=InfiniDtype.F32, + output_dtype=InfiniDtype.F16, + sync=None, +): + # Create input tensor with appropriate data based on type + if input_dtype in [InfiniDtype.I32, InfiniDtype.I64]: + # Signed integer: use both positive and negative values + input_tensor = TestTensor( + shape, input_stride, input_dtype, device, mode="randint", low=-50, high=50 + ) + elif input_dtype in [InfiniDtype.U32, InfiniDtype.U64]: + # Unsigned integer: use positive values + input_tensor = TestTensor( + shape, input_stride, input_dtype, device, mode="randint", low=0, high=100 + ) + else: + # Float: use random values + input_tensor = TestTensor(shape, input_stride, input_dtype, device) + + output_tensor = TestTensor(shape, output_stride, output_dtype, device, mode="zeros") + + print( + f"Testing Cast on {InfiniDeviceNames[device]} with shape:{shape} " + f"input_stride:{input_stride} output_stride:{output_stride} " + f"input_dtype:{InfiniDtypeNames[input_dtype]} output_dtype:{InfiniDtypeNames[output_dtype]}" + ) + + # Perform PyTorch cast for reference + cast_pytorch(output_tensor.torch_tensor(), input_tensor.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCastDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCastWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_cast(): + check_error( + LIBINFINIOP.infiniopCast( + descriptor, + workspace.data(), + workspace.size(), + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + lib_cast() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, output_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + # For integer types, use exact comparison + if output_dtype in [ 
+ InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, + ]: + assert torch.equal(output_tensor.actual_tensor(), output_tensor.torch_tensor()) + else: + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cast_pytorch(output_tensor.torch_tensor(), input_tensor.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cast(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCastDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + print(f"\033[94mRunning Cast operator tests...\033[0m") + print(f"Total test cases: {len(_TEST_CASES)}") + print(f"Type conversions tested: {len(_TYPE_CONVERSIONS_)}") + print("\nType conversion matrix:") + for i, (input_dtype, output_dtype) in enumerate(_TYPE_CONVERSIONS_): + print( + f" {i+1:2d}. {InfiniDtypeNames[input_dtype]:>6} -> {InfiniDtypeNames[output_dtype]:<6}" + ) + print() + + for device in get_test_devices(args): + print(f"\033[93mTesting on device: {InfiniDeviceNames[device]}\033[0m") + test_operator( + device, test, _TEST_CASES, [] + ) # Empty dtype list since we handle dtypes in test cases + + print("\033[92mAll Cast tests passed!\033[0m") diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py new file mode 100644 index 000000000..8393eb418 --- /dev/null +++ b/test/infiniop/cos.py @@ -0,0 +1,162 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. 
+ # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cos(x): + return torch.cos(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None +): + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = cos(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_cos(): + LIBINFINIOP.infiniopCos( + descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None + ) + + lib_cos() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cos(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mCos Test passed!\033[0m") diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py new file mode 100644 index 000000000..47849d8b5 --- /dev/null +++ b/test/infiniop/exp.py @@ -0,0 +1,162 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + 
get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. + # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(x): + return torch.exp(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None +): + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = exp(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_exp(): + LIBINFINIOP.infiniopExp( + descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None + ) + + lib_exp() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: exp(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mExp Test passed!\033[0m") diff 
--git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py new file mode 100644 index 000000000..05afc92e9 --- /dev/null +++ b/test/infiniop/hardswish.py @@ -0,0 +1,190 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. + # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# HardSwish is piecewise linear (with one division) and numerically stable; the +# tolerances match those used for GeLU and could even be tightened. +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_hswish = torch.nn.Hardswish(inplace=False) + + +def hardswish(x): + """ + Reference HardSwish using PyTorch: + hswish(x) = x * clamp(x + 3, 0, 6) / 6 + """ + return _hswish(x).to(x.dtype) +
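The closed form in the docstring can be checked directly against PyTorch's built-in. A minimal standalone sketch (the helper name manual_hardswish is ours; everything else is public PyTorch API) of the identity the tolerances rely on:

    import torch

    def manual_hardswish(x: torch.Tensor) -> torch.Tensor:
        # hswish(x) = x * clamp(x + 3, 0, 6) / 6
        return x * torch.clamp(x + 3.0, 0.0, 6.0) / 6.0

    x = torch.linspace(-5.0, 5.0, steps=101)
    assert torch.allclose(manual_hardswish(x), torch.nn.functional.hardswish(x))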
+ +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None +): + # Sample inputs from [-2, 2], covering the linear segments and the smooth middle region; widen the range if needed + input_torch_tensor = torch.rand(shape) * 4 - 2 + + input_tensor = TestTensor( + shape, + input_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=input_torch_tensor, + ) + + if inplace == Inplace.INPLACE_INPUT: + output_tensor = input_tensor + else: + output_tensor = TestTensor(shape, None, dtype, device) + + if output_tensor.is_broadcast(): + return + + print( + f"Testing HardSwish on {InfiniDeviceNames[device]} with shape:{shape} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # PyTorch reference + ans = hardswish(input_tensor.torch_tensor()) + + if sync is not None: + sync() + + # Create HardSwish descriptor + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateHardSwishDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + ) + ) + + # Invalidate desc shapes/strides to ensure kernel uses runtime args + for tensor in [input_tensor, output_tensor]: + tensor.destroy_desc() + + # Workspace + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardSwishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_hardswish(): + check_error( + LIBINFINIOP.infiniopHardSwish( + descriptor, + workspace.data(), + workspace.size(), + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Run lib op + lib_hardswish() + + # Verify + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output_tensor.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(output_tensor.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling (optional) + if PROFILE: + profile_operation( + "PyTorch", + lambda: hardswish(input_tensor.torch_tensor()), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyHardSwishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mHardSwish test passed!\033[0m")
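The next file adds the LeakyReLU test. As a reminder of the function under test, a small sketch of the identity its reference wrapper relies on (plain PyTorch, no assumptions beyond public APIs):

    import torch

    x = torch.randn(8)
    slope = 0.1
    # LeakyReLU: x where x >= 0, negative_slope * x otherwise
    manual = torch.where(x >= 0, x, slope * x)
    assert torch.allclose(manual, torch.nn.functional.leaky_relu(x, negative_slope=slope))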
diff --git a/test/infiniop/leaky_relu.py b/test/infiniop/leaky_relu.py new file mode 100644 index 000000000..57de370b9 --- /dev/null +++ b/test/infiniop/leaky_relu.py @@ -0,0 +1,340 @@ +import ctypes +from ctypes import c_uint64, c_float +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, negative_slope, inplace + # TODO: Uncomment the following line.
+ # ((), 0.01), + ((1, 3), 0.01), + ((3, 3), 0.1), + ((32, 20, 512), 0.2), + ((33, 333, 333), 0.01), + ((32, 256, 112, 112), 0.1), + ((3, 3, 13, 9, 17), 0.02), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def leaky_relu(x, negative_slope=0.01): + return torch.nn.functional.leaky_relu(x, negative_slope=negative_slope).to(x.dtype) + + +def test( + handle, + device, + shape, + negative_slope, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + # Generate test data with both positive and negative values to exercise both branches of LeakyReLU + input_torch_tensor = torch.randn(shape) * 2 # standard normal scaled by 2, so values mostly fall in [-4, 4] + + input_tensor = TestTensor( + shape, + input_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=input_torch_tensor, + ) + + if inplace == Inplace.INPLACE_INPUT: + output = input_tensor + else: + output = TestTensor(shape, None, dtype, device) + + if output.is_broadcast(): + return + + print( + f"Testing LeakyReLU on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} negative_slope:{negative_slope} inplace: {inplace}" + ) + + ans = leaky_relu(input_tensor.torch_tensor(), negative_slope) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLeakyReLUDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input_tensor.descriptor, + c_float(negative_slope), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLeakyReLUWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_leaky_relu(): + check_error( + LIBINFINIOP.infiniopLeakyReLU( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input_tensor.data(), + None, + ) + ) + + lib_leaky_relu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: leaky_relu(input_tensor.torch_tensor(), negative_slope), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_leaky_relu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyLeakyReLUDescriptor(descriptor))
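Note that negative_slope crosses the C boundary as a 32-bit float (see the c_float argtype registered in op_register.py below), so the kernel sees the float32 rounding of the Python double. A quick sketch of how small that rounding is, using only standard ctypes:

    from ctypes import c_float

    slope = 0.01
    rounded = c_float(slope).value      # what the C API actually receives
    assert rounded != slope             # 0.01 is not exactly representable in float32
    assert abs(rounded - slope) < 1e-9  # but the error is far below the test tolerances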
+ + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mLeakyReLU Test passed!\033[0m")
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..363d7a6d7 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -454,6 +454,7 @@ def swiglu_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def conv_(lib): lib.infiniopCreateConvDescriptor.restype = c_int32 @@ -489,3 +490,216 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def cast_(lib): + lib.infiniopCreateCastDescriptor.restype = c_int32 + lib.infiniopCreateCastDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCastWorkspaceSize.restype = c_int32 + lib.infiniopGetCastWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCast.restype = c_int32 + lib.infiniopCast.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCastDescriptor.restype = c_int32 + lib.infiniopDestroyCastDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def exp_(lib): + lib.infiniopCreateExpDescriptor.restype = c_int32 + lib.infiniopCreateExpDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetExpWorkspaceSize.restype = c_int32 + lib.infiniopGetExpWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopExp.restype = c_int32 + lib.infiniopExp.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyExpDescriptor.restype = c_int32 + lib.infiniopDestroyExpDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def sin_(lib): + lib.infiniopCreateSinDescriptor.restype = c_int32 + lib.infiniopCreateSinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSinWorkspaceSize.restype = c_int32 + lib.infiniopGetSinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSin.restype = c_int32 + lib.infiniopSin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySinDescriptor.restype = c_int32 + lib.infiniopDestroySinDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def cos_(lib): + lib.infiniopCreateCosDescriptor.restype = c_int32 + lib.infiniopCreateCosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCosWorkspaceSize.restype = c_int32 + lib.infiniopGetCosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCos.restype = c_int32 + lib.infiniopCos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCosDescriptor.restype = c_int32 + lib.infiniopDestroyCosDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def tanh_(lib): + lib.infiniopCreateTanhDescriptor.restype = c_int32 + lib.infiniopCreateTanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetTanhWorkspaceSize.restype = c_int32 + lib.infiniopGetTanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopTanh.restype = c_int32 + lib.infiniopTanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyTanhDescriptor.restype = c_int32 + lib.infiniopDestroyTanhDescriptor.argtypes = [infiniopOperatorDescriptor_t]
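These registrations matter because ctypes defaults every foreign function to an int return type and performs no argument conversion checks; declaring restype and argtypes keeps 64-bit pointers and size_t values from being silently truncated. A minimal illustration of the failure mode this guards against, using only standard ctypes against the C runtime (assumes a POSIX system; not InfiniOp-specific):

    import ctypes

    libc = ctypes.CDLL(None)  # handle to the already-loaded C runtime
    # Declare the prototype, exactly as op_register.py does for each operator;
    # without restype = c_void_p, ctypes would truncate the returned pointer to a C int.
    libc.strdup.argtypes = [ctypes.c_char_p]
    libc.strdup.restype = ctypes.c_void_p
    libc.free.argtypes = [ctypes.c_void_p]

    p = libc.strdup(b"infiniop")
    assert ctypes.cast(p, ctypes.c_char_p).value == b"infiniop"
    libc.free(p)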
+ + +@OpRegister.operator +def sigmoid_backward_(lib): + lib.infiniopCreateSigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopCreateSigmoidBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSigmoidBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetSigmoidBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSigmoidBackward.restype = c_int32 + lib.infiniopSigmoidBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopDestroySigmoidBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def hardswish_(lib): + lib.infiniopCreateHardSwishDescriptor.restype = c_int32 + lib.infiniopCreateHardSwishDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetHardSwishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardSwishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHardSwish.restype = c_int32 + lib.infiniopHardSwish.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyHardSwishDescriptor.restype = c_int32 + lib.infiniopDestroyHardSwishDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def where_(lib): + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetWhereWorkspaceSize.restype = c_int32 + lib.infiniopGetWhereWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def leaky_relu_(lib): + lib.infiniopCreateLeakyReLUDescriptor.restype = c_int32 + lib.infiniopCreateLeakyReLUDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + ] + + lib.infiniopGetLeakyReLUWorkspaceSize.restype = c_int32 + lib.infiniopGetLeakyReLUWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLeakyReLU.restype = c_int32 + lib.infiniopLeakyReLU.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLeakyReLUDescriptor.restype = c_int32 + lib.infiniopDestroyLeakyReLUDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ]
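The utils.py change below generates BOOL tensors by filling an F32 buffer and thresholding it at 0.5, since torch.rand cannot produce torch.bool directly. A standalone sketch of that trick (plain PyTorch; variable names are ours):

    import torch

    shape = (3, 4)
    probe = torch.rand(shape)   # uniform floats in [0, 1)
    flags = probe > 0.5         # roughly 50/50 random torch.bool tensor
    assert flags.dtype == torch.bool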
diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index de397a69e..1a8eaf505 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -66,10 +66,33 @@ def __init__( torch_strides.append(strides[i]) else: torch_shape.append(shape[i]) + + is_bool = dt == InfiniDtype.BOOL + if is_bool: + dt = InfiniDtype.F32 + + is_int = ( + dt == InfiniDtype.I8 + or dt == InfiniDtype.I16 + or dt == InfiniDtype.I32 + or dt == InfiniDtype.I64 + ) + if mode == "random": - self._torch_tensor = torch.rand( - torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] - ) + if is_int: + self._torch_tensor = torch.randint( + 0, + 100, + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) + else: + self._torch_tensor = torch.rand( + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) elif mode == "zeros": self._torch_tensor = torch.zeros( torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] @@ -88,6 +111,9 @@ def __init__( else: raise ValueError("Unsupported mode") + if is_bool: + self._torch_tensor = self._torch_tensor > 0.5 + if scale is not None: self._torch_tensor *= scale if bias is not None: @@ -103,6 +132,9 @@ def __init__( def torch_tensor(self): return self._torch_tensor + def update_torch_tensor(self, new_tensor: torch.Tensor): + self._torch_tensor = new_tensor + def actual_tensor(self): return self._data_tensor def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): if dt == InfiniDtype.I8: @@ -140,6 +175,8 @@ ... return torch.float32 elif dt == InfiniDtype.F64: return torch.float64 + elif dt == InfiniDtype.BOOL: + return torch.bool # TODO: These following types may not be supported by older # versions of PyTorch. Use compatability mode to convert them. elif dt == InfiniDtype.U16: @@ -330,6 +367,11 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True): actual = actual.to(torch.float32) desired = desired.to(torch.float32) + + # If either tensor is BOOL, convert both to FP32 before comparing + if actual.dtype == torch.bool or desired.dtype == torch.bool: + actual = actual.to(torch.float32) + desired = desired.to(torch.float32) + print_discrepancy(actual, desired, atol, rtol, equal_nan, verbose) np.testing.assert_allclose( actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True @@ -523,7 +565,7 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS): # Timed execution elapsed = timed_op(lambda: func(), NUM_ITERATIONS, torch_device) - print(f" {desc} time: {elapsed * 1000 :6f} ms") + print(f" {desc} time: {elapsed * 1000:6f} ms") def test_operator(device, test_func, test_cases, tensor_dtypes): @@ -605,9 +647,11 @@ def get_test_devices(args): def get_sync_func(device): import torch - if device == InfiniDeviceEnum.CPU or device == InfiniDeviceEnum.CAMBRICON: + device_str = torch_device_map[device] + + if device == InfiniDeviceEnum.CPU: sync = None else: - sync = getattr(torch, torch_device_map[device]).synchronize + sync = getattr(torch, device_str).synchronize return sync
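get_sync_func now returns torch.<device>.synchronize for every non-CPU backend; timing asynchronous kernels without such a barrier measures only launch overhead. A hedged sketch of the timing pattern this enables (CUDA shown as the example; other backends expose the same attribute through torch_device_map):

    import time
    import torch

    def timed(fn, iters, sync):
        # Synchronize before starting and after finishing so the wall clock
        # covers the kernels themselves, not just their launches.
        if sync is not None:
            sync()
        start = time.perf_counter()
        for _ in range(iters):
            fn()
        if sync is not None:
            sync()
        return (time.perf_counter() - start) / iters

    # Example wiring on CUDA: sync = torch.cuda.synchronize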
diff --git a/test/infiniop/sigmoid_backward.py b/test/infiniop/sigmoid_backward.py new file mode 100644 index 000000000..d387fe1f4 --- /dev/null +++ b/test/infiniop/sigmoid_backward.py @@ -0,0 +1,227 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# Shape/stride cases kept consistent with the ReLU tests +_TEST_CASES_ = [ + # shape, input_stride, grad_output_stride, grad_input_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + INPLACE_GRAD_OUTPUT = auto() + + +# Every case is tested with all three inplace modes +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, + Inplace.INPLACE_GRAD_OUTPUT, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerances for sigmoid backward (slightly looser than ReLU) +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 2e-3, "rtol": 2e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 3e-2, "rtol": 3e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sigmoid_backward(grad_input, input_tensor, grad_output): + """ + PyTorch reference implementation of Sigmoid backward. + + Given: + y = sigmoid(x) = 1 / (1 + exp(-x)) + Then: + dL/dx = dL/dy * y * (1 - y) + """ + s = torch.sigmoid(input_tensor) + result = grad_output * s * (1.0 - s) + + # Copy safely to avoid in-place side effects + with torch.no_grad(): + grad_input.copy_(result)
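The closed form grad_output * s * (1 - s) can be cross-checked against autograd. A minimal sketch, independent of the harness (pure PyTorch):

    import torch

    x = torch.randn(5, requires_grad=True)
    g = torch.randn(5)                  # incoming gradient dL/dy
    torch.sigmoid(x).backward(g)        # autograd computes dL/dx into x.grad
    s = torch.sigmoid(x.detach())
    assert torch.allclose(x.grad, g * s * (1 - s))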
+ +def test( + handle, + device, + shape, + input_stride=None, + grad_output_stride=None, + grad_input_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + # Inputs mix positive and negative values to cover different regions of the sigmoid + input_tensor = TestTensor( + shape, input_stride, dtype, device, mode="random", scale=4.0, bias=-2.0 + ) + grad_output = TestTensor(shape, grad_output_stride, dtype, device, mode="random") + + if inplace == Inplace.INPLACE_INPUT: + if input_stride != grad_input_stride: + return + grad_input = input_tensor + elif inplace == Inplace.INPLACE_GRAD_OUTPUT: + if grad_input_stride != grad_output_stride: + return + grad_input = grad_output + else: + grad_input = TestTensor(shape, grad_input_stride, dtype, device, mode="zeros") + + if grad_input.is_broadcast(): + return + + print( + f"Testing Sigmoid Backward on {InfiniDeviceNames[device]} with shape:{shape} " + f"input_stride:{input_stride} grad_output_stride:{grad_output_stride} grad_input_stride:{grad_input_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # Compute the PyTorch reference result (written into grad_input.torch_tensor()) + sigmoid_backward( + grad_input.torch_tensor(), + input_tensor.torch_tensor(), + grad_output.torch_tensor(), + ) + + if sync is not None: + sync() + + # Create the operator descriptor + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSigmoidBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input.descriptor, + input_tensor.descriptor, + grad_output.descriptor, + ) + ) + + # Invalidate the shape/strides in the descriptors so the kernel relies on runtime arguments + for tensor in [input_tensor, grad_output, grad_input]: + tensor.destroy_desc() + + # workspace + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSigmoidBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input.device) + + def lib_sigmoid_backward(): + check_error( + LIBINFINIOP.infiniopSigmoidBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_input.data(), + input_tensor.data(), + grad_output.data(), + None, + ) + ) + + # Run the library implementation; the result lands in grad_input.actual_tensor() + lib_sigmoid_backward() + + # Verify + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + assert torch.allclose( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling (optional) + if PROFILE: + profile_operation( + "PyTorch", + lambda: sigmoid_backward( + grad_input.torch_tensor(), + input_tensor.torch_tensor(), + grad_output.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", + lambda: lib_sigmoid_backward(), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + + check_error(LIBINFINIOP.infiniopDestroySigmoidBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mSigmoid Backward test passed!\033[0m")
diff --git a/test/infiniop/sin.py b/test/infiniop/sin.py new file mode 100644 index 000000000..9089f1403 --- /dev/null +++ b/test/infiniop/sin.py @@ -0,0 +1,162 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. + # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sin(x): + return torch.sin(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None +): + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sin on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sin(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them
from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sin(): + LIBINFINIOP.infiniopSin( + descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None + ) + + lib_sin() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sin(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mSin Test passed!\033[0m") diff --git a/test/infiniop/tanh.py b/test/infiniop/tanh.py new file mode 100644 index 000000000..67d1a6c01 --- /dev/null +++ b/test/infiniop/tanh.py @@ -0,0 +1,162 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. 
+ # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def tanh(x): + return torch.tanh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None +): + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Tanh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = tanh(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_tanh(): + LIBINFINIOP.infiniopTanh( + descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None + ) + + lib_tanh() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tanh(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tanh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyTanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTanh Test passed!\033[0m") diff --git a/test/infiniop/where.py b/test/infiniop/where.py new file mode 100644 index 000000000..1ed944cff --- /dev/null +++ b/test/infiniop/where.py @@ -0,0 +1,312 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, 
+ InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, condition_stride, a_stride, b_stride, c_stride + # Basic shapes + ((10,), None, None, None, None), + ((5, 10), None, None, None, None), + ((2, 3, 4), None, None, None, None), + ((4, 5, 6), None, None, None, None), + # Different strides + ((10, 10), (10, 1), None, None, None), + ((10, 10), None, (10, 1), None, None), + ((10, 10), None, None, (10, 1), None), + ((10, 10), None, None, None, (10, 1)), + # Odd shapes + ((7, 13), None, None, None, None), # prime dimensions + ((3, 5, 7), None, None, None, None), # 3-D primes + ((11, 17, 23), None, None, None, None), # larger primes + # Non-standard shapes + ((1, 1), None, None, None, None), # smallest shape + ((1, 100), None, None, None, None), # single row + ((100, 1), None, None, None, None), # single column + ((64, 64), None, None, None, None), # powers of two + ((16, 16, 16), None, None, None, None), # 3-D powers of two + # Large shapes + ((100, 100), None, None, None, None), + ((32, 32, 32), None, None, None, None), + # Broadcast cases - currently skipped, kept as potential extensions + ((10,), (0,), None, None, None), # broadcast condition + ((5, 10), None, (0, 1), None, None), # broadcast a + ((5, 10), None, None, (0, 1), None), # broadcast b +] + + +# Float, signed integer, and bool types are all exercised; unsigned integer +# types stay commented out for now. +_TENSOR_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, + InfiniDtype.BF16, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + # InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64, + InfiniDtype.BOOL, +] + + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-6}, + InfiniDtype.F64: {"atol": 1e-15, "rtol": 1e-14}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + InfiniDtype.U16: {"atol": 0, "rtol": 0}, + InfiniDtype.U32: {"atol": 0, "rtol": 0}, + InfiniDtype.U64: {"atol": 0, "rtol": 0}, + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, +} + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +_INPLACE = [ + Inplace.INPLACE_A, + Inplace.INPLACE_B, + Inplace.OUT_OF_PLACE, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def where(c, condition, a, b): + """Where operation: c[i] = condition[i] ? a[i] : b[i]""" + result = torch.where(condition.to(torch.bool), a, b) + c.copy_(result)
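torch.where requires a boolean condition, which is why the reference casts first; any nonzero value counts as true. A small sketch of that semantics (plain PyTorch):

    import torch

    cond = torch.tensor([0.0, 2.5, -1.0])   # nonzero means "take a"
    a = torch.tensor([1, 2, 3])
    b = torch.tensor([10, 20, 30])
    out = torch.where(cond.to(torch.bool), a, b)
    assert out.tolist() == [10, 2, 3]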
a[i] : b[i]""" + result = torch.where(condition.to(torch.bool), a, b) + c.copy_(result) + + +def test( + handle, + device, + shape, + condition_stride=None, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F32, + sync=None, +): + # Create input tensors a and b with specified dtype + # For unsigned integer types, we need to be careful about random generation + if dtype in [InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64]: + # Use a smaller range for unsigned types to avoid overflow + a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=10, bias=0) + b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=10, bias=0) + condition = TestTensor( + shape, condition_stride, dtype, device, mode="random", scale=10, bias=0 + ) + elif dtype in [InfiniDtype.I8, InfiniDtype.I16, InfiniDtype.I32, InfiniDtype.I64]: + # Use a reasonable range for signed integer types + a = TestTensor( + shape, a_stride, dtype, device, mode="random", scale=100, bias=-50 + ) + b = TestTensor( + shape, b_stride, dtype, device, mode="random", scale=100, bias=-50 + ) + condition = TestTensor( + shape, condition_stride, dtype, device, mode="random", scale=100, bias=-50 + ) + else: + # For floating point and bool types, use default random generation + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + condition = TestTensor(shape, condition_stride, dtype, device) + # Handle inplace operations + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if b_stride != c_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + # Skip broadcast cases for now + if ( + c.is_broadcast() + or condition.is_broadcast() + or a.is_broadcast() + or b.is_broadcast() + ): + return + + print( + f"Testing Where on {InfiniDeviceNames[device]} with shape:{shape} " + f"condition_stride:{condition_stride} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # Compute reference result using PyTorch + where( + c.torch_tensor(), condition.torch_tensor(), a.torch_tensor(), b.torch_tensor() + ) + + if sync is not None: + sync() + + # Store expected result before library operation + expected_result = c.torch_tensor().clone() + + # Create descriptor + descriptor = infiniopOperatorDescriptor_t() + print( + a.torch_tensor().dtype, + b.torch_tensor().dtype, + condition.torch_tensor().dtype, + c.torch_tensor().dtype, + ) + check_error( + LIBINFINIOP.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + condition.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Get workspace size + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetWhereWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_where(): + check_error( + LIBINFINIOP.infiniopWhere( + descriptor, + workspace.data() if workspace is not None else None, + workspace_size.value, + c.data(), + condition.data(), + a.data(), + b.data(), + None, + ) + ) + + # Execute library operation + lib_where() + + # Destroy the tensor descriptors + for tensor in [condition, a, b, c]: + tensor.destroy_desc() + + # Check results with better error reporting + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + + # # Always print debug info for failed cases + # print(f"Condition values: 
+ + # Use torch.equal for exact comparison for integer and boolean types + if dtype in [ + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U8, + InfiniDtype.U16, + InfiniDtype.U32, + InfiniDtype.U64, + InfiniDtype.BOOL, + ]: + if not torch.equal(c.actual_tensor(), expected_result): + print(f"Exact comparison failed for {InfiniDtypeNames[dtype]}") + print( + f"Max absolute difference: {torch.max(torch.abs(c.actual_tensor() - expected_result))}" + ) + assert False, f"Results don't match exactly for {InfiniDtypeNames[dtype]}" + else: + if not torch.allclose(c.actual_tensor(), expected_result, atol=atol, rtol=rtol): + print(f"Tolerance comparison failed for {InfiniDtypeNames[dtype]}") + print( + f"Max absolute difference: {torch.max(torch.abs(c.actual_tensor() - expected_result))}" + ) + print(f"Tolerance: atol={atol}, rtol={rtol}") + assert ( + False + ), f"Results don't match within tolerance for {InfiniDtypeNames[dtype]}" + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: where(c.torch_tensor(), condition.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_where(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyWhereDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mWhere test passed!\033[0m")
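All of the new tests follow the same descriptor lifecycle against the C API: create the descriptor, query the workspace size, launch the kernel, destroy the descriptor. As a compact summary of that calling convention, a hedged sketch in ctypes (names mirror the Exp operator added above; run_unary and workspace_factory are hypothetical helpers, not part of the library):

    import ctypes
    from ctypes import c_uint64

    from libinfiniop import LIBINFINIOP, check_error, infiniopOperatorDescriptor_t

    def run_unary(handle, out, inp, workspace_factory):
        """Sketch of the shared lifecycle used by the unary-op tests above."""
        desc = infiniopOperatorDescriptor_t()
        check_error(LIBINFINIOP.infiniopCreateExpDescriptor(
            handle, ctypes.byref(desc), out.descriptor, inp.descriptor))
        size = c_uint64(0)
        check_error(LIBINFINIOP.infiniopGetExpWorkspaceSize(desc, ctypes.byref(size)))
        ws = workspace_factory(size.value)  # allocate a device workspace buffer
        check_error(LIBINFINIOP.infiniopExp(
            desc, ws.data(), ws.size(), out.data(), inp.data(), None))  # None = default stream
        check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(desc))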