Fix dActivation (#1462)
* Ensure that each tensor is seeded differently

Signed-off-by: Przemek Tredak <[email protected]>

* Fix

Signed-off-by: Przemek Tredak <[email protected]>

* Fix

Signed-off-by: Przemek Tredak <[email protected]>

* Fix

Signed-off-by: Przemek Tredak <[email protected]>

* Fix

Signed-off-by: Przemek Tredak <[email protected]>

* Disambiguate (and fix) the C++ unit tests for dact

Signed-off-by: Przemek Tredak <[email protected]>

* Fix tests

Signed-off-by: Przemek Tredak <[email protected]>

* Fix

Signed-off-by: Przemek Tredak <[email protected]>

* Fix

Signed-off-by: Przemek Tredak <[email protected]>

* Fix MXFP8 dbias tests

Signed-off-by: Przemek Tredak <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Przemek Tredak <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
ptrendx and pre-commit-ci[bot] authored Feb 7, 2025
1 parent 6b34c95 commit 2d058d6
Showing 25 changed files with 232 additions and 198 deletions.
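
The first change in the series, "Ensure that each tensor is seeded differently", is what the test diffs below reflect: every Tensor now receives a name as its first constructor argument, presumably so the test helpers can derive a distinct random stream per tensor instead of filling several tensors with identical data. A minimal sketch of how name-based seeding could work (this helper is hypothetical and is not the repository's actual fillUniform):

#include <cstdint>
#include <functional>
#include <random>
#include <string>
#include <vector>

// Hypothetical sketch: hash the tensor's name into a seed so that "input",
// "ograd", etc. are filled with different pseudo-random data in the same test.
std::vector<float> fillUniformByName(const std::string &name, size_t size) {
  const uint64_t seed = std::hash<std::string>{}(name);
  std::mt19937_64 gen(seed);
  std::uniform_real_distribution<float> dist(-2.0f, 1.0f);
  std::vector<float> data(size);
  for (auto &x : data) x = dist(gen);
  return data;
}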
16 changes: 8 additions & 8 deletions tests/cpp/operator/test_act.cu
@@ -116,10 +116,10 @@ void performTest(const size_t N, const size_t H) {
   DType itype = TypeInfo<IType>::dtype;
   DType otype = TypeInfo<OType>::dtype;
 
-  Tensor input({ N, H }, itype);
-  Tensor output({ N, H }, otype);
-  Tensor igrad({ N, H }, itype);
-  Tensor ograd({ N, H }, itype);
+  Tensor input("input", { N, H }, itype);
+  Tensor output("output", { N, H }, otype);
+  Tensor igrad("igrad", { N, H }, itype);
+  Tensor ograd("ograd", { N, H }, itype);
 
   fillUniform(&input);
   fillUniform(&ograd);
@@ -171,10 +171,10 @@ void performTestGLU(const size_t N, const size_t H) {
   DType itype = TypeInfo<IType>::dtype;
   DType otype = TypeInfo<OType>::dtype;
 
-  Tensor input({N, H * 2}, itype);
-  Tensor output({N, H}, otype);
-  Tensor igrad({ N, H * 2 }, itype);
-  Tensor ograd({ N, H }, itype);
+  Tensor input("input", {N, H * 2}, itype);
+  Tensor output("output", {N, H}, otype);
+  Tensor igrad("igrad", { N, H * 2 }, itype);
+  Tensor ograd("ograd", { N, H }, itype);
 
   fillUniform(&input);
   fillUniform(&ograd);
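
Distinct per-tensor data presumably matters most for these dActivation tests: if input and ograd were seeded identically, a reference of the form dact(input) * ograd and a buggy dact(ograd) * input would produce the same numbers, so a swapped-argument bug could go unnoticed. A tiny illustration of that symmetry (dsilu here is only an example derivative, not the repository's helper):

#include <cassert>
#include <cmath>

// Example derivative of SiLU, used only to illustrate the symmetry.
static float dsilu(float x) {
  const float s = 1.0f / (1.0f + std::exp(-x));
  return s + x * s * (1.0f - s);
}

int main() {
  const float x = 0.7f;  // activation input
  const float g = 0.7f;  // incoming gradient, identical to x when seeded the same
  // With identical data, swapping the arguments gives exactly the same result,
  // so a test cannot tell dsilu(x) * g apart from dsilu(g) * x.
  assert(dsilu(x) * g == dsilu(g) * x);
  return 0;
}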
4 changes: 2 additions & 2 deletions tests/cpp/operator/test_cast.cu
@@ -44,8 +44,8 @@ void performTest(const std::vector<size_t>& shape) {
   DType itype = TypeInfo<InputType>::dtype;
   DType otype = TypeInfo<OutputType>::dtype;
 
-  Tensor input(shape, itype);
-  Tensor output_c(shape, otype);
+  Tensor input("input", shape, itype);
+  Tensor output_c("output_c", shape, otype);
 
   std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(full_size);
 
8 changes: 4 additions & 4 deletions tests/cpp/operator/test_cast_dbias.cu
@@ -66,11 +66,11 @@ void performTest(const std::vector<size_t>& shape) {
   const size_t N = first_dimension(shape);
   const size_t H = last_dimension(shape);
 
-  Tensor input(shape, itype);
+  Tensor input("input", shape, itype);
 
-  Tensor output_c(shape, otype);
+  Tensor output_c("output_c", shape, otype);
   // dbias has the same data type with "output grad"
-  Tensor dbias({H}, itype);
+  Tensor dbias("dbias", {H}, itype);
 
   fillUniform(&input);
   setRandomScale(&output_c);
@@ -94,7 +94,7 @@ void performTest(const std::vector<size_t>& shape) {
                       workspace.data(),
                       0);
 
-  workspace = Tensor(workspace.rowwise_shape(), workspace.dtype());
+  workspace = Tensor("workspace", workspace.rowwise_shape(), workspace.dtype());
 
   nvte_quantize_dbias(input.data(),
                       output_c.data(),
28 changes: 14 additions & 14 deletions tests/cpp/operator/test_cast_dbias_dgelu.cu
@@ -25,7 +25,7 @@ namespace {
 
 template <typename IT, typename OT, typename CT>
 void compute_ref_cast_dbias_dgelu(const IT *input,
-                                  const IT *gelu_input,
+                                  const IT *grad,
                                   const CT scale,
                                   OT *output_c,
                                   CT *amax_h,
@@ -39,9 +39,9 @@ void compute_ref_cast_dbias_dgelu(const IT *input,
   for (size_t i = 0; i < N; i++) {
     for (size_t j = 0; j < H; j++) {
       CT in_elt = static_cast<CT>(input[i * H + j]);
-      const CT gelu_in = static_cast<CT>(gelu_input[i * H + j]);
+      const CT in_grad = static_cast<CT>(grad[i * H + j]);
 
-      const CT elt = in_elt * static_cast<float>(dgelu(static_cast<float>(gelu_in)));
+      const CT elt = in_grad * static_cast<float>(dgelu(static_cast<float>(in_elt)));
       const CT elt_abs = std::abs(elt);
 
       // update amax
@@ -74,23 +74,23 @@ void performTest(const std::vector<size_t>& shape) {
   const size_t N = first_dimension(shape);
   const size_t H = last_dimension(shape);
 
-  Tensor input(shape, itype);
-  Tensor gelu_input(shape, itype);
+  Tensor input("input", shape, itype);
+  Tensor grad("grad", shape, itype);
 
-  Tensor output_c(shape, otype);
+  Tensor output_c("output_c", shape, otype);
   // dbias has the same data type with "output grad"
-  Tensor dbias({H}, itype);
+  Tensor dbias("dbias", {H}, itype);
 
   fillUniform(&input);
-  fillUniform(&gelu_input);
+  fillUniform(&grad);
   setRandomScale(&output_c);
 
   std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
   std::unique_ptr<IType[]> ref_output_dbias = std::make_unique<IType[]>(H);
 
   CType ref_amax;
   compute_ref_cast_dbias_dgelu(input.rowwise_cpu_dptr<IType>(),
-                               gelu_input.rowwise_cpu_dptr<IType>(),
+                               grad.rowwise_cpu_dptr<IType>(),
                                output_c.scale(),
                                ref_output_c.get(),
                                &ref_amax,
@@ -99,18 +99,18 @@ void performTest(const std::vector<size_t>& shape) {
 
   Tensor workspace;
 
-  nvte_quantize_dbias_dgelu(input.data(),
-                            gelu_input.data(),
+  nvte_quantize_dbias_dgelu(grad.data(),
+                            input.data(),
                             output_c.data(),
                             dbias.data(),
                             workspace.data(),
                             0);
 
-  workspace = Tensor(workspace.rowwise_shape(), workspace.dtype());
+  workspace = Tensor("workspace", workspace.rowwise_shape(), workspace.dtype());
 
 
-  nvte_quantize_dbias_dgelu(input.data(),
-                            gelu_input.data(),
+  nvte_quantize_dbias_dgelu(grad.data(),
+                            input.data(),
                             output_c.data(),
                             dbias.data(),
                             workspace.data(),
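
Besides the renaming, the reference in this file now applies the chain rule the right way around: each output element is in_grad * dgelu(in_elt), i.e. the incoming gradient times the GELU derivative evaluated at the activation input, and the nvte_quantize_dbias_dgelu call passes the gradient first and the activation input second. A standalone sketch of that elementwise backward step, assuming the common tanh approximation of GELU (the repository's dgelu helper may use a different formula):

#include <cmath>

// GELU'(x) under the tanh approximation (an assumption for this sketch).
static float dgelu_approx(float x) {
  const float k = 0.7978845608028654f;  // sqrt(2/pi)
  const float c = 0.044715f;
  const float t = std::tanh(k * (x + c * x * x * x));
  return 0.5f * (1.0f + t) + 0.5f * x * (1.0f - t * t) * k * (1.0f + 3.0f * c * x * x);
}

// Elementwise backward, mirroring the corrected reference loop above:
// dX = dY * GELU'(X), with dY the incoming gradient and X the activation input.
static float dgelu_backward(float grad, float input) {
  return grad * dgelu_approx(input);
}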
6 changes: 3 additions & 3 deletions tests/cpp/operator/test_cast_gated_swiglu.cu
@@ -72,9 +72,9 @@ void performTest(const std::vector<size_t>& shape) {
   const size_t rows = first_dimension(shape);
   const size_t cols = last_dimension(shape);
 
-  Tensor grad(shape, itype);
-  Tensor input(input_shape, itype);
-  Tensor output_c(input_shape, otype);
+  Tensor grad("grad", shape, itype);
+  Tensor input("input", input_shape, itype);
+  Tensor output_c("output_c", input_shape, otype);
 
   fillUniform(&grad);
   fillUniform(&input);