From fc9b2d7804d5ceaba4ada50bf21b322fef737185 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Mon, 4 Apr 2022 16:58:18 +0100 Subject: [PATCH 01/21] New test cases, bfloat16 builtins/matrix. Signed-off-by: jack.kirk --- SYCL/BFloat16/bfloat16_builtins.cpp | 118 ++++++++++++++++++++++++ SYCL/Matrix/joint_matrix_tensorcore.cpp | 66 ++++++++++--- 2 files changed, 172 insertions(+), 12 deletions(-) create mode 100644 SYCL/BFloat16/bfloat16_builtins.cpp diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp new file mode 100644 index 0000000000..8baad18c61 --- /dev/null +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -0,0 +1,118 @@ +// REQUIRES: cuda +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80 +// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test +// execution once it does. +// RUNx: %t.out + +#include + +#include +#include + +using namespace cl::sycl; +using sycl::ext::oneapi::experimental::bfloat16; + +constexpr int N = 16 * 3; // divisible by all vector sizes +constexpr float bf16_eps = 0.00390625; + +union conv { + float f; + vec u; + uint32_t u2; +}; + +float from_bf16(uint16_t x) { + conv c; + c.u.y() = x; + c.u.x() = 0; + return c.f; +} + +bool check(float a, float b) { + return fabs(2 * (a - b) / (a + b)) > bf16_eps * 2; +} + +#define TEST_BUILTIN_1_SCAL_IMPL(NAME) \ + { \ + buffer a_buf(&a[0], N); \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + auto A = a_buf.get_access(cgh); \ + auto ERR = err_buf.get_access(cgh); \ + cgh.parallel_for(N, [=](id<1> index) { \ + if (check(from_bf16(NAME(bfloat16{A[index]}).raw()), \ + NAME(A[index]))) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_1(NAME) TEST_BUILTIN_1_SCAL_IMPL(NAME) + +#define TEST_BUILTIN_2_SCAL_IMPL(NAME) \ + { \ + buffer a_buf(&a[0], N); \ + buffer b_buf(&b[0], N); \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + auto A = a_buf.get_access(cgh); \ + auto B = b_buf.get_access(cgh); \ + auto ERR = err_buf.get_access(cgh); \ + cgh.parallel_for(N, [=](id<1> index) { \ + if (check( \ + from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]}).raw()), \ + NAME(A[index], B[index]))) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_2(NAME) TEST_BUILTIN_2_SCAL_IMPL(NAME) + +#define TEST_BUILTIN_3_SCAL_IMPL(NAME) \ + { \ + buffer a_buf(&a[0], N); \ + buffer b_buf(&b[0], N); \ + buffer c_buf(&c[0], N); \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + auto A = a_buf.get_access(cgh); \ + auto B = b_buf.get_access(cgh); \ + auto C = c_buf.get_access(cgh); \ + auto ERR = err_buf.get_access(cgh); \ + cgh.parallel_for(N, [=](id<1> index) { \ + if (check(from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \ + bfloat16{C[index]}) \ + .raw()), \ + NAME(A[index], B[index], C[index]))) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_3(NAME) TEST_BUILTIN_3_SCAL_IMPL(NAME) + +int main() { + queue q; + std::vector a(N), b(N), c(N); + int err = 0; + + for (int i = 0; i < N; i++) { + a[i] = (i - N / 2) / (float)N; + b[i] = (N / 2 - i) / (float)N; + c[i] = (float)(3 * i); + } + + TEST_BUILTIN_1(fabs); + TEST_BUILTIN_2(fmin); + TEST_BUILTIN_2(fmax); + TEST_BUILTIN_3(fma); + + return 0; +} diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 8c827d1c11..eb602f99d7 100644 --- 
a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -1,6 +1,9 @@ -// REQUIRES: gpu, cuda +// REQUIRES: cuda -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test +// execution once it does. +// RUNx: %t.out // // Specifying the sm version via the --cuda-gpu-arch flag is necessary // for the Nvidia case. DPC++ JIT compilation is not @@ -11,6 +14,8 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::experimental::bfloat16; +constexpr float bf16_eps = 0.00390625; // Example usage of Nvidia matrix multiply. // Optimizations such as memory paddings for avoiding bank conflicts are not @@ -63,6 +68,10 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { if constexpr (std::is_same::value) { for (int k = 0; k < Big_K; k++) res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]); + } else if constexpr (std::is_same::value) { + for (int k = 0; k < Big_K; k++) + res += + make_fp32(A[m * Big_K + k].raw()) * make_fp32(B[k * Big_N + n].raw()); } else { for (int k = 0; k < Big_K; k++) @@ -105,7 +114,7 @@ void test() { for (int i = 0; i < Big_K * Big_N; i++) { B[i] = make_bf16(0.1f * (i % 10)); } - } else { + } else if constexpr (!std::is_same::value) { for (int i = 0; i < Big_M * Big_K; i++) { A[i] = i % 100; } @@ -114,13 +123,36 @@ void test() { B[i] = i % 100; } } - +{ buffer bufA(A, range<1>(Big_M * Big_K)); buffer bufB(B, range<1>(Big_K * Big_N)); buffer bufC(C, range<1>(Big_M * Big_N)); buffer bufD(D, range<1>(Big_M * Big_N)); queue q; +// currently bfloat16 has to be initialized on device +if constexpr (std::is_same::value) { + q.submit([&](handler &cgh) { + auto accA = bufA.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_M * Big_K), [=](item<1> item) { + auto i = item.get_linear_id(); + accA[i] = 0.1f * (i % 10); + }); + }); + + q.submit([&](handler &cgh) { + auto accB = bufB.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_K * Big_N), [=](item<1> item) { + auto i = item.get_linear_id(); + accB[i] = 0.1f * (i % 10); + }); + }); +} + q.submit([&](handler &cgh) { auto accC = bufC.template get_access(cgh); auto accA = bufA.template get_access(cgh); @@ -136,9 +168,9 @@ void test() { sycl::sub_group sg = item.get_sub_group(); const auto m = item.get_group() - .get_id()[0]; // row id of current submatrix of BIG C matrix + .get_group_id()[0]; // row id of current submatrix of BIG C matrix const auto n = - item.get_group().get_id()[1]; // column id of current + item.get_group().get_group_id()[1]; // column id of current // submatrix of BIG C matrix joint_matrix sub_a; @@ -171,14 +203,20 @@ void test() { }); q.wait(); +} - const auto host_accessor = bufD.template get_access(); - for (int m = 0; m < Big_M; m++) - for (int n = 0; n < Big_N; n++) { - - assert((host_accessor[m * Big_N + n] == +for (int m = 0; m < Big_M; m++) + for (int n = 0; n < Big_N; n++) { + if constexpr (std::is_same::value) { + auto res_device = matrix_ref_mn(m, n, A, B, C); + assert(fabs(2 * (D[m * Big_N + n] - res_device)) / + (D[m * Big_N + n] + res_device) < + bf16_eps * 2); + } else { + assert((D[m * Big_N + n] == matrix_ref_mn(m, n, A, B, C))); } + } }; int main() { @@ -203,10 +241,14 
@@ int main() { test(); - // A/B bf16 + // A/B bf16 using storage type test(); test(); test(); + test(); + test(); + test(); + return 0; }; From e8acad9b11ab0395f39584570c420fbb167b30f5 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 5 Apr 2022 18:14:36 +0100 Subject: [PATCH 02/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 117 ++++++++++++------------ 1 file changed, 57 insertions(+), 60 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index eb602f99d7..cfd1c8fa8a 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -130,28 +130,28 @@ void test() { buffer bufD(D, range<1>(Big_M * Big_N)); queue q; -// currently bfloat16 has to be initialized on device -if constexpr (std::is_same::value) { - q.submit([&](handler &cgh) { - auto accA = bufA.template get_access(cgh); - - cgh.parallel_for>( - range<1>(Big_M * Big_K), [=](item<1> item) { - auto i = item.get_linear_id(); - accA[i] = 0.1f * (i % 10); - }); - }); - - q.submit([&](handler &cgh) { - auto accB = bufB.template get_access(cgh); - - cgh.parallel_for>( - range<1>(Big_K * Big_N), [=](item<1> item) { - auto i = item.get_linear_id(); - accB[i] = 0.1f * (i % 10); - }); - }); -} + // currently bfloat16 has to be initialized on device + if constexpr (std::is_same::value) { + q.submit([&](handler &cgh) { + auto accA = bufA.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_M * Big_K), [=](item<1> item) { + auto i = item.get_linear_id(); + accA[i] = 0.1f * (i % 10); + }); + }); + + q.submit([&](handler &cgh) { + auto accB = bufB.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_K * Big_N), [=](item<1> item) { + auto i = item.get_linear_id(); + accB[i] = 0.1f * (i % 10); + }); + }); + } q.submit([&](handler &cgh) { auto accC = bufC.template get_access(cgh); @@ -162,44 +162,41 @@ if constexpr (std::is_same::value) { range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; - cgh.parallel_for>( - nd_range<2>(GlobalRange, LocalRange), [= - ](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - const auto m = - item.get_group() - .get_group_id()[0]; // row id of current submatrix of BIG C matrix - const auto n = - item.get_group().get_group_id()[1]; // column id of current - // submatrix of BIG C matrix - - joint_matrix sub_a; - - joint_matrix sub_b; - - joint_matrix - sub_c; - - joint_matrix_load( - sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N); - - for (int k = 0; k < Sub_Tiles_K; - k++) // row/col id of current submatrix of BIG A/B matrices - { - joint_matrix_load(sg, sub_a, - accA.get_pointer() + (k * K) + (m * M * Big_K), - Big_K); - - joint_matrix_load(sg, sub_b, - accB.get_pointer() + (k * K * Big_N) + (n * N), - Big_N); - - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); - }); + cgh.parallel_for< + KernelName>(nd_range<2>(GlobalRange, LocalRange), [= + ](nd_item<2> item)[[sycl::reqd_work_group_size(1, 1, 32)]] { + sycl::sub_group sg = item.get_sub_group(); + const auto m = + item.get_group() + .get_group_id()[0]; // row id of current submatrix of BIG C matrix + const auto n = + item.get_group().get_group_id()[1]; // column id of current + // submatrix of BIG C matrix + + joint_matrix sub_a; + + joint_matrix sub_b; + + joint_matrix + sub_c; + + 
joint_matrix_load(sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, + Big_N); + + for (int k = 0; k < Sub_Tiles_K; + k++) // row/col id of current submatrix of BIG A/B matrices + { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); + + joint_matrix_load( + sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); + + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); + }); }); q.wait(); From caed5208c4df6a0d2f704b72411a18e4addd4d45 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 5 Apr 2022 18:25:16 +0100 Subject: [PATCH 03/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 170 ++++++++++++------------ 1 file changed, 86 insertions(+), 84 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index cfd1c8fa8a..fe4bfcab61 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -123,97 +123,99 @@ void test() { B[i] = i % 100; } } -{ - buffer bufA(A, range<1>(Big_M * Big_K)); - buffer bufB(B, range<1>(Big_K * Big_N)); - buffer bufC(C, range<1>(Big_M * Big_N)); - buffer bufD(D, range<1>(Big_M * Big_N)); - - queue q; - // currently bfloat16 has to be initialized on device - if constexpr (std::is_same::value) { - q.submit([&](handler &cgh) { - auto accA = bufA.template get_access(cgh); - - cgh.parallel_for>( - range<1>(Big_M * Big_K), [=](item<1> item) { - auto i = item.get_linear_id(); - accA[i] = 0.1f * (i % 10); - }); - }); + { + buffer bufA(A, range<1>(Big_M * Big_K)); + buffer bufB(B, range<1>(Big_K * Big_N)); + buffer bufC(C, range<1>(Big_M * Big_N)); + buffer bufD(D, range<1>(Big_M * Big_N)); + + queue q; + // currently bfloat16 has to be initialized on device + if constexpr (std::is_same::value) { + q.submit([&](handler &cgh) { + auto accA = bufA.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_M * Big_K), [=](item<1> item) { + auto i = item.get_linear_id(); + accA[i] = 0.1f * (i % 10); + }); + }); + + q.submit([&](handler &cgh) { + auto accB = bufB.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_K * Big_N), [=](item<1> item) { + auto i = item.get_linear_id(); + accB[i] = 0.1f * (i % 10); + }); + }); + } q.submit([&](handler &cgh) { - auto accB = bufB.template get_access(cgh); - - cgh.parallel_for>( - range<1>(Big_K * Big_N), [=](item<1> item) { - auto i = item.get_linear_id(); - accB[i] = 0.1f * (i % 10); - }); + auto accC = bufC.template get_access(cgh); + auto accA = bufA.template get_access(cgh); + auto accB = bufB.template get_access(cgh); + auto accD = bufD.template get_access(cgh); + + range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; + range<2> GlobalRange = {Sub_Tiles_M, + Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; + + cgh.parallel_for< + KernelName>(nd_range<2>(GlobalRange, LocalRange), [= + ](nd_item<2> item)[[sycl::reqd_work_group_size(1, 1, 32)]] { + sycl::sub_group sg = item.get_sub_group(); + const auto m = + item.get_group().get_group_id()[0]; // row id of current submatrix + // of BIG C matrix + const auto n = + item.get_group().get_group_id()[1]; // column id of current + // submatrix of BIG C matrix + + joint_matrix sub_a; + + joint_matrix sub_b; + + joint_matrix + sub_c; + + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (m * M) * Big_N + n * N, Big_N); + + for (int k = 0; k < Sub_Tiles_K; + k++) // row/col id of current submatrix of BIG A/B matrices + { + joint_matrix_load( + sg, 
sub_a, accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); + + joint_matrix_load( + sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); + + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); + }); }); - } - - q.submit([&](handler &cgh) { - auto accC = bufC.template get_access(cgh); - auto accA = bufA.template get_access(cgh); - auto accB = bufB.template get_access(cgh); - auto accD = bufD.template get_access(cgh); - - range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; - range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; - - cgh.parallel_for< - KernelName>(nd_range<2>(GlobalRange, LocalRange), [= - ](nd_item<2> item)[[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - const auto m = - item.get_group() - .get_group_id()[0]; // row id of current submatrix of BIG C matrix - const auto n = - item.get_group().get_group_id()[1]; // column id of current - // submatrix of BIG C matrix - - joint_matrix sub_a; - joint_matrix sub_b; - - joint_matrix - sub_c; - - joint_matrix_load(sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, - Big_N); - - for (int k = 0; k < Sub_Tiles_K; - k++) // row/col id of current submatrix of BIG A/B matrices - { - joint_matrix_load( - sg, sub_a, accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); - - joint_matrix_load( - sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); + q.wait(); + } - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + for (int m = 0; m < Big_M; m++) + for (int n = 0; n < Big_N; n++) { + if constexpr (std::is_same::value) { + auto res_device = matrix_ref_mn(m, n, A, B, C); + assert(fabs(2 * (D[m * Big_N + n] - res_device)) / + (D[m * Big_N + n] + res_device) < + bf16_eps * 2); + } else { + assert((D[m * Big_N + n] == + matrix_ref_mn(m, n, A, B, C))); } - joint_matrix_store(sg, sub_c, - accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); - }); - }); - - q.wait(); -} - -for (int m = 0; m < Big_M; m++) - for (int n = 0; n < Big_N; n++) { - if constexpr (std::is_same::value) { - auto res_device = matrix_ref_mn(m, n, A, B, C); - assert(fabs(2 * (D[m * Big_N + n] - res_device)) / - (D[m * Big_N + n] + res_device) < - bf16_eps * 2); - } else { - assert((D[m * Big_N + n] == - matrix_ref_mn(m, n, A, B, C))); } - } }; int main() { From b178c43ca24056ef5568f2835f4bfa349facf517 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 5 Apr 2022 18:35:46 +0100 Subject: [PATCH 04/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 77 +++++++++++++------------ 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index fe4bfcab61..f3f322aa6a 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -163,43 +163,46 @@ void test() { range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; - cgh.parallel_for< - KernelName>(nd_range<2>(GlobalRange, LocalRange), [= - ](nd_item<2> item)[[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - const auto m = - item.get_group().get_group_id()[0]; // row id of current submatrix - // of BIG C matrix - const auto n = - item.get_group().get_group_id()[1]; // column id of current - // submatrix of BIG C matrix - - joint_matrix sub_a; - - joint_matrix sub_b; - - joint_matrix - sub_c; - - joint_matrix_load(sg, sub_c, - accC.get_pointer() + (m * 
M) * Big_N + n * N, Big_N); - - for (int k = 0; k < Sub_Tiles_K; - k++) // row/col id of current submatrix of BIG A/B matrices - { - joint_matrix_load( - sg, sub_a, accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); - - joint_matrix_load( - sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); - - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } - joint_matrix_store(sg, sub_c, - accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); - }); - }); + cgh.parallel_for>( + nd_range<2>(GlobalRange, LocalRange), + [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { + sycl::sub_group sg = item.get_sub_group(); + const auto m = + item.get_group().get_group_id()[0]; // row id of current + // submatrix of BIG C matrix + const auto n = + item.get_group().get_group_id()[1]; // column id of current + // submatrix of BIG C matrix + + joint_matrix + sub_a; + + joint_matrix + sub_b; + + joint_matrix + sub_c; + + joint_matrix_load( + sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N); + + for (int k = 0; k < Sub_Tiles_K; + k++) // row/col id of current submatrix of BIG A/B matrices + { + joint_matrix_load(sg, sub_a, + accA.get_pointer() + (k * K) + (m * M * Big_K), + Big_K); + + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * K * Big_N) + (n * N), + Big_N); + + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); + }); q.wait(); } From ca241530a9dc33a95597a1da505d26c134b9ac28 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 5 Apr 2022 18:45:35 +0100 Subject: [PATCH 05/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index f3f322aa6a..109ed10ad3 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -204,7 +204,7 @@ void test() { sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); }); - q.wait(); + q.wait(); } for (int m = 0; m < Big_M; m++) @@ -219,7 +219,7 @@ void test() { matrix_ref_mn(m, n, A, B, C))); } } -}; + }; int main() { From 4013bd06edbbb0cb9524a08f918750f6c73f411b Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Thu, 7 Apr 2022 14:58:31 +0100 Subject: [PATCH 06/21] Do compute capability check at runtime. --- SYCL/BFloat16/bfloat16_builtins.cpp | 35 +++++++------ SYCL/Matrix/joint_matrix_tensorcore.cpp | 67 +++++++++++++------------ 2 files changed, 56 insertions(+), 46 deletions(-) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index 8baad18c61..9320895675 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -1,8 +1,6 @@ // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80 -// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test -// execution once it does. -// RUNx: %t.out +// RUN: %t.out #include @@ -100,19 +98,26 @@ bool check(float a, float b) { int main() { queue q; - std::vector a(N), b(N), c(N); - int err = 0; - for (int i = 0; i < N; i++) { - a[i] = (i - N / 2) / (float)N; - b[i] = (N / 2 - i) / (float)N; - c[i] = (float)(3 * i); + auto computeCapability = + std::stof(q.get_device().get_info()); + // TODO check for "ext_oneapi_bfloat16" aspect instead once aspect is + // supported. 
Since this test only covers CUDA the current check is + // functionally equivalent to "ext_oneapi_bfloat16". + if (computeCapability >= 8.0) { + std::vector a(N), b(N), c(N); + int err = 0; + + for (int i = 0; i < N; i++) { + a[i] = (i - N / 2) / (float)N; + b[i] = (N / 2 - i) / (float)N; + c[i] = (float)(3 * i); + } + + TEST_BUILTIN_1(fabs); + TEST_BUILTIN_2(fmin); + TEST_BUILTIN_2(fmax); + TEST_BUILTIN_3(fma); } - - TEST_BUILTIN_1(fabs); - TEST_BUILTIN_2(fmin); - TEST_BUILTIN_2(fmax); - TEST_BUILTIN_3(fma); - return 0; } diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 109ed10ad3..c184a29046 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -1,9 +1,7 @@ // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out -// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test -// execution once it does. -// RUNx: %t.out +// RUN: %t.out // // Specifying the sm version via the --cuda-gpu-arch flag is necessary // for the Nvidia case. DPC++ JIT compilation is not @@ -84,7 +82,7 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { template -void test() { +void test(queue& q) { constexpr auto Big_M = Sub_Tiles_M * @@ -129,7 +127,6 @@ void test() { buffer bufC(C, range<1>(Big_M * Big_N)); buffer bufD(D, range<1>(Big_M * Big_N)); - queue q; // currently bfloat16 has to be initialized on device if constexpr (std::is_same::value) { q.submit([&](handler &cgh) { @@ -203,7 +200,7 @@ void test() { joint_matrix_store( sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); }); - + }); q.wait(); } @@ -223,34 +220,42 @@ void test() { int main() { - // A/B half, Accumulator float - test(); - test(); - test(); - - // A/B/Accumulator half - test(); - test(); - test(); - - test(); - test(); - test(); + queue Q; + auto computeCapability = + std::stof(Q.get_device().get_info()); - test(); - test(); - test(); + if (computeCapability >= 7.0) { + // A/B half, Accumulator float + test(Q); + test(Q); + test(Q); - test(); - - // A/B bf16 using storage type - test(); - test(); - test(); + // A/B/Accumulator half + test(Q); + test(Q); + test(Q); + } + if (computeCapability >= 7.2) { + test(Q); + test(Q); + test(Q); + + test( + Q); + test(Q); + test(Q); + } + if (computeCapability >= 8.0) { + test(Q); - test(); - test(); - test(); + // A/B bfloat16 using storage type + test(Q); + test(Q); + test(Q); + test(Q); + test(Q); + test(Q); + } return 0; }; From e4e7f205a9f3cff493ea000ad38ce30fe838d355 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Thu, 7 Apr 2022 15:09:43 +0100 Subject: [PATCH 07/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index c184a29046..70b6eff513 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -82,7 +82,7 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { template -void test(queue& q) { +void test(queue &q) { constexpr auto Big_M = Sub_Tiles_M * @@ -200,11 +200,11 @@ void test(queue& q) { joint_matrix_store( sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); }); - }); - q.wait(); + }); + q.wait(); } - for (int m = 0; m < Big_M; m++) + for (int m = 0; m < Big_M; m++) { for (int n = 0; n < Big_N; n++) { if 
constexpr (std::is_same::value) { auto res_device = matrix_ref_mn(m, n, A, B, C); @@ -216,7 +216,8 @@ void test(queue& q) { matrix_ref_mn(m, n, A, B, C))); } } - }; + } +}; int main() { From 5c5d65c420efbc44ecb80ad4891bf4502e74c75e Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 12 Apr 2022 15:13:02 +0100 Subject: [PATCH 08/21] tests for bfloat16 marray math fcts --- SYCL/BFloat16/bfloat16_builtins.cpp | 158 ++++++++++++++++++++---- SYCL/Matrix/joint_matrix_tensorcore.cpp | 36 +++--- 2 files changed, 152 insertions(+), 42 deletions(-) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index 9320895675..3c3648d917 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -10,20 +10,14 @@ using namespace cl::sycl; using sycl::ext::oneapi::experimental::bfloat16; -constexpr int N = 16 * 3; // divisible by all vector sizes +constexpr int N = 60; // divisible by all tested array sizes constexpr float bf16_eps = 0.00390625; -union conv { - float f; - vec u; - uint32_t u2; -}; - -float from_bf16(uint16_t x) { - conv c; - c.u.y() = x; - c.u.x() = 0; - return c.f; +float make_fp32(uint16_t x) { + uint32_t y = x; + y = y << 16; + auto res = reinterpret_cast(&y); + return *res; } bool check(float a, float b) { @@ -35,10 +29,11 @@ bool check(float a, float b) { buffer a_buf(&a[0], N); \ buffer err_buf(&err, 1); \ q.submit([&](handler &cgh) { \ - auto A = a_buf.get_access(cgh); \ - auto ERR = err_buf.get_access(cgh); \ + accessor A(a_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ - if (check(from_bf16(NAME(bfloat16{A[index]}).raw()), \ + if (check(make_fp32(NAME(bfloat16{A[index]}).raw()), \ NAME(A[index]))) { \ ERR[0] = 1; \ } \ @@ -47,7 +42,37 @@ bool check(float a, float b) { } \ assert(err == 0); -#define TEST_BUILTIN_1(NAME) TEST_BUILTIN_1_SCAL_IMPL(NAME) +#define TEST_BUILTIN_1_ARR_IMPL(NAME, SZ) \ + { \ + buffer a_buf{range<2>{N / SZ, SZ}}; \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N / SZ, [=](id<1> index) { \ + marray arg; \ + for (int i = 0; i < SZ; i++) { \ + arg[i] = A[index][i]; \ + } \ + marray res = NAME(arg); \ + for (int i = 0; i < SZ; i++) { \ + if (check(make_fp32(res[i].raw()), NAME(A[index][i]))) { \ + ERR[0] = 1; \ + } \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_1(NAME) \ + TEST_BUILTIN_1_SCAL_IMPL(NAME) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 1) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 2) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 3) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 4) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 5) #define TEST_BUILTIN_2_SCAL_IMPL(NAME) \ { \ @@ -55,12 +80,14 @@ bool check(float a, float b) { buffer b_buf(&b[0], N); \ buffer err_buf(&err, 1); \ q.submit([&](handler &cgh) { \ - auto A = a_buf.get_access(cgh); \ - auto B = b_buf.get_access(cgh); \ - auto ERR = err_buf.get_access(cgh); \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ if (check( \ - from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]}).raw()), \ + make_fp32(NAME(bfloat16{A[index]}, bfloat16{B[index]}).raw()), \ NAME(A[index], B[index]))) { \ ERR[0] = 1; \ } \ @@ -69,7 +96,42 @@ bool check(float a, float b) { } \ assert(err == 0); -#define TEST_BUILTIN_2(NAME) TEST_BUILTIN_2_SCAL_IMPL(NAME) +#define TEST_BUILTIN_2_ARR_IMPL(NAME, SZ) \ + { \ + buffer a_buf{range<2>{N / SZ, SZ}}; \ + buffer 
b_buf{range<2>{N / SZ, SZ}}; \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N / SZ, [=](id<1> index) { \ + marray arg0, arg1; \ + for (int i = 0; i < SZ; i++) { \ + arg0[i] = A[index][i]; \ + arg1[i] = B[index][i]; \ + } \ + marray res = NAME(arg0, arg1); \ + for (int i = 0; i < SZ; i++) { \ + if (check(make_fp32(res[i].raw()), \ + NAME(A[index][i], B[index][i]))) { \ + ERR[0] = 1; \ + } \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_2(NAME) \ + TEST_BUILTIN_2_SCAL_IMPL(NAME) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 1) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 2) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 3) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 4) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 5) #define TEST_BUILTIN_3_SCAL_IMPL(NAME) \ { \ @@ -78,12 +140,15 @@ bool check(float a, float b) { buffer c_buf(&c[0], N); \ buffer err_buf(&err, 1); \ q.submit([&](handler &cgh) { \ - auto A = a_buf.get_access(cgh); \ - auto B = b_buf.get_access(cgh); \ - auto C = c_buf.get_access(cgh); \ - auto ERR = err_buf.get_access(cgh); \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor C(c_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ - if (check(from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \ + if (check(make_fp32(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \ bfloat16{C[index]}) \ .raw()), \ NAME(A[index], B[index], C[index]))) { \ @@ -94,7 +159,46 @@ bool check(float a, float b) { } \ assert(err == 0); -#define TEST_BUILTIN_3(NAME) TEST_BUILTIN_3_SCAL_IMPL(NAME) +#define TEST_BUILTIN_3_ARR_IMPL(NAME, SZ) \ + { \ + buffer a_buf{range<2>{N / SZ, SZ}}; \ + buffer b_buf{range<2>{N / SZ, SZ}}; \ + buffer c_buf{range<2>{N / SZ, SZ}}; \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor C(c_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N / SZ, [=](id<1> index) { \ + marray arg0, arg1, arg2; \ + for (int i = 0; i < SZ; i++) { \ + arg0[i] = A[index][i]; \ + arg1[i] = B[index][i]; \ + arg2[i] = C[index][i]; \ + } \ + marray res = NAME(arg0, arg1, arg2); \ + for (int i = 0; i < SZ; i++) { \ + if (check(make_fp32(res[i].raw()), \ + NAME(A[index][i], B[index][i], C[index][i]))) { \ + ERR[0] = 1; \ + } \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_3(NAME) \ + TEST_BUILTIN_3_SCAL_IMPL(NAME) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 1) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 2) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 3) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 4) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 5) int main() { queue q; diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 70b6eff513..f0a92fc11b 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -46,17 +46,17 @@ class TypeHelper; template using KernelName = class TypeHelper; -float make_fp32(short x) { - unsigned int y = x; +float make_fp32(uint16_t x) { + uint32_t y = x; y = y << 16; - float *res = reinterpret_cast(&y); + auto res = reinterpret_cast(&y); return *res; } -unsigned short make_bf16(float x) { - int *res = reinterpret_cast(&x); +uint16_t make_bf16(float x) { + auto res = reinterpret_cast(&x); *res = *res >> 16; - return (unsigned short)*res; + return (uint16_t)*res; } template @@ -130,7 +130,8 @@ void test(queue &q) { // currently bfloat16 has to be 
initialized on device if constexpr (std::is_same::value) { q.submit([&](handler &cgh) { - auto accA = bufA.template get_access(cgh); + accessor + accA(bufA, cgh); cgh.parallel_for>( range<1>(Big_M * Big_K), [=](item<1> item) { @@ -140,7 +141,8 @@ void test(queue &q) { }); q.submit([&](handler &cgh) { - auto accB = bufB.template get_access(cgh); + accessor + accB(bufB, cgh); cgh.parallel_for>( range<1>(Big_K * Big_N), [=](item<1> item) { @@ -151,10 +153,14 @@ void test(queue &q) { } q.submit([&](handler &cgh) { - auto accC = bufC.template get_access(cgh); - auto accA = bufA.template get_access(cgh); - auto accB = bufB.template get_access(cgh); - auto accD = bufD.template get_access(cgh); + accessor + accA(bufA, cgh); + accessor + accB(bufB, cgh); + accessor + accC(bufC, cgh); + accessor + accD(bufD, cgh); range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; range<2> GlobalRange = {Sub_Tiles_M, @@ -162,8 +168,8 @@ void test(queue &q) { cgh.parallel_for>( nd_range<2>(GlobalRange, LocalRange), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); + [=](nd_item<2> item) { + sub_group sg = item.get_sub_group(); const auto m = item.get_group().get_group_id()[0]; // row id of current // submatrix of BIG C matrix @@ -223,7 +229,7 @@ int main() { queue Q; auto computeCapability = - std::stof(Q.get_device().get_info()); + std::stof(Q.get_device().get_info()); if (computeCapability >= 7.0) { // A/B half, Accumulator float From 6856b89066432ed082dc2b8d8e833c2c02b6b576 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 12 Apr 2022 15:40:25 +0100 Subject: [PATCH 09/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index f0a92fc11b..a11e3b3306 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -130,8 +130,8 @@ void test(queue &q) { // currently bfloat16 has to be initialized on device if constexpr (std::is_same::value) { q.submit([&](handler &cgh) { - accessor - accA(bufA, cgh); + accessor accA(bufA, + cgh); cgh.parallel_for>( range<1>(Big_M * Big_K), [=](item<1> item) { @@ -141,8 +141,8 @@ void test(queue &q) { }); q.submit([&](handler &cgh) { - accessor - accB(bufB, cgh); + accessor accB(bufB, + cgh); cgh.parallel_for>( range<1>(Big_K * Big_N), [=](item<1> item) { @@ -153,22 +153,17 @@ void test(queue &q) { } q.submit([&](handler &cgh) { - accessor - accA(bufA, cgh); - accessor - accB(bufB, cgh); - accessor - accC(bufC, cgh); - accessor - accD(bufD, cgh); + accessor accA(bufA, cgh); + accessor accB(bufB, cgh); + accessor accC(bufC, cgh); + accessor accD(bufD, cgh); range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; cgh.parallel_for>( - nd_range<2>(GlobalRange, LocalRange), - [=](nd_item<2> item) { + nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) { sub_group sg = item.get_sub_group(); const auto m = item.get_group().get_group_id()[0]; // row id of current From 4e1d6e46af60671b8bf58b5292af74b02689490d Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Fri, 15 Apr 2022 11:36:46 +0100 Subject: [PATCH 10/21] test case for expected NAN behaviour. 
--- SYCL/BFloat16/bfloat16_builtins.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index 3c3648d917..6150472c3c 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -200,6 +200,26 @@ bool check(float a, float b) { TEST_BUILTIN_3_ARR_IMPL(NAME, 4) \ TEST_BUILTIN_3_ARR_IMPL(NAME, 5) +#define TEST_BUILTIN_2_NAN(NAME) \ + { \ + buffer err_buf(&err, 1); \ + buffer nan_buf(&check_nan, 1); \ + q.submit([&](handler &cgh) { \ + accessor ERR(err_buf, cgh); \ + accessor checkNAN( \ + nan_buf, cgh); \ + cgh.single_task([=]() { \ + checkNAN[0] = make_fp32(NAME(bfloat16{NAN}, bfloat16{NAN}).raw()); \ + if ((make_fp32(NAME(bfloat16{2}, bfloat16{NAN}).raw()) != 2) || \ + (make_fp32(NAME(bfloat16{NAN}, bfloat16{2}).raw()) != 2)) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); \ + assert(std::isnan(check_nan)); + int main() { queue q; @@ -222,6 +242,10 @@ int main() { TEST_BUILTIN_2(fmin); TEST_BUILTIN_2(fmax); TEST_BUILTIN_3(fma); + + float check_nan = 0; + TEST_BUILTIN_2_NAN(fmin); + TEST_BUILTIN_2_NAN(fmax); } return 0; } From 095f7752b208c3ce195c533ddbaeb150bd9c3dfb Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 31 May 2022 10:37:06 +0100 Subject: [PATCH 11/21] Added tests for elem wise ops. Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_all_ops_cuda.cpp | 185 ++++++++++++++++++++++ SYCL/Matrix/joint_matrix_tensorcore.cpp | 11 +- 2 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 SYCL/Matrix/element_wise_all_ops_cuda.cpp diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp new file mode 100644 index 0000000000..7c8887a054 --- /dev/null +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -0,0 +1,185 @@ +//==----------- element_wise_all_ops_cuda.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: cuda + +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %t.out + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::experimental::bfloat16; + +#define SG_SZ 32 +constexpr size_t nWGperDim = 2; + +class Logical {}; + +template +class KernelName; + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void assert_ops_ref(T *C, const float ref) { + for (size_t i = 0; i < M; i++) + for (size_t j = 0; j < N; j++) { + auto diff = C[i + j * M] - ref; + assert(std::fabs(static_cast(diff)) < + std::numeric_limits::epsilon()); + } +} +template +void matrix_verify_op(queue q, big_matrix &C, + nd_range<2> &r, const float ref, Operation Op) { + { + buffer bufC(C.get_data(), range<2>(N * nWGperDim, M * nWGperDim)); + + q.submit([&](handler &cgh) { + accessor accC(bufC, + cgh); + + cgh.parallel_for>( + r, [ accC, + Op ](nd_item<2> spmd_item)[[sycl::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + auto sg = spmd_item.get_sub_group(); + + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix sub_c; + + joint_matrix_fill(sg, sub_a, 3); + joint_matrix_fill(sg, sub_b, 1); + joint_matrix_fill(sg, sub_c, -80); + + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + if constexpr (std::is_same_v) { + if (wi_slice_a[i]) { + if (wi_slice_a[i] > 2.0 || wi_slice_a[i] >= 3.0 || + wi_slice_a[i] < 4.0 || wi_slice_a[i] <= 3.0) { + T val = (wi_slice_a[i] != (2.0)) ? 
wi_slice_a[i] + : static_cast(2.0); + val = ((val) - (1)); + val = ((val) + (1)); + if (wi_slice_a[i] == (2.0)) { + val = ((val) - (2)); + val = ((val) * (3)); + val = ((val) / (2)); + + } else { + val = ((val) + (2)); + } + wi_slice_a[i] = val; + } + } + } else { + wi_slice_a[i] = Op(wi_slice_a[i], 2); + } + } + + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + + joint_matrix_store(sg, sub_c, + accC.get_pointer() + + (sg_startx * M) * (N * nWGperDim) + + sg_starty / SG_SZ * N, + (N * nWGperDim)); + }); // parallel for + }) + .wait(); + } + assert_ops_ref(C.get_data(), ref); +} + +static constexpr size_t MATRIX_M = 16 * nWGperDim; +static constexpr size_t MATRIX_N = 16 * nWGperDim; + +int main() { + + float D[MATRIX_M][MATRIX_N]; + big_matrix MD_f((float *)&D); + + queue q; + auto computeCapability = + std::stof(q.get_device().get_info()); + nd_range<2> r({nWGperDim, nWGperDim * SG_SZ}, {1, 1 * SG_SZ}); + + if (computeCapability >= 7.0) { + matrix_verify_op(q, MD_f, r, 0.0, + std::plus{}); + matrix_verify_op(q, MD_f, r, 0.0, Logical{}); + matrix_verify_op(q, MD_f, r, 16.0, + std::multiplies{}); + matrix_verify_op(q, MD_f, r, -56.0, + std::divides{}); + matrix_verify_op(q, MD_f, r, -64.0, + std::minus{}); + } + + if (computeCapability >= 7.2) { + int32_t D_i[MATRIX_M][MATRIX_N]; + big_matrix MD_i((int32_t *)&D_i); + matrix_verify_op(q, MD_i, r, 0, + std::plus{}); + matrix_verify_op(q, MD_i, r, 16, + std::multiplies{}); + matrix_verify_op(q, MD_i, r, -64, + std::minus{}); + matrix_verify_op(q, MD_i, r, 0, + std::plus{}); + matrix_verify_op(q, MD_i, r, 0.0, Logical{}); + matrix_verify_op(q, MD_i, r, 16, + std::multiplies{}); + matrix_verify_op(q, MD_i, r, -64, + std::minus{}); + } + + if (computeCapability >= 8.0) { + + matrix_verify_op(q, MD_f, r, 0.0, + std::plus{}); + matrix_verify_op(q, MD_f, r, 0.0, Logical{}); + matrix_verify_op(q, MD_f, r, 16.0, + std::multiplies{}); + matrix_verify_op(q, MD_f, r, -56.0, + std::divides{}); + matrix_verify_op(q, MD_f, r, -64.0, + std::minus{}); + + double D_d[MATRIX_M / 2][MATRIX_N / 2]; + big_matrix MD_d((double *)&D_d); + + matrix_verify_op(q, MD_d, r, -60.0, + std::plus{}); + matrix_verify_op(q, MD_d, r, -60.0, Logical{}); + matrix_verify_op(q, MD_d, r, -56.0, + std::multiplies{}); + matrix_verify_op(q, MD_d, r, -74.0, + std::divides{}); + matrix_verify_op(q, MD_d, r, -76.0, + std::minus{}); + } + + return 0; +} diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index a11e3b3306..b41739646d 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -68,8 +68,8 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]); } else if constexpr (std::is_same::value) { for (int k = 0; k < Big_K; k++) - res += - make_fp32(A[m * Big_K + k].raw()) * make_fp32(B[k * Big_N + n].raw()); + res += (make_fp32(A[m * Big_K + k].raw()) * 2 + 1) * + make_fp32(B[k * Big_N + n].raw()); } else { for (int k = 0; k < Big_K; k++) @@ -192,6 +192,13 @@ void test(queue &q) { accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); + if constexpr (std::is_same::value) { + marray b, c; + b = 2; + c = 1; + sub_a.wi_marray = sycl::ext::oneapi::experimental::fma(sub_a.wi_marray, b, c); + } + joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); From 5347f7ef2f7725b2a7e9a40e67cc294278d58d36 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 31 May 2022 11:10:28 +0100 Subject: [PATCH 
12/21] format Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_all_ops_cuda.cpp | 7 +++---- SYCL/Matrix/joint_matrix_tensorcore.cpp | 3 ++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp index 7c8887a054..13b40d5417 100644 --- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -55,8 +55,8 @@ void matrix_verify_op(queue q, big_matrix &C, cgh); cgh.parallel_for>( - r, [ accC, - Op ](nd_item<2> spmd_item)[[sycl::reqd_sub_group_size(SG_SZ)]] { + r, [accC, + Op](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] { const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); const auto sg_startx = global_idx - spmd_item.get_local_id(0); @@ -106,8 +106,7 @@ void matrix_verify_op(queue q, big_matrix &C, sg_starty / SG_SZ * N, (N * nWGperDim)); }); // parallel for - }) - .wait(); + }).wait(); } assert_ops_ref(C.get_data(), ref); } diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index b41739646d..07a80ca118 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -196,7 +196,8 @@ void test(queue &q) { marray b, c; b = 2; c = 1; - sub_a.wi_marray = sycl::ext::oneapi::experimental::fma(sub_a.wi_marray, b, c); + sub_a.wi_marray = + sycl::ext::oneapi::experimental::fma(sub_a.wi_marray, b, c); } joint_matrix_load(sg, sub_b, From a249c7ed7b797a44c1260280a3e69651f83c4da0 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 31 May 2022 12:15:09 +0100 Subject: [PATCH 13/21] format Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_all_ops_cuda.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp index 13b40d5417..7b7f28ca9c 100644 --- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// // REQUIRES: cuda - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out From b9ebfe95392f20d1a97d318f9c009260a29c408e Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 8 Jun 2022 10:15:00 +0100 Subject: [PATCH 14/21] Use implicit cast to float. 
Signed-off-by: JackAKirk --- SYCL/BFloat16/bfloat16_builtins.cpp | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index 6150472c3c..eac5df3fde 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -2,7 +2,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80 // RUN: %t.out -#include +#include #include #include @@ -33,8 +33,7 @@ bool check(float a, float b) { cgh); \ accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ - if (check(make_fp32(NAME(bfloat16{A[index]}).raw()), \ - NAME(A[index]))) { \ + if (check(NAME(bfloat16{A[index]}), NAME(A[index]))) { \ ERR[0] = 1; \ } \ }); \ @@ -57,7 +56,7 @@ bool check(float a, float b) { } \ marray res = NAME(arg); \ for (int i = 0; i < SZ; i++) { \ - if (check(make_fp32(res[i].raw()), NAME(A[index][i]))) { \ + if (check(res[i], NAME(A[index][i]))) { \ ERR[0] = 1; \ } \ } \ @@ -86,9 +85,8 @@ bool check(float a, float b) { cgh); \ accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ - if (check( \ - make_fp32(NAME(bfloat16{A[index]}, bfloat16{B[index]}).raw()), \ - NAME(A[index], B[index]))) { \ + if (check(NAME(bfloat16{A[index]}, bfloat16{B[index]}), \ + NAME(A[index], B[index]))) { \ ERR[0] = 1; \ } \ }); \ @@ -115,8 +113,7 @@ bool check(float a, float b) { } \ marray res = NAME(arg0, arg1); \ for (int i = 0; i < SZ; i++) { \ - if (check(make_fp32(res[i].raw()), \ - NAME(A[index][i], B[index][i]))) { \ + if (check(res[i], NAME(A[index][i], B[index][i]))) { \ ERR[0] = 1; \ } \ } \ @@ -148,9 +145,8 @@ bool check(float a, float b) { cgh); \ accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ - if (check(make_fp32(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \ - bfloat16{C[index]}) \ - .raw()), \ + if (check(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \ + bfloat16{C[index]}), \ NAME(A[index], B[index], C[index]))) { \ ERR[0] = 1; \ } \ @@ -182,8 +178,7 @@ bool check(float a, float b) { } \ marray res = NAME(arg0, arg1, arg2); \ for (int i = 0; i < SZ; i++) { \ - if (check(make_fp32(res[i].raw()), \ - NAME(A[index][i], B[index][i], C[index][i]))) { \ + if (check(res[i], NAME(A[index][i], B[index][i], C[index][i]))) { \ ERR[0] = 1; \ } \ } \ @@ -209,9 +204,9 @@ bool check(float a, float b) { accessor checkNAN( \ nan_buf, cgh); \ cgh.single_task([=]() { \ - checkNAN[0] = make_fp32(NAME(bfloat16{NAN}, bfloat16{NAN}).raw()); \ - if ((make_fp32(NAME(bfloat16{2}, bfloat16{NAN}).raw()) != 2) || \ - (make_fp32(NAME(bfloat16{NAN}, bfloat16{2}).raw()) != 2)) { \ + checkNAN[0] = NAME(bfloat16{NAN}, bfloat16{NAN}); \ + if ((NAME(bfloat16{2}, bfloat16{NAN}) != 2) || \ + (NAME(bfloat16{NAN}, bfloat16{2}) != 2)) { \ ERR[0] = 1; \ } \ }); \ From 5ea081ad7a2678a977a042bc3e88092360e09875 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 8 Jun 2022 11:39:47 +0100 Subject: [PATCH 15/21] Adds separate test comparing wi_marray with get_wi_data usage. 
Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_wi_marray.cpp | 67 +++++++++++++++++++++++++ SYCL/Matrix/joint_matrix_tensorcore.cpp | 10 +--- 2 files changed, 68 insertions(+), 9 deletions(-) create mode 100644 SYCL/Matrix/element_wise_wi_marray.cpp diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp new file mode 100644 index 0000000000..e4d78c22d1 --- /dev/null +++ b/SYCL/Matrix/element_wise_wi_marray.cpp @@ -0,0 +1,67 @@ +//==----------- element_wise_wi_marray.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: cuda +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %t.out + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::experimental::bfloat16; + +#define SG_SZ 32 + +template void verify_wi_marray(queue q) { + int err = 0; + { + buffer err_buf(&err, 1); + q.submit([&](handler &cgh) { + accessor ERR(err_buf, cgh); + + cgh.parallel_for( + nd_range<2>({1, 1 * SG_SZ}, {1, 1 * SG_SZ}), + [ERR](nd_item<2> spmd_item)[[sycl::reqd_sub_group_size(SG_SZ)]] { + auto sg = spmd_item.get_sub_group(); + + joint_matrix sub_a; + joint_matrix sub_a_2; + + joint_matrix_fill(sg, sub_a, -1); + joint_matrix_fill(sg, sub_a_2, -1); + + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = fabs(wi_slice_a[i]); + } + sub_a_2.wi_marray = fabs(sub_a_2.wi_marray); + + for (int i = 0; i < sub_a_2.wi_marray.size(); i++) { + if (sub_a_2.wi_marray[i] != wi_slice_a[i]) { + ERR[0] = 1; + } + } + }); // parallel for + }) + .wait(); + } + assert(err == 0); +} + +int main() { + + queue q; + auto computeCapability = + std::stof(q.get_device().get_info()); + + if (computeCapability >= 8.0) { + verify_wi_marray(q); + } + + return 0; +} diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 07a80ca118..678b4ef25d 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -68,7 +68,7 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]); } else if constexpr (std::is_same::value) { for (int k = 0; k < Big_K; k++) - res += (make_fp32(A[m * Big_K + k].raw()) * 2 + 1) * + res += make_fp32(A[m * Big_K + k].raw()) * make_fp32(B[k * Big_N + n].raw()); } else { for (int k = 0; k < Big_K; k++) @@ -192,14 +192,6 @@ void test(queue &q) { accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); - if constexpr (std::is_same::value) { - marray b, c; - b = 2; - c = 1; - sub_a.wi_marray = - sycl::ext::oneapi::experimental::fma(sub_a.wi_marray, b, c); - } - joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); From 0ede8812f2261126941b8964e8437351b82dfa9f Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 8 Jun 2022 12:27:10 +0100 Subject: [PATCH 16/21] format Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_all_ops_cuda.cpp | 1 + SYCL/Matrix/element_wise_wi_marray.cpp | 6 +++--- SYCL/Matrix/joint_matrix_tensorcore.cpp | 5 ++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp index 7b7f28ca9c..13b40d5417 100644 --- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// // REQUIRES: cuda + // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp index e4d78c22d1..890d342b48 100644 --- a/SYCL/Matrix/element_wise_wi_marray.cpp +++ b/SYCL/Matrix/element_wise_wi_marray.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// // REQUIRES: cuda + // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out @@ -26,7 +27,7 @@ template void verify_wi_marray(queue q) { cgh.parallel_for( nd_range<2>({1, 1 * SG_SZ}, {1, 1 * SG_SZ}), - [ERR](nd_item<2> spmd_item)[[sycl::reqd_sub_group_size(SG_SZ)]] { + [ERR](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] { auto sg = spmd_item.get_sub_group(); joint_matrix sub_a; @@ -47,8 +48,7 @@ template void verify_wi_marray(queue q) { } } }); // parallel for - }) - .wait(); + }).wait(); } assert(err == 0); } diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 678b4ef25d..507dae07e4 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -1,5 +1,4 @@ // REQUIRES: cuda - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out // @@ -68,8 +67,8 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]); } else if constexpr (std::is_same::value) { for (int k = 0; k < Big_K; k++) - res += make_fp32(A[m * Big_K + k].raw()) * - make_fp32(B[k * Big_N + n].raw()); + res += + make_fp32(A[m * Big_K + k].raw()) * make_fp32(B[k * Big_N + n].raw()); } else { for (int k = 0; k < Big_K; k++) From 473f88b09c732b41f99620efa71751090730bafb Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 9 Jun 2022 16:37:08 +0100 Subject: [PATCH 17/21] format Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_wi_marray.cpp | 2 +- SYCL/Matrix/joint_matrix_tensorcore.cpp | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp index 890d342b48..6ab3947ed9 100644 --- a/SYCL/Matrix/element_wise_wi_marray.cpp +++ b/SYCL/Matrix/element_wise_wi_marray.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: cuda -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out #include diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 1a03d855a0..3d417dcc63 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -195,15 +195,15 @@ void test(queue &q) { accB.get_pointer() + (k * K * Big_N) 
+ (n * N), Big_N); - // round values to correct precision if using tf32 - if constexpr (std::is_same::value) { - auto wi_size = sub_a.wi_marray.size(); - assert(wi_size == sub_b.wi_marray.size()); - for (auto i = 0; i < wi_size; ++i) { - sub_a.wi_marray[i] = round_to_tf32(sub_a.wi_marray[i]); - sub_b.wi_marray[i] = round_to_tf32(sub_b.wi_marray[i]); + // round values to correct precision if using tf32 + if constexpr (std::is_same::value) { + auto wi_size = sub_a.wi_marray.size(); + assert(wi_size == sub_b.wi_marray.size()); + for (auto i = 0; i < wi_size; ++i) { + sub_a.wi_marray[i] = round_to_tf32(sub_a.wi_marray[i]); + sub_b.wi_marray[i] = round_to_tf32(sub_b.wi_marray[i]); + } } - } sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); } From e817dbab99d0e2ce1f3e8f37d79d9f8cc0c56cfa Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 9 Jun 2022 16:45:13 +0100 Subject: [PATCH 18/21] format Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_all_ops_cuda.cpp | 2 +- SYCL/Matrix/joint_matrix_tensorcore.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp index 13b40d5417..69976fa7e4 100644 --- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: cuda -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out #include diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 3d417dcc63..2b5078d415 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -267,7 +267,7 @@ int main() { test(Q); test(Q); test(Q); - + // A/B tf32 test(Q); From 1019a8d1c478ba5f096ce14497b6df4cbeefb492 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 28 Jun 2022 10:00:59 +0100 Subject: [PATCH 19/21] Use ext_oneapi_bfloat16 aspect where appropriate. Signed-off-by: JackAKirk --- SYCL/BFloat16/bfloat16_builtins.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index eac5df3fde..f9e2fd7977 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -218,12 +218,7 @@ bool check(float a, float b) { int main() { queue q; - auto computeCapability = - std::stof(q.get_device().get_info()); - // TODO check for "ext_oneapi_bfloat16" aspect instead once aspect is - // supported. Since this test only covers CUDA the current check is - // functionally equivalent to "ext_oneapi_bfloat16". 
- if (computeCapability >= 8.0) { + if (q.get_device().has(aspect::ext_oneapi_bfloat16)) { std::vector a(N), b(N), c(N); int err = 0; From 3506d296a9346cc88b7a5fd94a3bd185083d839e Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 28 Jun 2022 17:58:03 +0100 Subject: [PATCH 20/21] removed requires cuda (rely on bfloat16 aspect instead) Signed-off-by: JackAKirk --- SYCL/BFloat16/bfloat16_builtins.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index f9e2fd7977..ef091087d2 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -1,4 +1,3 @@ -// REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80 // RUN: %t.out From 74787938e1e1c64ee46da23f0a02e2ab3439e450 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 29 Jun 2022 09:37:45 +0100 Subject: [PATCH 21/21] Noted test doesn't compile for other backends. --- SYCL/BFloat16/bfloat16_builtins.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index ef091087d2..ff84ecbeb3 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -1,3 +1,9 @@ +// REQUIRES: cuda +// +// Currently this test fails to compile for backends other than cuda. +// Other backends could use this test when bfloat16 math function support is +// added. +// // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80 // RUN: %t.out
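
Note for reviewers unfamiliar with the extension under test: the pattern that bfloat16_builtins.cpp converges on over this series (runtime gating on the ext_oneapi_bfloat16 aspect, scalar bfloat16 math builtins checked against a float reference, with marray overloads following the same shape) reduces to the minimal standalone sketch below. This is an illustration only, not part of the patches: the header name, the sycl::ext::oneapi::experimental namespace for fma, the aspect name, and the tolerance constant are taken from the diffs above and from the experimental bfloat16 extension as of this series, and are assumptions that may differ in later DPC++ releases. It is expected to build only with a CUDA (sm_80) DPC++ toolchain, like the tests themselves.

    // Minimal sketch of the tested pattern; names assumed from the patches above.
    #include <sycl/sycl.hpp> // header name assumed; older DPC++ used <CL/sycl.hpp>
    #include <cassert>
    #include <cmath>
    #include <vector>

    namespace exp = sycl::ext::oneapi::experimental; // namespace used in the patches

    int main() {
      sycl::queue q;
      // Gate on the bfloat16 aspect, as the later patches in this series do,
      // instead of parsing the compute-capability string.
      if (!q.get_device().has(sycl::aspect::ext_oneapi_bfloat16))
        return 0;

      constexpr int N = 60;
      std::vector<float> a(N), b(N), c(N), out(N);
      for (int i = 0; i < N; i++) {
        a[i] = (i - N / 2) / (float)N;
        b[i] = (N / 2 - i) / (float)N;
        c[i] = (float)(3 * i);
      }

      { // buffer scope: destruction writes results back to `out`
        sycl::buffer<float, 1> a_buf(a.data(), N), b_buf(b.data(), N),
            c_buf(c.data(), N), o_buf(out.data(), N);
        q.submit([&](sycl::handler &cgh) {
          sycl::accessor A(a_buf, cgh, sycl::read_only);
          sycl::accessor B(b_buf, cgh, sycl::read_only);
          sycl::accessor C(c_buf, cgh, sycl::read_only);
          sycl::accessor O(o_buf, cgh, sycl::write_only);
          cgh.parallel_for<class Bf16FmaSketch>(
              sycl::range<1>(N), [=](sycl::id<1> i) {
                // Scalar bfloat16 builtin; the result converts back to float
                // implicitly, as patch 14 of this series relies on.
                exp::bfloat16 x{A[i]}, y{B[i]}, z{C[i]};
                O[i] = exp::fma(x, y, z);
              });
        });
      }

      // Host-side check against the float reference, using the tolerance
      // constant from the tests (bf16_eps) in a deliberately loose mixed
      // absolute/relative form.
      constexpr float bf16_eps = 0.00390625f;
      for (int i = 0; i < N; i++) {
        float ref = std::fma(a[i], b[i], c[i]);
        assert(std::fabs(out[i] - ref) <=
               2 * bf16_eps * std::fabs(ref) + bf16_eps);
      }
      return 0;
    }

The marray overloads exercised by the TEST_BUILTIN_*_ARR_IMPL macros follow the same call shape with sycl::marray<bfloat16, SZ> arguments in place of scalars.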