diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp new file mode 100644 index 0000000000..ff84ecbeb3 --- /dev/null +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -0,0 +1,246 @@ +// REQUIRES: cuda +// +// Currently this test fails to compile for backends other than cuda. +// Other backends could use this test when bfloat16 math function support is +// added. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80 +// RUN: %t.out + +#include + +#include +#include + +using namespace cl::sycl; +using sycl::ext::oneapi::experimental::bfloat16; + +constexpr int N = 60; // divisible by all tested array sizes +constexpr float bf16_eps = 0.00390625; + +float make_fp32(uint16_t x) { + uint32_t y = x; + y = y << 16; + auto res = reinterpret_cast(&y); + return *res; +} + +bool check(float a, float b) { + return fabs(2 * (a - b) / (a + b)) > bf16_eps * 2; +} + +#define TEST_BUILTIN_1_SCAL_IMPL(NAME) \ + { \ + buffer a_buf(&a[0], N); \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N, [=](id<1> index) { \ + if (check(NAME(bfloat16{A[index]}), NAME(A[index]))) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_1_ARR_IMPL(NAME, SZ) \ + { \ + buffer a_buf{range<2>{N / SZ, SZ}}; \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N / SZ, [=](id<1> index) { \ + marray arg; \ + for (int i = 0; i < SZ; i++) { \ + arg[i] = A[index][i]; \ + } \ + marray res = NAME(arg); \ + for (int i = 0; i < SZ; i++) { \ + if (check(res[i], NAME(A[index][i]))) { \ + ERR[0] = 1; \ + } \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_1(NAME) \ + TEST_BUILTIN_1_SCAL_IMPL(NAME) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 1) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 2) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 3) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 4) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 5) + +#define TEST_BUILTIN_2_SCAL_IMPL(NAME) \ + { \ + buffer a_buf(&a[0], N); \ + buffer b_buf(&b[0], N); \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N, [=](id<1> index) { \ + if (check(NAME(bfloat16{A[index]}, bfloat16{B[index]}), \ + NAME(A[index], B[index]))) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_2_ARR_IMPL(NAME, SZ) \ + { \ + buffer a_buf{range<2>{N / SZ, SZ}}; \ + buffer b_buf{range<2>{N / SZ, SZ}}; \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N / SZ, [=](id<1> index) { \ + marray arg0, arg1; \ + for (int i = 0; i < SZ; i++) { \ + arg0[i] = A[index][i]; \ + arg1[i] = B[index][i]; \ + } \ + marray res = NAME(arg0, arg1); \ + for (int i = 0; i < SZ; i++) { \ + if (check(res[i], NAME(A[index][i], B[index][i]))) { \ + ERR[0] = 1; \ + } \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_2(NAME) \ + TEST_BUILTIN_2_SCAL_IMPL(NAME) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 1) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 2) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 3) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 4) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 5) + +#define TEST_BUILTIN_3_SCAL_IMPL(NAME) \ + { \ + buffer a_buf(&a[0], N); \ + 
buffer b_buf(&b[0], N); \ + buffer c_buf(&c[0], N); \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor C(c_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N, [=](id<1> index) { \ + if (check(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \ + bfloat16{C[index]}), \ + NAME(A[index], B[index], C[index]))) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_3_ARR_IMPL(NAME, SZ) \ + { \ + buffer a_buf{range<2>{N / SZ, SZ}}; \ + buffer b_buf{range<2>{N / SZ, SZ}}; \ + buffer c_buf{range<2>{N / SZ, SZ}}; \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor C(c_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N / SZ, [=](id<1> index) { \ + marray arg0, arg1, arg2; \ + for (int i = 0; i < SZ; i++) { \ + arg0[i] = A[index][i]; \ + arg1[i] = B[index][i]; \ + arg2[i] = C[index][i]; \ + } \ + marray res = NAME(arg0, arg1, arg2); \ + for (int i = 0; i < SZ; i++) { \ + if (check(res[i], NAME(A[index][i], B[index][i], C[index][i]))) { \ + ERR[0] = 1; \ + } \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_3(NAME) \ + TEST_BUILTIN_3_SCAL_IMPL(NAME) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 1) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 2) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 3) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 4) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 5) + +#define TEST_BUILTIN_2_NAN(NAME) \ + { \ + buffer err_buf(&err, 1); \ + buffer nan_buf(&check_nan, 1); \ + q.submit([&](handler &cgh) { \ + accessor ERR(err_buf, cgh); \ + accessor checkNAN( \ + nan_buf, cgh); \ + cgh.single_task([=]() { \ + checkNAN[0] = NAME(bfloat16{NAN}, bfloat16{NAN}); \ + if ((NAME(bfloat16{2}, bfloat16{NAN}) != 2) || \ + (NAME(bfloat16{NAN}, bfloat16{2}) != 2)) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); \ + assert(std::isnan(check_nan)); + +int main() { + queue q; + + if (q.get_device().has(aspect::ext_oneapi_bfloat16)) { + std::vector a(N), b(N), c(N); + int err = 0; + + for (int i = 0; i < N; i++) { + a[i] = (i - N / 2) / (float)N; + b[i] = (N / 2 - i) / (float)N; + c[i] = (float)(3 * i); + } + + TEST_BUILTIN_1(fabs); + TEST_BUILTIN_2(fmin); + TEST_BUILTIN_2(fmax); + TEST_BUILTIN_3(fma); + + float check_nan = 0; + TEST_BUILTIN_2_NAN(fmin); + TEST_BUILTIN_2_NAN(fmax); + } + return 0; +} diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp new file mode 100644 index 0000000000..69976fa7e4 --- /dev/null +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -0,0 +1,184 @@ +//==----------- element_wise_all_ops_cuda.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: cuda + +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %t.out + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::experimental::bfloat16; + +#define SG_SZ 32 +constexpr size_t nWGperDim = 2; + +class Logical {}; + +template +class KernelName; + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void assert_ops_ref(T *C, const float ref) { + for (size_t i = 0; i < M; i++) + for (size_t j = 0; j < N; j++) { + auto diff = C[i + j * M] - ref; + assert(std::fabs(static_cast(diff)) < + std::numeric_limits::epsilon()); + } +} +template +void matrix_verify_op(queue q, big_matrix &C, + nd_range<2> &r, const float ref, Operation Op) { + { + buffer bufC(C.get_data(), range<2>(N * nWGperDim, M * nWGperDim)); + + q.submit([&](handler &cgh) { + accessor accC(bufC, + cgh); + + cgh.parallel_for>( + r, [accC, + Op](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + auto sg = spmd_item.get_sub_group(); + + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix sub_c; + + joint_matrix_fill(sg, sub_a, 3); + joint_matrix_fill(sg, sub_b, 1); + joint_matrix_fill(sg, sub_c, -80); + + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + if constexpr (std::is_same_v) { + if (wi_slice_a[i]) { + if (wi_slice_a[i] > 2.0 || wi_slice_a[i] >= 3.0 || + wi_slice_a[i] < 4.0 || wi_slice_a[i] <= 3.0) { + T val = (wi_slice_a[i] != (2.0)) ? 
wi_slice_a[i] + : static_cast(2.0); + val = ((val) - (1)); + val = ((val) + (1)); + if (wi_slice_a[i] == (2.0)) { + val = ((val) - (2)); + val = ((val) * (3)); + val = ((val) / (2)); + + } else { + val = ((val) + (2)); + } + wi_slice_a[i] = val; + } + } + } else { + wi_slice_a[i] = Op(wi_slice_a[i], 2); + } + } + + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + + joint_matrix_store(sg, sub_c, + accC.get_pointer() + + (sg_startx * M) * (N * nWGperDim) + + sg_starty / SG_SZ * N, + (N * nWGperDim)); + }); // parallel for + }).wait(); + } + assert_ops_ref(C.get_data(), ref); +} + +static constexpr size_t MATRIX_M = 16 * nWGperDim; +static constexpr size_t MATRIX_N = 16 * nWGperDim; + +int main() { + + float D[MATRIX_M][MATRIX_N]; + big_matrix MD_f((float *)&D); + + queue q; + auto computeCapability = + std::stof(q.get_device().get_info()); + nd_range<2> r({nWGperDim, nWGperDim * SG_SZ}, {1, 1 * SG_SZ}); + + if (computeCapability >= 7.0) { + matrix_verify_op(q, MD_f, r, 0.0, + std::plus{}); + matrix_verify_op(q, MD_f, r, 0.0, Logical{}); + matrix_verify_op(q, MD_f, r, 16.0, + std::multiplies{}); + matrix_verify_op(q, MD_f, r, -56.0, + std::divides{}); + matrix_verify_op(q, MD_f, r, -64.0, + std::minus{}); + } + + if (computeCapability >= 7.2) { + int32_t D_i[MATRIX_M][MATRIX_N]; + big_matrix MD_i((int32_t *)&D_i); + matrix_verify_op(q, MD_i, r, 0, + std::plus{}); + matrix_verify_op(q, MD_i, r, 16, + std::multiplies{}); + matrix_verify_op(q, MD_i, r, -64, + std::minus{}); + matrix_verify_op(q, MD_i, r, 0, + std::plus{}); + matrix_verify_op(q, MD_i, r, 0.0, Logical{}); + matrix_verify_op(q, MD_i, r, 16, + std::multiplies{}); + matrix_verify_op(q, MD_i, r, -64, + std::minus{}); + } + + if (computeCapability >= 8.0) { + + matrix_verify_op(q, MD_f, r, 0.0, + std::plus{}); + matrix_verify_op(q, MD_f, r, 0.0, Logical{}); + matrix_verify_op(q, MD_f, r, 16.0, + std::multiplies{}); + matrix_verify_op(q, MD_f, r, -56.0, + std::divides{}); + matrix_verify_op(q, MD_f, r, -64.0, + std::minus{}); + + double D_d[MATRIX_M / 2][MATRIX_N / 2]; + big_matrix MD_d((double *)&D_d); + + matrix_verify_op(q, MD_d, r, -60.0, + std::plus{}); + matrix_verify_op(q, MD_d, r, -60.0, Logical{}); + matrix_verify_op(q, MD_d, r, -56.0, + std::multiplies{}); + matrix_verify_op(q, MD_d, r, -74.0, + std::divides{}); + matrix_verify_op(q, MD_d, r, -76.0, + std::minus{}); + } + + return 0; +} diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp new file mode 100644 index 0000000000..6ab3947ed9 --- /dev/null +++ b/SYCL/Matrix/element_wise_wi_marray.cpp @@ -0,0 +1,67 @@ +//==----------- element_wise_wi_marray.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: cuda + +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %t.out + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::experimental::bfloat16; + +#define SG_SZ 32 + +template void verify_wi_marray(queue q) { + int err = 0; + { + buffer err_buf(&err, 1); + q.submit([&](handler &cgh) { + accessor ERR(err_buf, cgh); + + cgh.parallel_for( + nd_range<2>({1, 1 * SG_SZ}, {1, 1 * SG_SZ}), + [ERR](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] { + auto sg = spmd_item.get_sub_group(); + + joint_matrix sub_a; + joint_matrix sub_a_2; + + joint_matrix_fill(sg, sub_a, -1); + joint_matrix_fill(sg, sub_a_2, -1); + + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = fabs(wi_slice_a[i]); + } + sub_a_2.wi_marray = fabs(sub_a_2.wi_marray); + + for (int i = 0; i < sub_a_2.wi_marray.size(); i++) { + if (sub_a_2.wi_marray[i] != wi_slice_a[i]) { + ERR[0] = 1; + } + } + }); // parallel for + }).wait(); + } + assert(err == 0); +} + +int main() { + + queue q; + auto computeCapability = + std::stof(q.get_device().get_info()); + + if (computeCapability >= 8.0) { + verify_wi_marray(q); + } + + return 0; +} diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index a489aeb0ca..2b5078d415 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -1,6 +1,6 @@ -// REQUIRES: gpu, cuda - -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// REQUIRES: cuda +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %t.out // // Specifying the sm version via the --cuda-gpu-arch flag is necessary // for the Nvidia case. DPC++ JIT compilation is not @@ -11,6 +11,8 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::experimental::bfloat16; +constexpr float bf16_eps = 0.00390625; // Example usage of Nvidia matrix multiply. 
// Optimizations such as memory paddings for avoiding bank conflicts are not @@ -43,17 +45,17 @@ class TypeHelper; template using KernelName = class TypeHelper; -float make_fp32(short x) { - unsigned int y = x; +float make_fp32(uint16_t x) { + uint32_t y = x; y = y << 16; - float *res = reinterpret_cast(&y); + auto res = reinterpret_cast(&y); return *res; } -unsigned short make_bf16(float x) { - int *res = reinterpret_cast(&x); +uint16_t make_bf16(float x) { + auto res = reinterpret_cast(&x); *res = *res >> 16; - return (unsigned short)*res; + return (uint16_t)*res; } template @@ -63,6 +65,10 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { if constexpr (std::is_same::value) { for (int k = 0; k < Big_K; k++) res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]); + } else if constexpr (std::is_same::value) { + for (int k = 0; k < Big_K; k++) + res += + make_fp32(A[m * Big_K + k].raw()) * make_fp32(B[k * Big_N + n].raw()); } else { for (int k = 0; k < Big_K; k++) @@ -75,7 +81,7 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { template -void test() { +void test(queue &q) { constexpr auto Big_M = Sub_Tiles_M * @@ -105,7 +111,7 @@ void test() { for (int i = 0; i < Big_K * Big_N; i++) { B[i] = make_bf16(0.1f * (i % 10)); } - } else { + } else if constexpr (!std::is_same::value) { for (int i = 0; i < Big_M * Big_K; i++) { A[i] = i % 100; } @@ -114,110 +120,157 @@ void test() { B[i] = i % 100; } } + { + buffer bufA(A, range<1>(Big_M * Big_K)); + buffer bufB(B, range<1>(Big_K * Big_N)); + buffer bufC(C, range<1>(Big_M * Big_N)); + buffer bufD(D, range<1>(Big_M * Big_N)); + + // currently bfloat16 has to be initialized on device + if constexpr (std::is_same::value) { + q.submit([&](handler &cgh) { + accessor accA(bufA, + cgh); + + cgh.parallel_for>( + range<1>(Big_M * Big_K), [=](item<1> item) { + auto i = item.get_linear_id(); + accA[i] = 0.1f * (i % 10); + }); + }); + + q.submit([&](handler &cgh) { + accessor accB(bufB, + cgh); + + cgh.parallel_for>( + range<1>(Big_K * Big_N), [=](item<1> item) { + auto i = item.get_linear_id(); + accB[i] = 0.1f * (i % 10); + }); + }); + } - buffer bufA(A, range<1>(Big_M * Big_K)); - buffer bufB(B, range<1>(Big_K * Big_N)); - buffer bufC(C, range<1>(Big_M * Big_N)); - buffer bufD(D, range<1>(Big_M * Big_N)); - - queue q; - q.submit([&](handler &cgh) { - auto accC = bufC.template get_access(cgh); - auto accA = bufA.template get_access(cgh); - auto accB = bufB.template get_access(cgh); - auto accD = bufD.template get_access(cgh); - - range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; - range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; - - cgh.parallel_for>( - nd_range<2>(GlobalRange, LocalRange), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - const auto m = - item.get_group().get_group_id()[0]; // row id of current submatrix - // of BIG C matrix - const auto n = - item.get_group().get_group_id()[1]; // column id of current - // submatrix of BIG C matrix - - joint_matrix sub_a; - - joint_matrix sub_b; - - joint_matrix - sub_c; - - joint_matrix_load( - sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N); - - for (int k = 0; k < Sub_Tiles_K; - k++) // row/col id of current submatrix of BIG A/B matrices - { - joint_matrix_load(sg, sub_a, - accA.get_pointer() + (k * K) + (m * M * Big_K), - Big_K); - - joint_matrix_load(sg, sub_b, - accB.get_pointer() + (k * K * Big_N) + (n * N), - Big_N); - - // Convert values if 
using tf32 - if constexpr (std::is_same::value) { - for (auto i = 0; i < 4; ++i) { - sub_a.data[i] = round_to_tf32(sub_a.data[i]); - sub_b.data[i] = round_to_tf32(sub_b.data[i]); + q.submit([&](handler &cgh) { + accessor accA(bufA, cgh); + accessor accB(bufB, cgh); + accessor accC(bufC, cgh); + accessor accD(bufD, cgh); + + range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; + range<2> GlobalRange = {Sub_Tiles_M, + Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; + + cgh.parallel_for>( + nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) { + sub_group sg = item.get_sub_group(); + const auto m = + item.get_group().get_group_id()[0]; // row id of current + // submatrix of BIG C matrix + const auto n = + item.get_group().get_group_id()[1]; // column id of current + // submatrix of BIG C matrix + + joint_matrix + sub_a; + + joint_matrix + sub_b; + + joint_matrix + sub_c; + + joint_matrix_load( + sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N); + + for (int k = 0; k < Sub_Tiles_K; + k++) // row/col id of current submatrix of BIG A/B matrices + { + joint_matrix_load(sg, sub_a, + accA.get_pointer() + (k * K) + (m * M * Big_K), + Big_K); + + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * K * Big_N) + (n * N), + Big_N); + + // round values to correct precision if using tf32 + if constexpr (std::is_same::value) { + auto wi_size = sub_a.wi_marray.size(); + assert(wi_size == sub_b.wi_marray.size()); + for (auto i = 0; i < wi_size; ++i) { + sub_a.wi_marray[i] = round_to_tf32(sub_a.wi_marray[i]); + sub_b.wi_marray[i] = round_to_tf32(sub_b.wi_marray[i]); + } } - } - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); - }); - }); - - q.wait(); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); + }); + }); + q.wait(); + } - const auto host_accessor = bufD.template get_access(); - for (int m = 0; m < Big_M; m++) + for (int m = 0; m < Big_M; m++) { for (int n = 0; n < Big_N; n++) { - - assert((host_accessor[m * Big_N + n] == - matrix_ref_mn(m, n, A, B, C))); + if constexpr (std::is_same::value) { + auto res_device = matrix_ref_mn(m, n, A, B, C); + assert(fabs(2 * (D[m * Big_N + n] - res_device)) / + (D[m * Big_N + n] + res_device) < + bf16_eps * 2); + } else { + assert((D[m * Big_N + n] == + matrix_ref_mn(m, n, A, B, C))); + } } + } }; int main() { - // A/B half, Accumulator float - test(); - test(); - test(); - - // A/B/Accumulator half - test(); - test(); - test(); - test(); - test(); - test(); + queue Q; + auto computeCapability = + std::stof(Q.get_device().get_info()); - test(); - test(); - test(); + if (computeCapability >= 7.0) { + // A/B half, Accumulator float + test(Q); + test(Q); + test(Q); - test(); + // A/B/Accumulator half + test(Q); + test(Q); + test(Q); + } + if (computeCapability >= 7.2) { + test(Q); + test(Q); + test(Q); + + test( + Q); + test(Q); + test(Q); + } + if (computeCapability >= 8.0) { + test(Q); - // A/B bf16 - test(); - test(); - test(); + // A/B bfloat16 using storage type + test(Q); + test(Q); + test(Q); - // A/B tf32 - test(); + test(Q); + test(Q); + test(Q); + // A/B tf32 + test(Q); + } return 0; };
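For reference, the tolerance used throughout these tests (bf16_eps = 0.00390625 = 2^-8) matches the 8-bit mantissa of bfloat16, and the make_fp32 / make_bf16 helpers convert between the uint16_t storage format and float by moving the high 16 bits of the IEEE-754 representation. The following is a minimal standalone sketch (host-only, not part of the patch) of that round trip and of the relative-error check used by check(); it uses std::memcpy instead of reinterpret_cast to sidestep aliasing concerns, and the helper names simply mirror the ones in the patch rather than any library API.

    // Standalone sketch: bfloat16 <-> float bit conversion and the
    // relative-error tolerance the tests above rely on.
    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    constexpr float bf16_eps = 0.00390625f; // 2^-8, one ulp of the bf16 mantissa

    // bfloat16 storage (uint16_t) is the high 16 bits of an IEEE-754 float.
    float make_fp32(uint16_t x) {
      uint32_t y = static_cast<uint32_t>(x) << 16;
      float f;
      std::memcpy(&f, &y, sizeof(f));
      return f;
    }

    // Truncating float -> bfloat16 storage conversion, as in
    // joint_matrix_tensorcore.cpp (drops the low 16 bits; no rounding).
    uint16_t make_bf16(float x) {
      uint32_t y;
      std::memcpy(&y, &x, sizeof(y));
      return static_cast<uint16_t>(y >> 16);
    }

    // Mirrors check(): reports a mismatch when the relative difference
    // exceeds twice the bf16 epsilon.
    bool mismatch(float a, float b) {
      return std::fabs(2 * (a - b) / (a + b)) > bf16_eps * 2;
    }

    int main() {
      float x = 0.3f;
      float roundtrip = make_fp32(make_bf16(x));
      assert(!mismatch(x, roundtrip)); // truncation error stays within tolerance
      return 0;
    }

As a side note on the constants passed to matrix_verify_op in element_wise_all_ops_cuda.cpp: with sub_a filled with 3, sub_b with 1, and sub_c with -80, each output element is K * Op(3, 2) * 1 - 80, so the expected values 0 / 16 / -56 / -64 for plus / multiplies / divides / minus are consistent with a K dimension of 16, and -60 / -56 / -74 / -76 with the K of 4 used by the double-precision shape (the exact joint_matrix shapes are an inference here, not stated in this excerpt).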