From fc9b2d7804d5ceaba4ada50bf21b322fef737185 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Mon, 4 Apr 2022 16:58:18 +0100 Subject: [PATCH 01/21] New test cases, bfloat16 builtins/matrix. Signed-off-by: jack.kirk --- SYCL/BFloat16/bfloat16_builtins.cpp | 118 ++++++++++++++++++++++++ SYCL/Matrix/joint_matrix_tensorcore.cpp | 66 ++++++++++--- 2 files changed, 172 insertions(+), 12 deletions(-) create mode 100644 SYCL/BFloat16/bfloat16_builtins.cpp diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp new file mode 100644 index 0000000000..8baad18c61 --- /dev/null +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -0,0 +1,118 @@ +// REQUIRES: cuda +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80 +// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test +// execution once it does. +// RUNx: %t.out + +#include + +#include +#include + +using namespace cl::sycl; +using sycl::ext::oneapi::experimental::bfloat16; + +constexpr int N = 16 * 3; // divisible by all vector sizes +constexpr float bf16_eps = 0.00390625; + +union conv { + float f; + vec u; + uint32_t u2; +}; + +float from_bf16(uint16_t x) { + conv c; + c.u.y() = x; + c.u.x() = 0; + return c.f; +} + +bool check(float a, float b) { + return fabs(2 * (a - b) / (a + b)) > bf16_eps * 2; +} + +#define TEST_BUILTIN_1_SCAL_IMPL(NAME) \ + { \ + buffer a_buf(&a[0], N); \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + auto A = a_buf.get_access(cgh); \ + auto ERR = err_buf.get_access(cgh); \ + cgh.parallel_for(N, [=](id<1> index) { \ + if (check(from_bf16(NAME(bfloat16{A[index]}).raw()), \ + NAME(A[index]))) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_1(NAME) TEST_BUILTIN_1_SCAL_IMPL(NAME) + +#define TEST_BUILTIN_2_SCAL_IMPL(NAME) \ + { \ + buffer a_buf(&a[0], N); \ + buffer b_buf(&b[0], N); \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + auto A = a_buf.get_access(cgh); \ + auto B = b_buf.get_access(cgh); \ + auto ERR = err_buf.get_access(cgh); \ + cgh.parallel_for(N, [=](id<1> index) { \ + if (check( \ + from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]}).raw()), \ + NAME(A[index], B[index]))) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_2(NAME) TEST_BUILTIN_2_SCAL_IMPL(NAME) + +#define TEST_BUILTIN_3_SCAL_IMPL(NAME) \ + { \ + buffer a_buf(&a[0], N); \ + buffer b_buf(&b[0], N); \ + buffer c_buf(&c[0], N); \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + auto A = a_buf.get_access(cgh); \ + auto B = b_buf.get_access(cgh); \ + auto C = c_buf.get_access(cgh); \ + auto ERR = err_buf.get_access(cgh); \ + cgh.parallel_for(N, [=](id<1> index) { \ + if (check(from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \ + bfloat16{C[index]}) \ + .raw()), \ + NAME(A[index], B[index], C[index]))) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_3(NAME) TEST_BUILTIN_3_SCAL_IMPL(NAME) + +int main() { + queue q; + std::vector a(N), b(N), c(N); + int err = 0; + + for (int i = 0; i < N; i++) { + a[i] = (i - N / 2) / (float)N; + b[i] = (N / 2 - i) / (float)N; + c[i] = (float)(3 * i); + } + + TEST_BUILTIN_1(fabs); + TEST_BUILTIN_2(fmin); + TEST_BUILTIN_2(fmax); + TEST_BUILTIN_3(fma); + + return 0; +} diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 8c827d1c11..eb602f99d7 100644 --- 
a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -1,6 +1,9 @@ -// REQUIRES: gpu, cuda +// REQUIRES: cuda -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test +// execution once it does. +// RUNx: %t.out // // Specifying the sm version via the --cuda-gpu-arch flag is necessary // for the Nvidia case. DPC++ JIT compilation is not @@ -11,6 +14,8 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::experimental::bfloat16; +constexpr float bf16_eps = 0.00390625; // Example usage of Nvidia matrix multiply. // Optimizations such as memory paddings for avoiding bank conflicts are not @@ -63,6 +68,10 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { if constexpr (std::is_same::value) { for (int k = 0; k < Big_K; k++) res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]); + } else if constexpr (std::is_same::value) { + for (int k = 0; k < Big_K; k++) + res += + make_fp32(A[m * Big_K + k].raw()) * make_fp32(B[k * Big_N + n].raw()); } else { for (int k = 0; k < Big_K; k++) @@ -105,7 +114,7 @@ void test() { for (int i = 0; i < Big_K * Big_N; i++) { B[i] = make_bf16(0.1f * (i % 10)); } - } else { + } else if constexpr (!std::is_same::value) { for (int i = 0; i < Big_M * Big_K; i++) { A[i] = i % 100; } @@ -114,13 +123,36 @@ void test() { B[i] = i % 100; } } - +{ buffer bufA(A, range<1>(Big_M * Big_K)); buffer bufB(B, range<1>(Big_K * Big_N)); buffer bufC(C, range<1>(Big_M * Big_N)); buffer bufD(D, range<1>(Big_M * Big_N)); queue q; +// currently bfloat16 has to be initialized on device +if constexpr (std::is_same::value) { + q.submit([&](handler &cgh) { + auto accA = bufA.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_M * Big_K), [=](item<1> item) { + auto i = item.get_linear_id(); + accA[i] = 0.1f * (i % 10); + }); + }); + + q.submit([&](handler &cgh) { + auto accB = bufB.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_K * Big_N), [=](item<1> item) { + auto i = item.get_linear_id(); + accB[i] = 0.1f * (i % 10); + }); + }); +} + q.submit([&](handler &cgh) { auto accC = bufC.template get_access(cgh); auto accA = bufA.template get_access(cgh); @@ -136,9 +168,9 @@ void test() { sycl::sub_group sg = item.get_sub_group(); const auto m = item.get_group() - .get_id()[0]; // row id of current submatrix of BIG C matrix + .get_group_id()[0]; // row id of current submatrix of BIG C matrix const auto n = - item.get_group().get_id()[1]; // column id of current + item.get_group().get_group_id()[1]; // column id of current // submatrix of BIG C matrix joint_matrix sub_a; @@ -171,14 +203,20 @@ void test() { }); q.wait(); +} - const auto host_accessor = bufD.template get_access(); - for (int m = 0; m < Big_M; m++) - for (int n = 0; n < Big_N; n++) { - - assert((host_accessor[m * Big_N + n] == +for (int m = 0; m < Big_M; m++) + for (int n = 0; n < Big_N; n++) { + if constexpr (std::is_same::value) { + auto res_device = matrix_ref_mn(m, n, A, B, C); + assert(fabs(2 * (D[m * Big_N + n] - res_device)) / + (D[m * Big_N + n] + res_device) < + bf16_eps * 2); + } else { + assert((D[m * Big_N + n] == matrix_ref_mn(m, n, A, B, C))); } + } }; int main() { @@ -203,10 +241,14 
@@ int main() { test(); - // A/B bf16 + // A/B bf16 using storage type test(); test(); test(); + test(); + test(); + test(); + return 0; }; From e8acad9b11ab0395f39584570c420fbb167b30f5 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 5 Apr 2022 18:14:36 +0100 Subject: [PATCH 02/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 117 ++++++++++++------------ 1 file changed, 57 insertions(+), 60 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index eb602f99d7..cfd1c8fa8a 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -130,28 +130,28 @@ void test() { buffer bufD(D, range<1>(Big_M * Big_N)); queue q; -// currently bfloat16 has to be initialized on device -if constexpr (std::is_same::value) { - q.submit([&](handler &cgh) { - auto accA = bufA.template get_access(cgh); - - cgh.parallel_for>( - range<1>(Big_M * Big_K), [=](item<1> item) { - auto i = item.get_linear_id(); - accA[i] = 0.1f * (i % 10); - }); - }); - - q.submit([&](handler &cgh) { - auto accB = bufB.template get_access(cgh); - - cgh.parallel_for>( - range<1>(Big_K * Big_N), [=](item<1> item) { - auto i = item.get_linear_id(); - accB[i] = 0.1f * (i % 10); - }); - }); -} + // currently bfloat16 has to be initialized on device + if constexpr (std::is_same::value) { + q.submit([&](handler &cgh) { + auto accA = bufA.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_M * Big_K), [=](item<1> item) { + auto i = item.get_linear_id(); + accA[i] = 0.1f * (i % 10); + }); + }); + + q.submit([&](handler &cgh) { + auto accB = bufB.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_K * Big_N), [=](item<1> item) { + auto i = item.get_linear_id(); + accB[i] = 0.1f * (i % 10); + }); + }); + } q.submit([&](handler &cgh) { auto accC = bufC.template get_access(cgh); @@ -162,44 +162,41 @@ if constexpr (std::is_same::value) { range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; - cgh.parallel_for>( - nd_range<2>(GlobalRange, LocalRange), [= - ](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - const auto m = - item.get_group() - .get_group_id()[0]; // row id of current submatrix of BIG C matrix - const auto n = - item.get_group().get_group_id()[1]; // column id of current - // submatrix of BIG C matrix - - joint_matrix sub_a; - - joint_matrix sub_b; - - joint_matrix - sub_c; - - joint_matrix_load( - sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N); - - for (int k = 0; k < Sub_Tiles_K; - k++) // row/col id of current submatrix of BIG A/B matrices - { - joint_matrix_load(sg, sub_a, - accA.get_pointer() + (k * K) + (m * M * Big_K), - Big_K); - - joint_matrix_load(sg, sub_b, - accB.get_pointer() + (k * K * Big_N) + (n * N), - Big_N); - - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); - }); + cgh.parallel_for< + KernelName>(nd_range<2>(GlobalRange, LocalRange), [= + ](nd_item<2> item)[[sycl::reqd_work_group_size(1, 1, 32)]] { + sycl::sub_group sg = item.get_sub_group(); + const auto m = + item.get_group() + .get_group_id()[0]; // row id of current submatrix of BIG C matrix + const auto n = + item.get_group().get_group_id()[1]; // column id of current + // submatrix of BIG C matrix + + joint_matrix sub_a; + + joint_matrix sub_b; + + joint_matrix + sub_c; + + 
joint_matrix_load(sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, + Big_N); + + for (int k = 0; k < Sub_Tiles_K; + k++) // row/col id of current submatrix of BIG A/B matrices + { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); + + joint_matrix_load( + sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); + + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); + }); }); q.wait(); From caed5208c4df6a0d2f704b72411a18e4addd4d45 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 5 Apr 2022 18:25:16 +0100 Subject: [PATCH 03/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 170 ++++++++++++------------ 1 file changed, 86 insertions(+), 84 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index cfd1c8fa8a..fe4bfcab61 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -123,97 +123,99 @@ void test() { B[i] = i % 100; } } -{ - buffer bufA(A, range<1>(Big_M * Big_K)); - buffer bufB(B, range<1>(Big_K * Big_N)); - buffer bufC(C, range<1>(Big_M * Big_N)); - buffer bufD(D, range<1>(Big_M * Big_N)); - - queue q; - // currently bfloat16 has to be initialized on device - if constexpr (std::is_same::value) { - q.submit([&](handler &cgh) { - auto accA = bufA.template get_access(cgh); - - cgh.parallel_for>( - range<1>(Big_M * Big_K), [=](item<1> item) { - auto i = item.get_linear_id(); - accA[i] = 0.1f * (i % 10); - }); - }); + { + buffer bufA(A, range<1>(Big_M * Big_K)); + buffer bufB(B, range<1>(Big_K * Big_N)); + buffer bufC(C, range<1>(Big_M * Big_N)); + buffer bufD(D, range<1>(Big_M * Big_N)); + + queue q; + // currently bfloat16 has to be initialized on device + if constexpr (std::is_same::value) { + q.submit([&](handler &cgh) { + auto accA = bufA.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_M * Big_K), [=](item<1> item) { + auto i = item.get_linear_id(); + accA[i] = 0.1f * (i % 10); + }); + }); + + q.submit([&](handler &cgh) { + auto accB = bufB.template get_access(cgh); + + cgh.parallel_for>( + range<1>(Big_K * Big_N), [=](item<1> item) { + auto i = item.get_linear_id(); + accB[i] = 0.1f * (i % 10); + }); + }); + } q.submit([&](handler &cgh) { - auto accB = bufB.template get_access(cgh); - - cgh.parallel_for>( - range<1>(Big_K * Big_N), [=](item<1> item) { - auto i = item.get_linear_id(); - accB[i] = 0.1f * (i % 10); - }); + auto accC = bufC.template get_access(cgh); + auto accA = bufA.template get_access(cgh); + auto accB = bufB.template get_access(cgh); + auto accD = bufD.template get_access(cgh); + + range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; + range<2> GlobalRange = {Sub_Tiles_M, + Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; + + cgh.parallel_for< + KernelName>(nd_range<2>(GlobalRange, LocalRange), [= + ](nd_item<2> item)[[sycl::reqd_work_group_size(1, 1, 32)]] { + sycl::sub_group sg = item.get_sub_group(); + const auto m = + item.get_group().get_group_id()[0]; // row id of current submatrix + // of BIG C matrix + const auto n = + item.get_group().get_group_id()[1]; // column id of current + // submatrix of BIG C matrix + + joint_matrix sub_a; + + joint_matrix sub_b; + + joint_matrix + sub_c; + + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (m * M) * Big_N + n * N, Big_N); + + for (int k = 0; k < Sub_Tiles_K; + k++) // row/col id of current submatrix of BIG A/B matrices + { + joint_matrix_load( + sg, 
sub_a, accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); + + joint_matrix_load( + sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); + + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); + }); }); - } - - q.submit([&](handler &cgh) { - auto accC = bufC.template get_access(cgh); - auto accA = bufA.template get_access(cgh); - auto accB = bufB.template get_access(cgh); - auto accD = bufD.template get_access(cgh); - - range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; - range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; - - cgh.parallel_for< - KernelName>(nd_range<2>(GlobalRange, LocalRange), [= - ](nd_item<2> item)[[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - const auto m = - item.get_group() - .get_group_id()[0]; // row id of current submatrix of BIG C matrix - const auto n = - item.get_group().get_group_id()[1]; // column id of current - // submatrix of BIG C matrix - - joint_matrix sub_a; - joint_matrix sub_b; - - joint_matrix - sub_c; - - joint_matrix_load(sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, - Big_N); - - for (int k = 0; k < Sub_Tiles_K; - k++) // row/col id of current submatrix of BIG A/B matrices - { - joint_matrix_load( - sg, sub_a, accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); - - joint_matrix_load( - sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); + q.wait(); + } - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + for (int m = 0; m < Big_M; m++) + for (int n = 0; n < Big_N; n++) { + if constexpr (std::is_same::value) { + auto res_device = matrix_ref_mn(m, n, A, B, C); + assert(fabs(2 * (D[m * Big_N + n] - res_device)) / + (D[m * Big_N + n] + res_device) < + bf16_eps * 2); + } else { + assert((D[m * Big_N + n] == + matrix_ref_mn(m, n, A, B, C))); } - joint_matrix_store(sg, sub_c, - accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); - }); - }); - - q.wait(); -} - -for (int m = 0; m < Big_M; m++) - for (int n = 0; n < Big_N; n++) { - if constexpr (std::is_same::value) { - auto res_device = matrix_ref_mn(m, n, A, B, C); - assert(fabs(2 * (D[m * Big_N + n] - res_device)) / - (D[m * Big_N + n] + res_device) < - bf16_eps * 2); - } else { - assert((D[m * Big_N + n] == - matrix_ref_mn(m, n, A, B, C))); } - } }; int main() { From b178c43ca24056ef5568f2835f4bfa349facf517 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 5 Apr 2022 18:35:46 +0100 Subject: [PATCH 04/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 77 +++++++++++++------------ 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index fe4bfcab61..f3f322aa6a 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -163,43 +163,46 @@ void test() { range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; - cgh.parallel_for< - KernelName>(nd_range<2>(GlobalRange, LocalRange), [= - ](nd_item<2> item)[[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); - const auto m = - item.get_group().get_group_id()[0]; // row id of current submatrix - // of BIG C matrix - const auto n = - item.get_group().get_group_id()[1]; // column id of current - // submatrix of BIG C matrix - - joint_matrix sub_a; - - joint_matrix sub_b; - - joint_matrix - sub_c; - - joint_matrix_load(sg, sub_c, - accC.get_pointer() + (m * 
M) * Big_N + n * N, Big_N); - - for (int k = 0; k < Sub_Tiles_K; - k++) // row/col id of current submatrix of BIG A/B matrices - { - joint_matrix_load( - sg, sub_a, accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); - - joint_matrix_load( - sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); - - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } - joint_matrix_store(sg, sub_c, - accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); - }); - }); + cgh.parallel_for>( + nd_range<2>(GlobalRange, LocalRange), + [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { + sycl::sub_group sg = item.get_sub_group(); + const auto m = + item.get_group().get_group_id()[0]; // row id of current + // submatrix of BIG C matrix + const auto n = + item.get_group().get_group_id()[1]; // column id of current + // submatrix of BIG C matrix + + joint_matrix + sub_a; + + joint_matrix + sub_b; + + joint_matrix + sub_c; + + joint_matrix_load( + sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N); + + for (int k = 0; k < Sub_Tiles_K; + k++) // row/col id of current submatrix of BIG A/B matrices + { + joint_matrix_load(sg, sub_a, + accA.get_pointer() + (k * K) + (m * M * Big_K), + Big_K); + + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * K * Big_N) + (n * N), + Big_N); + + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); + }); q.wait(); } From ca241530a9dc33a95597a1da505d26c134b9ac28 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 5 Apr 2022 18:45:35 +0100 Subject: [PATCH 05/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index f3f322aa6a..109ed10ad3 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -204,7 +204,7 @@ void test() { sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); }); - q.wait(); + q.wait(); } for (int m = 0; m < Big_M; m++) @@ -219,7 +219,7 @@ void test() { matrix_ref_mn(m, n, A, B, C))); } } -}; + }; int main() { From 4013bd06edbbb0cb9524a08f918750f6c73f411b Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Thu, 7 Apr 2022 14:58:31 +0100 Subject: [PATCH 06/21] Do compute capability check at runtime. --- SYCL/BFloat16/bfloat16_builtins.cpp | 35 +++++++------ SYCL/Matrix/joint_matrix_tensorcore.cpp | 67 +++++++++++++------------ 2 files changed, 56 insertions(+), 46 deletions(-) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index 8baad18c61..9320895675 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -1,8 +1,6 @@ // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80 -// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test -// execution once it does. -// RUNx: %t.out +// RUN: %t.out #include @@ -100,19 +98,26 @@ bool check(float a, float b) { int main() { queue q; - std::vector a(N), b(N), c(N); - int err = 0; - for (int i = 0; i < N; i++) { - a[i] = (i - N / 2) / (float)N; - b[i] = (N / 2 - i) / (float)N; - c[i] = (float)(3 * i); + auto computeCapability = + std::stof(q.get_device().get_info()); + // TODO check for "ext_oneapi_bfloat16" aspect instead once aspect is + // supported. 
Since this test only covers CUDA the current check is + // functionally equivalent to "ext_oneapi_bfloat16". + if (computeCapability >= 8.0) { + std::vector a(N), b(N), c(N); + int err = 0; + + for (int i = 0; i < N; i++) { + a[i] = (i - N / 2) / (float)N; + b[i] = (N / 2 - i) / (float)N; + c[i] = (float)(3 * i); + } + + TEST_BUILTIN_1(fabs); + TEST_BUILTIN_2(fmin); + TEST_BUILTIN_2(fmax); + TEST_BUILTIN_3(fma); } - - TEST_BUILTIN_1(fabs); - TEST_BUILTIN_2(fmin); - TEST_BUILTIN_2(fmax); - TEST_BUILTIN_3(fma); - return 0; } diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 109ed10ad3..c184a29046 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -1,9 +1,7 @@ // REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out -// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test -// execution once it does. -// RUNx: %t.out +// RUN: %t.out // // Specifying the sm version via the --cuda-gpu-arch flag is necessary // for the Nvidia case. DPC++ JIT compilation is not @@ -84,7 +82,7 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { template -void test() { +void test(queue& q) { constexpr auto Big_M = Sub_Tiles_M * @@ -129,7 +127,6 @@ void test() { buffer bufC(C, range<1>(Big_M * Big_N)); buffer bufD(D, range<1>(Big_M * Big_N)); - queue q; // currently bfloat16 has to be initialized on device if constexpr (std::is_same::value) { q.submit([&](handler &cgh) { @@ -203,7 +200,7 @@ void test() { joint_matrix_store( sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); }); - + }); q.wait(); } @@ -223,34 +220,42 @@ void test() { int main() { - // A/B half, Accumulator float - test(); - test(); - test(); - - // A/B/Accumulator half - test(); - test(); - test(); - - test(); - test(); - test(); + queue Q; + auto computeCapability = + std::stof(Q.get_device().get_info()); - test(); - test(); - test(); + if (computeCapability >= 7.0) { + // A/B half, Accumulator float + test(Q); + test(Q); + test(Q); - test(); - - // A/B bf16 using storage type - test(); - test(); - test(); + // A/B/Accumulator half + test(Q); + test(Q); + test(Q); + } + if (computeCapability >= 7.2) { + test(Q); + test(Q); + test(Q); + + test( + Q); + test(Q); + test(Q); + } + if (computeCapability >= 8.0) { + test(Q); - test(); - test(); - test(); + // A/B bfloat16 using storage type + test(Q); + test(Q); + test(Q); + test(Q); + test(Q); + test(Q); + } return 0; }; From e4e7f205a9f3cff493ea000ad38ce30fe838d355 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Thu, 7 Apr 2022 15:09:43 +0100 Subject: [PATCH 07/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index c184a29046..70b6eff513 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -82,7 +82,7 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { template -void test(queue& q) { +void test(queue &q) { constexpr auto Big_M = Sub_Tiles_M * @@ -200,11 +200,11 @@ void test(queue& q) { joint_matrix_store( sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); }); - }); - q.wait(); + }); + q.wait(); } - for (int m = 0; m < Big_M; m++) + for (int m = 0; m < Big_M; m++) { for (int n = 0; n < Big_N; n++) { if 
constexpr (std::is_same::value) { auto res_device = matrix_ref_mn(m, n, A, B, C); @@ -216,7 +216,8 @@ void test(queue& q) { matrix_ref_mn(m, n, A, B, C))); } } - }; + } +}; int main() { From 5c5d65c420efbc44ecb80ad4891bf4502e74c75e Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 12 Apr 2022 15:13:02 +0100 Subject: [PATCH 08/21] tests for bfloat16 marray math fcts --- SYCL/BFloat16/bfloat16_builtins.cpp | 158 ++++++++++++++++++++---- SYCL/Matrix/joint_matrix_tensorcore.cpp | 36 +++--- 2 files changed, 152 insertions(+), 42 deletions(-) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index 9320895675..3c3648d917 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -10,20 +10,14 @@ using namespace cl::sycl; using sycl::ext::oneapi::experimental::bfloat16; -constexpr int N = 16 * 3; // divisible by all vector sizes +constexpr int N = 60; // divisible by all tested array sizes constexpr float bf16_eps = 0.00390625; -union conv { - float f; - vec u; - uint32_t u2; -}; - -float from_bf16(uint16_t x) { - conv c; - c.u.y() = x; - c.u.x() = 0; - return c.f; +float make_fp32(uint16_t x) { + uint32_t y = x; + y = y << 16; + auto res = reinterpret_cast(&y); + return *res; } bool check(float a, float b) { @@ -35,10 +29,11 @@ bool check(float a, float b) { buffer a_buf(&a[0], N); \ buffer err_buf(&err, 1); \ q.submit([&](handler &cgh) { \ - auto A = a_buf.get_access(cgh); \ - auto ERR = err_buf.get_access(cgh); \ + accessor A(a_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ - if (check(from_bf16(NAME(bfloat16{A[index]}).raw()), \ + if (check(make_fp32(NAME(bfloat16{A[index]}).raw()), \ NAME(A[index]))) { \ ERR[0] = 1; \ } \ @@ -47,7 +42,37 @@ bool check(float a, float b) { } \ assert(err == 0); -#define TEST_BUILTIN_1(NAME) TEST_BUILTIN_1_SCAL_IMPL(NAME) +#define TEST_BUILTIN_1_ARR_IMPL(NAME, SZ) \ + { \ + buffer a_buf{range<2>{N / SZ, SZ}}; \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N / SZ, [=](id<1> index) { \ + marray arg; \ + for (int i = 0; i < SZ; i++) { \ + arg[i] = A[index][i]; \ + } \ + marray res = NAME(arg); \ + for (int i = 0; i < SZ; i++) { \ + if (check(make_fp32(res[i].raw()), NAME(A[index][i]))) { \ + ERR[0] = 1; \ + } \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_1(NAME) \ + TEST_BUILTIN_1_SCAL_IMPL(NAME) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 1) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 2) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 3) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 4) \ + TEST_BUILTIN_1_ARR_IMPL(NAME, 5) #define TEST_BUILTIN_2_SCAL_IMPL(NAME) \ { \ @@ -55,12 +80,14 @@ bool check(float a, float b) { buffer b_buf(&b[0], N); \ buffer err_buf(&err, 1); \ q.submit([&](handler &cgh) { \ - auto A = a_buf.get_access(cgh); \ - auto B = b_buf.get_access(cgh); \ - auto ERR = err_buf.get_access(cgh); \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ if (check( \ - from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]}).raw()), \ + make_fp32(NAME(bfloat16{A[index]}, bfloat16{B[index]}).raw()), \ NAME(A[index], B[index]))) { \ ERR[0] = 1; \ } \ @@ -69,7 +96,42 @@ bool check(float a, float b) { } \ assert(err == 0); -#define TEST_BUILTIN_2(NAME) TEST_BUILTIN_2_SCAL_IMPL(NAME) +#define TEST_BUILTIN_2_ARR_IMPL(NAME, SZ) \ + { \ + buffer a_buf{range<2>{N / SZ, SZ}}; \ + buffer 
b_buf{range<2>{N / SZ, SZ}}; \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N / SZ, [=](id<1> index) { \ + marray arg0, arg1; \ + for (int i = 0; i < SZ; i++) { \ + arg0[i] = A[index][i]; \ + arg1[i] = B[index][i]; \ + } \ + marray res = NAME(arg0, arg1); \ + for (int i = 0; i < SZ; i++) { \ + if (check(make_fp32(res[i].raw()), \ + NAME(A[index][i], B[index][i]))) { \ + ERR[0] = 1; \ + } \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_2(NAME) \ + TEST_BUILTIN_2_SCAL_IMPL(NAME) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 1) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 2) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 3) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 4) \ + TEST_BUILTIN_2_ARR_IMPL(NAME, 5) #define TEST_BUILTIN_3_SCAL_IMPL(NAME) \ { \ @@ -78,12 +140,15 @@ bool check(float a, float b) { buffer c_buf(&c[0], N); \ buffer err_buf(&err, 1); \ q.submit([&](handler &cgh) { \ - auto A = a_buf.get_access(cgh); \ - auto B = b_buf.get_access(cgh); \ - auto C = c_buf.get_access(cgh); \ - auto ERR = err_buf.get_access(cgh); \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor C(c_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ - if (check(from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \ + if (check(make_fp32(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \ bfloat16{C[index]}) \ .raw()), \ NAME(A[index], B[index], C[index]))) { \ @@ -94,7 +159,46 @@ bool check(float a, float b) { } \ assert(err == 0); -#define TEST_BUILTIN_3(NAME) TEST_BUILTIN_3_SCAL_IMPL(NAME) +#define TEST_BUILTIN_3_ARR_IMPL(NAME, SZ) \ + { \ + buffer a_buf{range<2>{N / SZ, SZ}}; \ + buffer b_buf{range<2>{N / SZ, SZ}}; \ + buffer c_buf{range<2>{N / SZ, SZ}}; \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor A(a_buf, \ + cgh); \ + accessor B(b_buf, \ + cgh); \ + accessor C(c_buf, \ + cgh); \ + accessor ERR(err_buf, cgh); \ + cgh.parallel_for(N / SZ, [=](id<1> index) { \ + marray arg0, arg1, arg2; \ + for (int i = 0; i < SZ; i++) { \ + arg0[i] = A[index][i]; \ + arg1[i] = B[index][i]; \ + arg2[i] = C[index][i]; \ + } \ + marray res = NAME(arg0, arg1, arg2); \ + for (int i = 0; i < SZ; i++) { \ + if (check(make_fp32(res[i].raw()), \ + NAME(A[index][i], B[index][i], C[index][i]))) { \ + ERR[0] = 1; \ + } \ + } \ + }); \ + }); \ + } \ + assert(err == 0); + +#define TEST_BUILTIN_3(NAME) \ + TEST_BUILTIN_3_SCAL_IMPL(NAME) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 1) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 2) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 3) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 4) \ + TEST_BUILTIN_3_ARR_IMPL(NAME, 5) int main() { queue q; diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 70b6eff513..f0a92fc11b 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -46,17 +46,17 @@ class TypeHelper; template using KernelName = class TypeHelper; -float make_fp32(short x) { - unsigned int y = x; +float make_fp32(uint16_t x) { + uint32_t y = x; y = y << 16; - float *res = reinterpret_cast(&y); + auto res = reinterpret_cast(&y); return *res; } -unsigned short make_bf16(float x) { - int *res = reinterpret_cast(&x); +uint16_t make_bf16(float x) { + auto res = reinterpret_cast(&x); *res = *res >> 16; - return (unsigned short)*res; + return (uint16_t)*res; } template @@ -130,7 +130,8 @@ void test(queue &q) { // currently bfloat16 has to be 
initialized on device if constexpr (std::is_same::value) { q.submit([&](handler &cgh) { - auto accA = bufA.template get_access(cgh); + accessor + accA(bufA, cgh); cgh.parallel_for>( range<1>(Big_M * Big_K), [=](item<1> item) { @@ -140,7 +141,8 @@ void test(queue &q) { }); q.submit([&](handler &cgh) { - auto accB = bufB.template get_access(cgh); + accessor + accB(bufB, cgh); cgh.parallel_for>( range<1>(Big_K * Big_N), [=](item<1> item) { @@ -151,10 +153,14 @@ void test(queue &q) { } q.submit([&](handler &cgh) { - auto accC = bufC.template get_access(cgh); - auto accA = bufA.template get_access(cgh); - auto accB = bufB.template get_access(cgh); - auto accD = bufD.template get_access(cgh); + accessor + accA(bufA, cgh); + accessor + accB(bufB, cgh); + accessor + accC(bufC, cgh); + accessor + accD(bufD, cgh); range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; range<2> GlobalRange = {Sub_Tiles_M, @@ -162,8 +168,8 @@ void test(queue &q) { cgh.parallel_for>( nd_range<2>(GlobalRange, LocalRange), - [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { - sycl::sub_group sg = item.get_sub_group(); + [=](nd_item<2> item) { + sub_group sg = item.get_sub_group(); const auto m = item.get_group().get_group_id()[0]; // row id of current // submatrix of BIG C matrix @@ -223,7 +229,7 @@ int main() { queue Q; auto computeCapability = - std::stof(Q.get_device().get_info()); + std::stof(Q.get_device().get_info()); if (computeCapability >= 7.0) { // A/B half, Accumulator float From 6856b89066432ed082dc2b8d8e833c2c02b6b576 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 12 Apr 2022 15:40:25 +0100 Subject: [PATCH 09/21] format --- SYCL/Matrix/joint_matrix_tensorcore.cpp | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index f0a92fc11b..a11e3b3306 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -130,8 +130,8 @@ void test(queue &q) { // currently bfloat16 has to be initialized on device if constexpr (std::is_same::value) { q.submit([&](handler &cgh) { - accessor - accA(bufA, cgh); + accessor accA(bufA, + cgh); cgh.parallel_for>( range<1>(Big_M * Big_K), [=](item<1> item) { @@ -141,8 +141,8 @@ void test(queue &q) { }); q.submit([&](handler &cgh) { - accessor - accB(bufB, cgh); + accessor accB(bufB, + cgh); cgh.parallel_for>( range<1>(Big_K * Big_N), [=](item<1> item) { @@ -153,22 +153,17 @@ void test(queue &q) { } q.submit([&](handler &cgh) { - accessor - accA(bufA, cgh); - accessor - accB(bufB, cgh); - accessor - accC(bufC, cgh); - accessor - accD(bufD, cgh); + accessor accA(bufA, cgh); + accessor accB(bufB, cgh); + accessor accC(bufC, cgh); + accessor accD(bufD, cgh); range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; cgh.parallel_for>( - nd_range<2>(GlobalRange, LocalRange), - [=](nd_item<2> item) { + nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) { sub_group sg = item.get_sub_group(); const auto m = item.get_group().get_group_id()[0]; // row id of current From 4e1d6e46af60671b8bf58b5292af74b02689490d Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Fri, 15 Apr 2022 11:36:46 +0100 Subject: [PATCH 10/21] test case for expected NAN behaviour. 
--- SYCL/BFloat16/bfloat16_builtins.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index 3c3648d917..6150472c3c 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -200,6 +200,26 @@ bool check(float a, float b) { TEST_BUILTIN_3_ARR_IMPL(NAME, 4) \ TEST_BUILTIN_3_ARR_IMPL(NAME, 5) +#define TEST_BUILTIN_2_NAN(NAME) \ + { \ + buffer err_buf(&err, 1); \ + buffer nan_buf(&check_nan, 1); \ + q.submit([&](handler &cgh) { \ + accessor ERR(err_buf, cgh); \ + accessor checkNAN( \ + nan_buf, cgh); \ + cgh.single_task([=]() { \ + checkNAN[0] = make_fp32(NAME(bfloat16{NAN}, bfloat16{NAN}).raw()); \ + if ((make_fp32(NAME(bfloat16{2}, bfloat16{NAN}).raw()) != 2) || \ + (make_fp32(NAME(bfloat16{NAN}, bfloat16{2}).raw()) != 2)) { \ + ERR[0] = 1; \ + } \ + }); \ + }); \ + } \ + assert(err == 0); \ + assert(std::isnan(check_nan)); + int main() { queue q; @@ -222,6 +242,10 @@ int main() { TEST_BUILTIN_2(fmin); TEST_BUILTIN_2(fmax); TEST_BUILTIN_3(fma); + + float check_nan = 0; + TEST_BUILTIN_2_NAN(fmin); + TEST_BUILTIN_2_NAN(fmax); } return 0; } From 095f7752b208c3ce195c533ddbaeb150bd9c3dfb Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 31 May 2022 10:37:06 +0100 Subject: [PATCH 11/21] Added tests for elem wise ops. Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_all_ops_cuda.cpp | 185 ++++++++++++++++++++++ SYCL/Matrix/joint_matrix_tensorcore.cpp | 11 +- 2 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 SYCL/Matrix/element_wise_all_ops_cuda.cpp diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp new file mode 100644 index 0000000000..7c8887a054 --- /dev/null +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -0,0 +1,185 @@ +//==----------- element_wise_all_ops_cuda.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: cuda + +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %t.out + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::experimental::bfloat16; + +#define SG_SZ 32 +constexpr size_t nWGperDim = 2; + +class Logical {}; + +template +class KernelName; + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void assert_ops_ref(T *C, const float ref) { + for (size_t i = 0; i < M; i++) + for (size_t j = 0; j < N; j++) { + auto diff = C[i + j * M] - ref; + assert(std::fabs(static_cast(diff)) < + std::numeric_limits::epsilon()); + } +} +template +void matrix_verify_op(queue q, big_matrix &C, + nd_range<2> &r, const float ref, Operation Op) { + { + buffer bufC(C.get_data(), range<2>(N * nWGperDim, M * nWGperDim)); + + q.submit([&](handler &cgh) { + accessor accC(bufC, + cgh); + + cgh.parallel_for>( + r, [ accC, + Op ](nd_item<2> spmd_item)[[sycl::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + auto sg = spmd_item.get_sub_group(); + + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix sub_c; + + joint_matrix_fill(sg, sub_a, 3); + joint_matrix_fill(sg, sub_b, 1); + joint_matrix_fill(sg, sub_c, -80); + + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + if constexpr (std::is_same_v) { + if (wi_slice_a[i]) { + if (wi_slice_a[i] > 2.0 || wi_slice_a[i] >= 3.0 || + wi_slice_a[i] < 4.0 || wi_slice_a[i] <= 3.0) { + T val = (wi_slice_a[i] != (2.0)) ? 
wi_slice_a[i] + : static_cast(2.0); + val = ((val) - (1)); + val = ((val) + (1)); + if (wi_slice_a[i] == (2.0)) { + val = ((val) - (2)); + val = ((val) * (3)); + val = ((val) / (2)); + + } else { + val = ((val) + (2)); + } + wi_slice_a[i] = val; + } + } + } else { + wi_slice_a[i] = Op(wi_slice_a[i], 2); + } + } + + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + + joint_matrix_store(sg, sub_c, + accC.get_pointer() + + (sg_startx * M) * (N * nWGperDim) + + sg_starty / SG_SZ * N, + (N * nWGperDim)); + }); // parallel for + }) + .wait(); + } + assert_ops_ref(C.get_data(), ref); +} + +static constexpr size_t MATRIX_M = 16 * nWGperDim; +static constexpr size_t MATRIX_N = 16 * nWGperDim; + +int main() { + + float D[MATRIX_M][MATRIX_N]; + big_matrix MD_f((float *)&D); + + queue q; + auto computeCapability = + std::stof(q.get_device().get_info()); + nd_range<2> r({nWGperDim, nWGperDim * SG_SZ}, {1, 1 * SG_SZ}); + + if (computeCapability >= 7.0) { + matrix_verify_op(q, MD_f, r, 0.0, + std::plus{}); + matrix_verify_op(q, MD_f, r, 0.0, Logical{}); + matrix_verify_op(q, MD_f, r, 16.0, + std::multiplies{}); + matrix_verify_op(q, MD_f, r, -56.0, + std::divides{}); + matrix_verify_op(q, MD_f, r, -64.0, + std::minus{}); + } + + if (computeCapability >= 7.2) { + int32_t D_i[MATRIX_M][MATRIX_N]; + big_matrix MD_i((int32_t *)&D_i); + matrix_verify_op(q, MD_i, r, 0, + std::plus{}); + matrix_verify_op(q, MD_i, r, 16, + std::multiplies{}); + matrix_verify_op(q, MD_i, r, -64, + std::minus{}); + matrix_verify_op(q, MD_i, r, 0, + std::plus{}); + matrix_verify_op(q, MD_i, r, 0.0, Logical{}); + matrix_verify_op(q, MD_i, r, 16, + std::multiplies{}); + matrix_verify_op(q, MD_i, r, -64, + std::minus{}); + } + + if (computeCapability >= 8.0) { + + matrix_verify_op(q, MD_f, r, 0.0, + std::plus{}); + matrix_verify_op(q, MD_f, r, 0.0, Logical{}); + matrix_verify_op(q, MD_f, r, 16.0, + std::multiplies{}); + matrix_verify_op(q, MD_f, r, -56.0, + std::divides{}); + matrix_verify_op(q, MD_f, r, -64.0, + std::minus{}); + + double D_d[MATRIX_M / 2][MATRIX_N / 2]; + big_matrix MD_d((double *)&D_d); + + matrix_verify_op(q, MD_d, r, -60.0, + std::plus{}); + matrix_verify_op(q, MD_d, r, -60.0, Logical{}); + matrix_verify_op(q, MD_d, r, -56.0, + std::multiplies{}); + matrix_verify_op(q, MD_d, r, -74.0, + std::divides{}); + matrix_verify_op(q, MD_d, r, -76.0, + std::minus{}); + } + + return 0; +} diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index a11e3b3306..b41739646d 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -68,8 +68,8 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]); } else if constexpr (std::is_same::value) { for (int k = 0; k < Big_K; k++) - res += - make_fp32(A[m * Big_K + k].raw()) * make_fp32(B[k * Big_N + n].raw()); + res += (make_fp32(A[m * Big_K + k].raw()) * 2 + 1) * + make_fp32(B[k * Big_N + n].raw()); } else { for (int k = 0; k < Big_K; k++) @@ -192,6 +192,13 @@ void test(queue &q) { accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); + if constexpr (std::is_same::value) { + marray b, c; + b = 2; + c = 1; + sub_a.wi_marray = sycl::ext::oneapi::experimental::fma(sub_a.wi_marray, b, c); + } + joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); From 5347f7ef2f7725b2a7e9a40e67cc294278d58d36 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 31 May 2022 11:10:28 +0100 Subject: [PATCH 
12/21] format Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_all_ops_cuda.cpp | 7 +++---- SYCL/Matrix/joint_matrix_tensorcore.cpp | 3 ++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp index 7c8887a054..13b40d5417 100644 --- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -55,8 +55,8 @@ void matrix_verify_op(queue q, big_matrix &C, cgh); cgh.parallel_for>( - r, [ accC, - Op ](nd_item<2> spmd_item)[[sycl::reqd_sub_group_size(SG_SZ)]] { + r, [accC, + Op](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] { const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); const auto sg_startx = global_idx - spmd_item.get_local_id(0); @@ -106,8 +106,7 @@ void matrix_verify_op(queue q, big_matrix &C, sg_starty / SG_SZ * N, (N * nWGperDim)); }); // parallel for - }) - .wait(); + }).wait(); } assert_ops_ref(C.get_data(), ref); } diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index b41739646d..07a80ca118 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -196,7 +196,8 @@ void test(queue &q) { marray b, c; b = 2; c = 1; - sub_a.wi_marray = sycl::ext::oneapi::experimental::fma(sub_a.wi_marray, b, c); + sub_a.wi_marray = + sycl::ext::oneapi::experimental::fma(sub_a.wi_marray, b, c); } joint_matrix_load(sg, sub_b, From a249c7ed7b797a44c1260280a3e69651f83c4da0 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 31 May 2022 12:15:09 +0100 Subject: [PATCH 13/21] format Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_all_ops_cuda.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp index 13b40d5417..7b7f28ca9c 100644 --- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// // REQUIRES: cuda - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out From b9ebfe95392f20d1a97d318f9c009260a29c408e Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 8 Jun 2022 10:15:00 +0100 Subject: [PATCH 14/21] Use implicit cast to float. 
Signed-off-by: JackAKirk --- SYCL/BFloat16/bfloat16_builtins.cpp | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index 6150472c3c..eac5df3fde 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -2,7 +2,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80 // RUN: %t.out -#include +#include #include #include @@ -33,8 +33,7 @@ bool check(float a, float b) { cgh); \ accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ - if (check(make_fp32(NAME(bfloat16{A[index]}).raw()), \ - NAME(A[index]))) { \ + if (check(NAME(bfloat16{A[index]}), NAME(A[index]))) { \ ERR[0] = 1; \ } \ }); \ @@ -57,7 +56,7 @@ bool check(float a, float b) { } \ marray res = NAME(arg); \ for (int i = 0; i < SZ; i++) { \ - if (check(make_fp32(res[i].raw()), NAME(A[index][i]))) { \ + if (check(res[i], NAME(A[index][i]))) { \ ERR[0] = 1; \ } \ } \ @@ -86,9 +85,8 @@ bool check(float a, float b) { cgh); \ accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ - if (check( \ - make_fp32(NAME(bfloat16{A[index]}, bfloat16{B[index]}).raw()), \ - NAME(A[index], B[index]))) { \ + if (check(NAME(bfloat16{A[index]}, bfloat16{B[index]}), \ + NAME(A[index], B[index]))) { \ ERR[0] = 1; \ } \ }); \ @@ -115,8 +113,7 @@ bool check(float a, float b) { } \ marray res = NAME(arg0, arg1); \ for (int i = 0; i < SZ; i++) { \ - if (check(make_fp32(res[i].raw()), \ - NAME(A[index][i], B[index][i]))) { \ + if (check(res[i], NAME(A[index][i], B[index][i]))) { \ ERR[0] = 1; \ } \ } \ @@ -148,9 +145,8 @@ bool check(float a, float b) { cgh); \ accessor ERR(err_buf, cgh); \ cgh.parallel_for(N, [=](id<1> index) { \ - if (check(make_fp32(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \ - bfloat16{C[index]}) \ - .raw()), \ + if (check(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \ + bfloat16{C[index]}), \ NAME(A[index], B[index], C[index]))) { \ ERR[0] = 1; \ } \ @@ -182,8 +178,7 @@ bool check(float a, float b) { } \ marray res = NAME(arg0, arg1, arg2); \ for (int i = 0; i < SZ; i++) { \ - if (check(make_fp32(res[i].raw()), \ - NAME(A[index][i], B[index][i], C[index][i]))) { \ + if (check(res[i], NAME(A[index][i], B[index][i], C[index][i]))) { \ ERR[0] = 1; \ } \ } \ @@ -209,9 +204,9 @@ bool check(float a, float b) { accessor checkNAN( \ nan_buf, cgh); \ cgh.single_task([=]() { \ - checkNAN[0] = make_fp32(NAME(bfloat16{NAN}, bfloat16{NAN}).raw()); \ - if ((make_fp32(NAME(bfloat16{2}, bfloat16{NAN}).raw()) != 2) || \ - (make_fp32(NAME(bfloat16{NAN}, bfloat16{2}).raw()) != 2)) { \ + checkNAN[0] = NAME(bfloat16{NAN}, bfloat16{NAN}); \ + if ((NAME(bfloat16{2}, bfloat16{NAN}) != 2) || \ + (NAME(bfloat16{NAN}, bfloat16{2}) != 2)) { \ ERR[0] = 1; \ } \ }); \ From 5ea081ad7a2678a977a042bc3e88092360e09875 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 8 Jun 2022 11:39:47 +0100 Subject: [PATCH 15/21] Adds separate test comparing wi_marray with get_wi_data usage. 
Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_wi_marray.cpp | 67 +++++++++++++++++++++++++ SYCL/Matrix/joint_matrix_tensorcore.cpp | 10 +--- 2 files changed, 68 insertions(+), 9 deletions(-) create mode 100644 SYCL/Matrix/element_wise_wi_marray.cpp diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp new file mode 100644 index 0000000000..e4d78c22d1 --- /dev/null +++ b/SYCL/Matrix/element_wise_wi_marray.cpp @@ -0,0 +1,67 @@ +//==----------- element_wise_wi_marray.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: cuda +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %t.out + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using sycl::ext::oneapi::experimental::bfloat16; + +#define SG_SZ 32 + +template void verify_wi_marray(queue q) { + int err = 0; + { + buffer err_buf(&err, 1); + q.submit([&](handler &cgh) { + accessor ERR(err_buf, cgh); + + cgh.parallel_for( + nd_range<2>({1, 1 * SG_SZ}, {1, 1 * SG_SZ}), + [ERR](nd_item<2> spmd_item)[[sycl::reqd_sub_group_size(SG_SZ)]] { + auto sg = spmd_item.get_sub_group(); + + joint_matrix sub_a; + joint_matrix sub_a_2; + + joint_matrix_fill(sg, sub_a, -1); + joint_matrix_fill(sg, sub_a_2, -1); + + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = fabs(wi_slice_a[i]); + } + sub_a_2.wi_marray = fabs(sub_a_2.wi_marray); + + for (int i = 0; i < sub_a_2.wi_marray.size(); i++) { + if (sub_a_2.wi_marray[i] != wi_slice_a[i]) { + ERR[0] = 1; + } + } + }); // parallel for + }) + .wait(); + } + assert(err == 0); +} + +int main() { + + queue q; + auto computeCapability = + std::stof(q.get_device().get_info()); + + if (computeCapability >= 8.0) { + verify_wi_marray(q); + } + + return 0; +} diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 07a80ca118..678b4ef25d 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -68,7 +68,7 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]); } else if constexpr (std::is_same::value) { for (int k = 0; k < Big_K; k++) - res += (make_fp32(A[m * Big_K + k].raw()) * 2 + 1) * + res += make_fp32(A[m * Big_K + k].raw()) * make_fp32(B[k * Big_N + n].raw()); } else { for (int k = 0; k < Big_K; k++) @@ -192,14 +192,6 @@ void test(queue &q) { accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K); - if constexpr (std::is_same::value) { - marray b, c; - b = 2; - c = 1; - sub_a.wi_marray = - sycl::ext::oneapi::experimental::fma(sub_a.wi_marray, b, c); - } - joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * K * Big_N) + (n * N), Big_N); From 0ede8812f2261126941b8964e8437351b82dfa9f Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 8 Jun 2022 12:27:10 +0100 Subject: [PATCH 16/21] format Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_all_ops_cuda.cpp | 1 + SYCL/Matrix/element_wise_wi_marray.cpp | 6 +++--- SYCL/Matrix/joint_matrix_tensorcore.cpp | 5 ++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp index 7b7f28ca9c..13b40d5417 100644 --- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// // REQUIRES: cuda + // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp index e4d78c22d1..890d342b48 100644 --- a/SYCL/Matrix/element_wise_wi_marray.cpp +++ b/SYCL/Matrix/element_wise_wi_marray.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// // REQUIRES: cuda + // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out @@ -26,7 +27,7 @@ template void verify_wi_marray(queue q) { cgh.parallel_for( nd_range<2>({1, 1 * SG_SZ}, {1, 1 * SG_SZ}), - [ERR](nd_item<2> spmd_item)[[sycl::reqd_sub_group_size(SG_SZ)]] { + [ERR](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] { auto sg = spmd_item.get_sub_group(); joint_matrix sub_a; @@ -47,8 +48,7 @@ template void verify_wi_marray(queue q) { } } }); // parallel for - }) - .wait(); + }).wait(); } assert(err == 0); } diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 678b4ef25d..507dae07e4 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -1,5 +1,4 @@ // REQUIRES: cuda - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out // @@ -68,8 +67,8 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) { res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]); } else if constexpr (std::is_same::value) { for (int k = 0; k < Big_K; k++) - res += make_fp32(A[m * Big_K + k].raw()) * - make_fp32(B[k * Big_N + n].raw()); + res += + make_fp32(A[m * Big_K + k].raw()) * make_fp32(B[k * Big_N + n].raw()); } else { for (int k = 0; k < Big_K; k++) From 473f88b09c732b41f99620efa71751090730bafb Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 9 Jun 2022 16:37:08 +0100 Subject: [PATCH 17/21] format Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_wi_marray.cpp | 2 +- SYCL/Matrix/joint_matrix_tensorcore.cpp | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp index 890d342b48..6ab3947ed9 100644 --- a/SYCL/Matrix/element_wise_wi_marray.cpp +++ b/SYCL/Matrix/element_wise_wi_marray.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: cuda -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out #include diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 1a03d855a0..3d417dcc63 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -195,15 +195,15 @@ void test(queue &q) { accB.get_pointer() + (k * K * Big_N) 
+ (n * N), Big_N); - // round values to correct precision if using tf32 - if constexpr (std::is_same::value) { - auto wi_size = sub_a.wi_marray.size(); - assert(wi_size == sub_b.wi_marray.size()); - for (auto i = 0; i < wi_size; ++i) { - sub_a.wi_marray[i] = round_to_tf32(sub_a.wi_marray[i]); - sub_b.wi_marray[i] = round_to_tf32(sub_b.wi_marray[i]); + // round values to correct precision if using tf32 + if constexpr (std::is_same::value) { + auto wi_size = sub_a.wi_marray.size(); + assert(wi_size == sub_b.wi_marray.size()); + for (auto i = 0; i < wi_size; ++i) { + sub_a.wi_marray[i] = round_to_tf32(sub_a.wi_marray[i]); + sub_b.wi_marray[i] = round_to_tf32(sub_b.wi_marray[i]); + } } - } sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); } From e817dbab99d0e2ce1f3e8f37d79d9f8cc0c56cfa Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 9 Jun 2022 16:45:13 +0100 Subject: [PATCH 18/21] format Signed-off-by: JackAKirk --- SYCL/Matrix/element_wise_all_ops_cuda.cpp | 2 +- SYCL/Matrix/joint_matrix_tensorcore.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp index 13b40d5417..69976fa7e4 100644 --- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: cuda -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out // RUN: %t.out #include diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp index 3d417dcc63..2b5078d415 100644 --- a/SYCL/Matrix/joint_matrix_tensorcore.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp @@ -267,7 +267,7 @@ int main() { test(Q); test(Q); test(Q); - + // A/B tf32 test(Q); From 1019a8d1c478ba5f096ce14497b6df4cbeefb492 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 28 Jun 2022 10:00:59 +0100 Subject: [PATCH 19/21] Use ext_oneapi_bfloat16 aspect where appropriate. Signed-off-by: JackAKirk --- SYCL/BFloat16/bfloat16_builtins.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index eac5df3fde..f9e2fd7977 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -218,12 +218,7 @@ bool check(float a, float b) { int main() { queue q; - auto computeCapability = - std::stof(q.get_device().get_info()); - // TODO check for "ext_oneapi_bfloat16" aspect instead once aspect is - // supported. Since this test only covers CUDA the current check is - // functionally equivalent to "ext_oneapi_bfloat16". 
- if (computeCapability >= 8.0) { + if (q.get_device().has(aspect::ext_oneapi_bfloat16)) { std::vector a(N), b(N), c(N); int err = 0; From 3506d296a9346cc88b7a5fd94a3bd185083d839e Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 28 Jun 2022 17:58:03 +0100 Subject: [PATCH 20/21] removed requires cuda (rely on bfloat16 aspect instead) Signed-off-by: JackAKirk --- SYCL/BFloat16/bfloat16_builtins.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index f9e2fd7977..ef091087d2 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -1,4 +1,3 @@ -// REQUIRES: cuda // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80 // RUN: %t.out From 74787938e1e1c64ee46da23f0a02e2ab3439e450 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 29 Jun 2022 09:37:45 +0100 Subject: [PATCH 21/21] Noted test doesn't compile for other backends. --- SYCL/BFloat16/bfloat16_builtins.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp index ef091087d2..ff84ecbeb3 100644 --- a/SYCL/BFloat16/bfloat16_builtins.cpp +++ b/SYCL/BFloat16/bfloat16_builtins.cpp @@ -1,3 +1,9 @@ +// REQUIRES: cuda +// +// Currently this test fails to compile for backends other than cuda. +// Other backends could use this test when bfloat16 math function support is +// added. +// // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80 // RUN: %t.out
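
Note for reviewers unfamiliar with the extension under test: the pattern that bfloat16_builtins.cpp converges on over this series (runtime gating on the ext_oneapi_bfloat16 aspect, scalar bfloat16 math builtins checked against a float reference, with marray overloads following the same shape) reduces to the minimal standalone sketch below. This is an illustration only, not part of the patches: the header name, the sycl::ext::oneapi::experimental namespace for fma, the aspect name, and the tolerance constant are taken from the diffs above and from the experimental bfloat16 extension as of this series, and are assumptions that may differ in later DPC++ releases. It is expected to build only with a CUDA (sm_80) DPC++ toolchain, like the tests themselves.

    // Minimal sketch of the tested pattern; names assumed from the patches above.
    #include <sycl/sycl.hpp> // header name assumed; older DPC++ used <CL/sycl.hpp>
    #include <cassert>
    #include <cmath>
    #include <vector>

    namespace exp = sycl::ext::oneapi::experimental; // namespace used in the patches

    int main() {
      sycl::queue q;
      // Gate on the bfloat16 aspect, as the later patches in this series do,
      // instead of parsing the compute-capability string.
      if (!q.get_device().has(sycl::aspect::ext_oneapi_bfloat16))
        return 0;

      constexpr int N = 60;
      std::vector<float> a(N), b(N), c(N), out(N);
      for (int i = 0; i < N; i++) {
        a[i] = (i - N / 2) / (float)N;
        b[i] = (N / 2 - i) / (float)N;
        c[i] = (float)(3 * i);
      }

      { // buffer scope: destruction writes results back to `out`
        sycl::buffer<float, 1> a_buf(a.data(), N), b_buf(b.data(), N),
            c_buf(c.data(), N), o_buf(out.data(), N);
        q.submit([&](sycl::handler &cgh) {
          sycl::accessor A(a_buf, cgh, sycl::read_only);
          sycl::accessor B(b_buf, cgh, sycl::read_only);
          sycl::accessor C(c_buf, cgh, sycl::read_only);
          sycl::accessor O(o_buf, cgh, sycl::write_only);
          cgh.parallel_for<class Bf16FmaSketch>(
              sycl::range<1>(N), [=](sycl::id<1> i) {
                // Scalar bfloat16 builtin; the result converts back to float
                // implicitly, as patch 14 of this series relies on.
                exp::bfloat16 x{A[i]}, y{B[i]}, z{C[i]};
                O[i] = exp::fma(x, y, z);
              });
        });
      }

      // Host-side check against the float reference, using the tolerance
      // constant from the tests (bf16_eps) in a deliberately loose mixed
      // absolute/relative form.
      constexpr float bf16_eps = 0.00390625f;
      for (int i = 0; i < N; i++) {
        float ref = std::fma(a[i], b[i], c[i]);
        assert(std::fabs(out[i] - ref) <=
               2 * bf16_eps * std::fabs(ref) + bf16_eps);
      }
      return 0;
    }

The marray overloads exercised by the TEST_BUILTIN_*_ARR_IMPL macros follow the same call shape with sycl::marray<bfloat16, SZ> arguments in place of scalars.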