From 7e38af53be589a86e0dc1314cab194e8866eda3b Mon Sep 17 00:00:00 2001
From: Tadej Ciglarič
Date: Thu, 3 Mar 2022 12:57:19 +0000
Subject: [PATCH 1/3] [SYCL] add tests for bf16 builtins operating on storage types

---
 SYCL/BFloat16/bf16_storage_builtins.cpp | 444 ++++++++++++++++++++++++
 1 file changed, 444 insertions(+)
 create mode 100644 SYCL/BFloat16/bf16_storage_builtins.cpp

diff --git a/SYCL/BFloat16/bf16_storage_builtins.cpp b/SYCL/BFloat16/bf16_storage_builtins.cpp
new file mode 100644
index 0000000000..55ebeafc64
--- /dev/null
+++ b/SYCL/BFloat16/bf16_storage_builtins.cpp
@@ -0,0 +1,444 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// Only cuda backend implements bf16
// REQUIRES: cuda

#include

#include
#include

using namespace cl::sycl;
using namespace cl::sycl::ext::oneapi;

constexpr int N = 16 * 3; // divisible by all vector sizes
constexpr float bf16_eps = 0.00390625;

union conv {
  float f;
  vec<uint16_t, 2> u;
  uint32_t u2;
};

uint16_t to_bf16(float x) {
  conv c;
  c.f = x;
  return c.u.y();
}

uint32_t to_bf16x2(float x, float y) {
  conv c1;
  c1.f = x;
  conv c2;
  c2.f = y;
  conv c3;
  c3.u.x() = c1.u.y();
  c3.u.y() = c2.u.y();
  return c3.u2;
}

float from_bf16(uint16_t x) {
  conv c;
  c.u.y() = x;
  c.u.x() = 0;
  return c.f;
}

float2 from_bf16x2(uint32_t x) {
  conv c;
  c.u2 = x;
  conv c2;
  c2.u.x() = 0;
  c2.u.y() = c.u.x();
  float2 res;
  res.x() = c2.f;
  c2.u.y() = c.u.y();
  res.y() = c2.f;
  return res;
}

bool check(float a, float b) {
  return fabs(2 * (a - b) / (a + b)) > bf16_eps * 2;
}

#define TEST_BUILTIN_1_VEC_IMPL(NAME, SZ) \
  { \
    buffer<float##SZ> a_buf((float##SZ *)&a[0], N / SZ); \
    buffer<int> err_buf(&err, 1); \
    q.submit([&](handler &cgh) { \
      auto A = a_buf.get_access(cgh); \
      auto ERR = err_buf.get_access(cgh); \
      cgh.parallel_for(N / SZ, [=](id<1> index) { \
        vec<uint16_t, SZ> arg; \
        for (int i = 0; i < SZ; i++) { \
          arg[i] = to_bf16(A[index][i]); \
        } \
        vec<uint16_t, SZ> res = NAME(arg); \
        for (int i = 0; i < SZ; i++) { \
          if (check(from_bf16(res[i]), NAME(A[index][i]))) { \
            ERR[0] = 1; \
          } \
        } \
        if (index % 2 == 0) { \
          vec<uint32_t, SZ> x2_arg; \
          for (int i = 0; i < SZ; i++) { \
            x2_arg[i] = to_bf16x2(A[index][i], A[index + 1][i]); \
          } \
          vec<uint32_t, SZ> x2_res = NAME(x2_arg); \
          for (int i = 0; i < SZ; i++) { \
            float2 res2 = from_bf16x2(x2_res[i]); \
            if (check(res2.x(), NAME(A[index][i])) || \
                check(res2.y(), NAME(A[index + 1][i]))) { \
              ERR[0] = 1; \
            } \
          } \
        } \
      }); \
    }); \
  } \
  assert(err == 0);

// vectors of size 3 need a separate test, as they actually have the size of 4
// floats
#define TEST_BUILTIN_1_VEC3_IMPL(NAME) \
  { \
    buffer<float3> a_buf((float3 *)&a[0], N / 4); \
    buffer<int> err_buf(&err, 1); \
    q.submit([&](handler &cgh) { \
      auto A = a_buf.get_access(cgh); \
      auto ERR = err_buf.get_access(cgh); \
      cgh.parallel_for(N / 4, [=](id<1> index) { \
        vec<uint16_t, 3> arg; \
        for (int i = 0; i < 3; i++) { \
          arg[i] = to_bf16(A[index][i]); \
        } \
        vec<uint16_t, 3> res = NAME(arg); \
        for (int i = 0; i < 3; i++) { \
          if (check(from_bf16(res[i]), NAME(A[index][i]))) { \
            ERR[0] = 1; \
          } \
        } \
        if (index % 2 == 0) { \
          vec<uint32_t, 3> x2_arg; \
          for (int i = 0; i < 3; i++) { \
            x2_arg[i] = to_bf16x2(A[index][i], A[index + 1][i]); \
          } \
          vec<uint32_t, 3> x2_res = NAME(x2_arg); \
          for (int i = 0; i < 3; i++) { \
            float2 res2 = from_bf16x2(x2_res[i]); \
            if (check(res2.x(), NAME(A[index][i])) || \
                check(res2.y(), NAME(A[index + 1][i]))) { \
              ERR[0] = 1; \
            } \
          } \
        } \
      }); \
    }); \
  } \
  assert(err == 0);

#define TEST_BUILTIN_1_SCAL_IMPL(NAME) \
  { \
    buffer<float> a_buf(&a[0], N); \
    buffer<int> err_buf(&err, 1); \
    q.submit([&](handler &cgh) { \
      auto A = a_buf.get_access(cgh); \
      auto ERR = err_buf.get_access(cgh); \
      cgh.parallel_for(N, [=](id<1> index) { \
        if (check(from_bf16(NAME(to_bf16(A[index]))), NAME(A[index]))) { \
          ERR[0] = 1; \
        } \
        if (index % 2 == 0) { \
          float2 res = from_bf16x2(NAME(to_bf16x2(A[index], A[index + 1]))); \
          if (check(res.x(), NAME(A[index])) || \
              check(res.y(), NAME(A[index + 1]))) { \
            ERR[0] = 1; \
          } \
        } \
      }); \
    }); \
  } \
  assert(err == 0);

#define TEST_BUILTIN_1(NAME) \
  TEST_BUILTIN_1_SCAL_IMPL(NAME) \
  TEST_BUILTIN_1_VEC_IMPL(NAME, 2) \
  TEST_BUILTIN_1_VEC3_IMPL(NAME) \
  TEST_BUILTIN_1_VEC_IMPL(NAME, 4) \
  TEST_BUILTIN_1_VEC_IMPL(NAME, 8) \
  TEST_BUILTIN_1_VEC_IMPL(NAME, 16)

#define TEST_BUILTIN_2_VEC_IMPL(NAME, SZ) \
  { \
    buffer<float##SZ> a_buf((float##SZ *)&a[0], N / SZ); \
    buffer<float##SZ> b_buf((float##SZ *)&b[0], N / SZ); \
    buffer<int> err_buf(&err, 1); \
    q.submit([&](handler &cgh) { \
      auto A = a_buf.get_access(cgh); \
      auto B = b_buf.get_access(cgh); \
      auto ERR = err_buf.get_access(cgh); \
      cgh.parallel_for(N / SZ, [=](id<1> index) { \
        vec<uint16_t, SZ> arg0, arg1; \
        for (int i = 0; i < SZ; i++) { \
          arg0[i] = to_bf16(A[index][i]); \
          arg1[i] = to_bf16(B[index][i]); \
        } \
        vec<uint16_t, SZ> res = NAME(arg0, arg1); \
        for (int i = 0; i < SZ; i++) { \
          if (check(from_bf16(res[i]), NAME(A[index][i], B[index][i]))) { \
            ERR[0] = 1; \
          } \
        } \
        if (index % 2 == 0) { \
          vec<uint32_t, SZ> x2_arg0, x2_arg1; \
          for (int i = 0; i < SZ; i++) { \
            x2_arg0[i] = to_bf16x2(A[index][i], A[index + 1][i]); \
            x2_arg1[i] = to_bf16x2(B[index][i], B[index + 1][i]); \
          } \
          vec<uint32_t, SZ> x2_res = NAME(x2_arg0, x2_arg1); \
          for (int i = 0; i < SZ; i++) { \
            float2 res2 = from_bf16x2(x2_res[i]); \
            if (check(res2.x(), NAME(A[index][i], B[index][i])) || \
                check(res2.y(), NAME(A[index + 1][i], B[index + 1][i]))) { \
              ERR[0] = 1; \
            } \
          } \
        } \
      }); \
    }); \
  } \
  assert(err == 0);

#define TEST_BUILTIN_2_VEC3_IMPL(NAME) \
  { \
    buffer<float3> a_buf((float3 *)&a[0], N / 4); \
    buffer<float3> b_buf((float3 *)&b[0], N / 4); \
    buffer<int> err_buf(&err, 1); \
    q.submit([&](handler &cgh) { \
      auto A = a_buf.get_access(cgh); \
      auto B = b_buf.get_access(cgh); \
      auto ERR = err_buf.get_access(cgh); \
      cgh.parallel_for(N / 4, [=](id<1> index) { \
        vec<uint16_t, 3> arg0, arg1; \
        for (int i = 0; i < 3; i++) { \
          arg0[i] = to_bf16(A[index][i]); \
          arg1[i] = to_bf16(B[index][i]); \
        } \
        vec<uint16_t, 3> res = NAME(arg0, arg1); \
        for (int i = 0; i < 3; i++) { \
          if (check(from_bf16(res[i]), NAME(A[index][i], B[index][i]))) { \
            ERR[0] = 1; \
          } \
        } \
        if (index % 2 == 0) { \
          vec<uint32_t, 3> x2_arg0, x2_arg1; \
          for (int i = 0; i < 3; i++) { \
            x2_arg0[i] = to_bf16x2(A[index][i], A[index + 1][i]); \
            x2_arg1[i] = to_bf16x2(B[index][i], B[index + 1][i]); \
          } \
          vec<uint32_t, 3> x2_res = NAME(x2_arg0, x2_arg1); \
          for (int i = 0; i < 3; i++) { \
            float2 res2 = from_bf16x2(x2_res[i]); \
            if (check(res2.x(), NAME(A[index][i], B[index][i])) || \
                check(res2.y(), NAME(A[index + 1][i], B[index + 1][i]))) { \
              ERR[0] = 1; \
            } \
          } \
        } \
      }); \
    }); \
  } \
  assert(err == 0);

#define TEST_BUILTIN_2_SCAL_IMPL(NAME) \
  { \
    buffer<float> a_buf(&a[0], N); \
    buffer<float> b_buf(&b[0], N); \
    buffer<int> err_buf(&err, 1); \
    q.submit([&](handler &cgh) { \
      auto A = a_buf.get_access(cgh); \
      auto B = b_buf.get_access(cgh); \
      auto ERR = err_buf.get_access(cgh); \
      cgh.parallel_for(N, [=](id<1> index) { \
        if (check(from_bf16(NAME(to_bf16(A[index]), to_bf16(B[index]))), \
                  NAME(A[index], B[index]))) { \
          ERR[0] = 1; \
        } \
        if (index % 2 == 0) { \
          float2 res = from_bf16x2(NAME(to_bf16x2(A[index], A[index + 1]), \
                                        to_bf16x2(B[index], B[index + 1]))); \
          if (check(res.x(), NAME(A[index], B[index])) || \
              check(res.y(), NAME(A[index + 1], B[index + 1]))) { \
            ERR[0] = 1; \
          } \
        } \
      }); \
    }); \
  } \
  assert(err == 0);

#define TEST_BUILTIN_2(NAME) \
  TEST_BUILTIN_2_SCAL_IMPL(NAME) \
  TEST_BUILTIN_2_VEC_IMPL(NAME, 2) \
  TEST_BUILTIN_2_VEC3_IMPL(NAME) \
  TEST_BUILTIN_2_VEC_IMPL(NAME, 4) \
  TEST_BUILTIN_2_VEC_IMPL(NAME, 8) \
  TEST_BUILTIN_2_VEC_IMPL(NAME, 16)

#define TEST_BUILTIN_3_VEC_IMPL(NAME, SZ) \
  { \
    buffer<float##SZ> a_buf((float##SZ *)&a[0], N / SZ); \
    buffer<float##SZ> b_buf((float##SZ *)&b[0], N / SZ); \
    buffer<float##SZ> c_buf((float##SZ *)&c[0], N / SZ); \
    buffer<int> err_buf(&err, 1); \
    q.submit([&](handler &cgh) { \
      auto A = a_buf.get_access(cgh); \
      auto B = b_buf.get_access(cgh); \
      auto C = c_buf.get_access(cgh); \
      auto ERR = err_buf.get_access(cgh); \
      cgh.parallel_for(N / SZ, [=](id<1> index) { \
        vec<uint16_t, SZ> arg0, arg1, arg2; \
        for (int i = 0; i < SZ; i++) { \
          arg0[i] = to_bf16(A[index][i]); \
          arg1[i] = to_bf16(B[index][i]); \
          arg2[i] = to_bf16(C[index][i]); \
        } \
        vec<uint16_t, SZ> res = NAME(arg0, arg1, arg2); \
        for (int i = 0; i < SZ; i++) { \
          if (check(from_bf16(res[i]), \
                    NAME(A[index][i], B[index][i], C[index][i]))) { \
            ERR[0] = 1; \
          } \
        } \
        if (index % 2 == 0) { \
          vec<uint32_t, SZ> x2_arg0, x2_arg1, x2_arg2; \
          for (int i = 0; i < SZ; i++) { \
            x2_arg0[i] = to_bf16x2(A[index][i], A[index + 1][i]); \
            x2_arg1[i] = to_bf16x2(B[index][i], B[index + 1][i]); \
            x2_arg2[i] = to_bf16x2(C[index][i], C[index + 1][i]); \
          } \
          vec<uint32_t, SZ> x2_res = NAME(x2_arg0, x2_arg1, x2_arg2); \
          for (int i = 0; i < SZ; i++) { \
            float2 res2 = from_bf16x2(x2_res[i]); \
            if (check(res2.x(), \
                      NAME(A[index][i], B[index][i], C[index][i])) || \
                check(res2.y(), NAME(A[index + 1][i], B[index + 1][i], \
                                     C[index + 1][i]))) { \
              ERR[0] = 1; \
            } \
          } \
        } \
      }); \
    }); \
  } \
  assert(err == 0);

#define TEST_BUILTIN_3_VEC3_IMPL(NAME) \
  { \
    buffer<float3> a_buf((float3 *)&a[0], N / 4); \
    buffer<float3> b_buf((float3 *)&b[0], N / 4); \
    buffer<float3> c_buf((float3 *)&c[0], N / 4); \
    buffer<int> err_buf(&err, 1); \
    q.submit([&](handler &cgh) { \
      auto A = a_buf.get_access(cgh); \
      auto B = b_buf.get_access(cgh); \
      auto C = c_buf.get_access(cgh); \
      auto ERR = err_buf.get_access(cgh); \
      cgh.parallel_for(N / 4, [=](id<1> index) { \
        vec<uint16_t, 3> arg0, arg1, arg2; \
        for (int i = 0; i < 3; i++) { \
          arg0[i] = to_bf16(A[index][i]); \
          arg1[i] = to_bf16(B[index][i]); \
          arg2[i] = to_bf16(C[index][i]); \
        } \
        vec<uint16_t, 3> res = NAME(arg0, arg1, arg2); \
        for (int i = 0; i < 3; i++) { \
          if (check(from_bf16(res[i]), \
                    NAME(A[index][i], B[index][i], C[index][i]))) { \
            ERR[0] = 1; \
          } \
        } \
        if (index % 2 == 0) { \
          vec<uint32_t, 3> x2_arg0, x2_arg1, x2_arg2; \
          for (int i = 0; i < 3; i++) { \
            x2_arg0[i] = to_bf16x2(A[index][i], A[index + 1][i]); \
            x2_arg1[i] = to_bf16x2(B[index][i], B[index + 1][i]); \
            x2_arg2[i] = to_bf16x2(C[index][i], C[index + 1][i]); \
          } \
          vec<uint32_t, 3> x2_res = NAME(x2_arg0, x2_arg1, x2_arg2); \
          for (int i = 0; i < 3; i++) { \
            float2 res2 = from_bf16x2(x2_res[i]); \
            if (check(res2.x(), \
                      NAME(A[index][i], B[index][i], C[index][i])) || \
                check(res2.y(), NAME(A[index + 1][i], B[index + 1][i], \
                                     C[index + 1][i]))) { \
              ERR[0] = 1; \
            } \
          } \
        } \
      }); \
    }); \
  } \
  assert(err == 0);

#define TEST_BUILTIN_3_SCAL_IMPL(NAME) \
  { \
    buffer<float> a_buf(&a[0], N); \
    buffer<float> b_buf(&b[0], N); \
    buffer<float> c_buf(&c[0], N); \
    buffer<int> err_buf(&err, 1); \
    q.submit([&](handler &cgh) { \
      auto A = a_buf.get_access(cgh); \
      auto B = b_buf.get_access(cgh); \
      auto C = c_buf.get_access(cgh); \
      auto ERR = err_buf.get_access(cgh); \
      cgh.parallel_for(N, [=](id<1> index) { \
        if (check(from_bf16(NAME(to_bf16(A[index]), to_bf16(B[index]), \
                                 to_bf16(C[index]))), \
                  NAME(A[index], B[index], C[index]))) { \
          ERR[0] = 1; \
        } \
        if (index % 2 == 0) { \
          float2 res = from_bf16x2(NAME(to_bf16x2(A[index], A[index + 1]), \
                                        to_bf16x2(B[index], B[index + 1]), \
                                        to_bf16x2(C[index], C[index + 1]))); \
          if (check(res.x(), NAME(A[index], B[index], C[index])) || \
              check(res.y(), \
                    NAME(A[index + 1], B[index + 1], C[index + 1]))) { \
            ERR[0] = 1; \
          } \
        } \
      }); \
    }); \
  } \
  assert(err == 0);

#define TEST_BUILTIN_3(NAME) \
  TEST_BUILTIN_3_SCAL_IMPL(NAME) \
  TEST_BUILTIN_3_VEC_IMPL(NAME, 2) \
  TEST_BUILTIN_3_VEC3_IMPL(NAME) \
  TEST_BUILTIN_3_VEC_IMPL(NAME, 4) \
  TEST_BUILTIN_3_VEC_IMPL(NAME, 8) \
  TEST_BUILTIN_3_VEC_IMPL(NAME, 16)

int main() {
  queue q;
  std::vector<float> a(N), b(N), c(N);
  int err = 0;
  for (int i = 0; i < N; i++) {
    a[i] = (i - N / 2) / (float)N;
    b[i] = (N / 2 - i) / (float)N;
    c[i] = (float)(3 * i);
  }

  TEST_BUILTIN_1(fabs);
  TEST_BUILTIN_2(fmin);
  TEST_BUILTIN_2(fmax);
  TEST_BUILTIN_3(fma);

  return 0;
}

From 14e6b59556cca1962fa6bbee6069d537cc943fe9 Mon Sep 17 00:00:00 2001
From: Tadej Ciglarič
Date: Mon, 14 Mar 2022 15:49:39 +0100
Subject: [PATCH 2/3] Apply suggestions from code review

Co-authored-by: Alexey Bader
---
 SYCL/BFloat16/bf16_storage_builtins.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/SYCL/BFloat16/bf16_storage_builtins.cpp b/SYCL/BFloat16/bf16_storage_builtins.cpp
index 55ebeafc64..7b249fe83f 100644
--- a/SYCL/BFloat16/bf16_storage_builtins.cpp
+++ b/SYCL/BFloat16/bf16_storage_builtins.cpp
@@ -1,8 +1,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
 // Only cuda backend implements bf16
 // REQUIRES: cuda
@@ -10,7 +8,8 @@
 #include
 #include
-#include
+#include
+
 using namespace cl::sycl;
 using namespace cl::sycl::ext::oneapi;

From 1c0d632adf2067e7243c854f6f92a36965e3dcfa Mon Sep 17 00:00:00 2001
From: Tadej Ciglarič
Date: Mon, 14 Mar 2022 15:01:50 +0000
Subject: [PATCH 3/3] format

---
 SYCL/BFloat16/bf16_storage_builtins.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/SYCL/BFloat16/bf16_storage_builtins.cpp b/SYCL/BFloat16/bf16_storage_builtins.cpp
index 7b249fe83f..89919b9d7b 100644
--- a/SYCL/BFloat16/bf16_storage_builtins.cpp
+++ b/SYCL/BFloat16/bf16_storage_builtins.cpp
@@ -1,7 +1,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-
 // Only cuda backend implements bf16
 // REQUIRES: cuda
 #include
 #include
 #include
-
 using namespace cl::sycl;
 using namespace cl::sycl::ext::oneapi;