diff --git a/SYCL/DeviceLib/imf_bf16_comp_test.cpp b/SYCL/DeviceLib/imf_bf16_comp_test.cpp new file mode 100644 index 0000000000..af8e610ab9 --- /dev/null +++ b/SYCL/DeviceLib/imf_bf16_comp_test.cpp @@ -0,0 +1,235 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fno-builtin -fsycl-device-lib-jit-link %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cuda || hip + +#include "imf_utils.hpp" +#include +using sycl_bfloat16 = sycl::ext::oneapi::experimental::bfloat16; +namespace sycl_imf = sycl::ext::intel::math; +int main() { + sycl::queue device_queue(sycl::default_selector_v); + std::cout << "Running on " + << device_queue.get_device().get_info() + << "\n"; + + // hisnan test + { + std::initializer_list input_vals = {0x0, 0x1, 0x7A, + 0x7F80, 0x7FC0, 0x7FC5}; + std::initializer_list ref_vals = {false, false, false, + false, true, true}; + test(device_queue, input_vals, ref_vals, + FT1(sycl_bfloat16, sycl_imf::hisnan)); + } + + // hisinf + { + std::initializer_list input_vals = { + 0x0, 0x1, 0x7A, 0x7F80, 0x7FC0, 0x7FC5, 0xFF80, 0xAE44, 0xFF84}; + std::initializer_list ref_vals = {false, false, false, true, false, + false, true, false, false}; + test(device_queue, input_vals, ref_vals, + FT1(sycl_bfloat16, sycl_imf::hisinf)); + } + + // heq + { + std::initializer_list input_vals1 = { + 0x0, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0xAE44, 0xFF74}; + std::initializer_list input_vals2 = { + 0x1, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC5, 0xFF80, 0xAE44, 0xFF84}; + std::initializer_list ref_vals = {false, true, false, false, false, + false, true, true, false}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::heq)); + } + + // hequ + { + std::initializer_list input_vals1 = { + 0x0, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0xAE44, 0xFF74, 0x7FC2}; + std::initializer_list input_vals2 = { + 0x1, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC9, 0xFF80, 0xAE44, 0xFF84, 0xAAEC}; + std::initializer_list ref_vals = {false, true, false, true, true, + true, true, true, false, true}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::hequ)); + } + + // hne + { + std::initializer_list input_vals1 = { + 0x0, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0xAE44, 0xFF74}; + std::initializer_list input_vals2 = { + 0x1, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC5, 0xFF80, 0xAE44, 0xFF84}; + std::initializer_list ref_vals = {true, false, true, false, false, + false, false, false, false}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::hne)); + } + + // hneu + { + std::initializer_list input_vals1 = { + 0x0, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0xAE44, 0xFF74, 0x7FC2}; + std::initializer_list input_vals2 = { + 0x1, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC9, 0xFF80, 0xAE44, 0xFF84, 0xAAEC}; + std::initializer_list ref_vals = {true, false, true, true, true, + true, false, false, true, true}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::hneu)); + } + + // hge + { + std::initializer_list input_vals1 = { + 0x1, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0x5000, 0xAF74, 0x71C2}; + std::initializer_list input_vals2 = { + 0x0, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC9, 0xFF80, 0x6000, 0xAF84, 0xAAEC}; + std::initializer_list ref_vals = {true, true, false, false, false, + false, true, false, true, true}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::hge)); + } + + // hgeu + { + std::initializer_list input_vals1 = { + 0x1, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0x5000, 0xAF74, 0x71C2}; + std::initializer_list input_vals2 = { + 0x0, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC9, 0xFF80, 0x6000, 0xAF84, 0xAAEC}; + std::initializer_list ref_vals = {true, true, false, true, true, + true, true, false, true, true}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::hgeu)); + } + + // hgt + { + std::initializer_list input_vals1 = { + 0x1, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0x5000, 0xAF74, 0x71C2}; + std::initializer_list input_vals2 = { + 0x0, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC9, 0xFF80, 0x6000, 0xAF84, 0xAAEC}; + std::initializer_list ref_vals = {true, false, false, false, false, + false, false, false, true, true}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::hgt)); + } + + // hgtu + { + std::initializer_list input_vals1 = { + 0x1, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0x5000, 0xAF74, 0x71C2}; + std::initializer_list input_vals2 = { + 0x0, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC9, 0xFF80, 0x6000, 0xAF84, 0xAAEC}; + std::initializer_list ref_vals = {true, false, false, true, true, + true, false, false, true, true}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::hgtu)); + } + + // hle + { + std::initializer_list input_vals1 = { + 0x1, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0x5000, 0xAF74, 0x71C2}; + std::initializer_list input_vals2 = { + 0x0, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC9, 0xFF80, 0x6000, 0xAF84, 0xAAEC}; + std::initializer_list ref_vals = {false, true, true, false, false, + false, true, true, false, false}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::hle)); + } + + // hleu + { + std::initializer_list input_vals1 = { + 0x1, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0x5000, 0xAF74, 0x71C2}; + std::initializer_list input_vals2 = { + 0x0, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC9, 0xFF80, 0x6000, 0xAF84, 0xAAEC}; + std::initializer_list ref_vals = {false, true, true, true, true, + true, true, true, false, false}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::hleu)); + } + + // hlt + { + std::initializer_list input_vals1 = { + 0x1, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0x5000, 0xAF74, 0x71C2}; + std::initializer_list input_vals2 = { + 0x0, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC9, 0xFF80, 0x6000, 0xAF84, 0xAAEC}; + std::initializer_list ref_vals = {false, false, true, false, false, + false, false, true, false, false}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::hlt)); + } + + // hltu + { + std::initializer_list input_vals1 = { + 0x1, 0x1, 0x7A, 0x7F81, 0x7FC0, 0x7FC5, 0xFF80, 0x5000, 0xAF74, 0x71C2}; + std::initializer_list input_vals2 = { + 0x0, 0x1, 0x7C, 0x7F80, 0x7FC0, 0x7FC9, 0xFF80, 0x6000, 0xAF84, 0xAAEC}; + std::initializer_list ref_vals = {false, false, true, true, true, + true, false, true, false, false}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT2(sycl_bfloat16, sycl_imf::hltu)); + } + + // hmax + { + std::initializer_list input_vals1 = { + 0x1, 0xAF84, 0x8000, 0x7FC0, 0x1123, 0x7FCC, 0x7F80, 0x2E05}; + std::initializer_list input_vals2 = { + 0x2, 0x4044, 0x0, 0xAAAA, 0x7FC8, 0x7FC8, 0x7EEE, 0x2E55}; + std::initializer_list ref_vals = {0x2, 0x4044, 0x0, 0xAAAA, + 0x1123, 0x7FC0, 0x7F80, 0x2E55}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT22(sycl_bfloat16, uint16_t, sycl_imf::hmax)); + } + + // hmax_nan + { + std::initializer_list input_vals1 = { + 0x1, 0xAF84, 0x8000, 0x7FC0, 0x1123, 0x7FCC, 0x7F80, 0x2E05}; + std::initializer_list input_vals2 = { + 0x2, 0x4044, 0x0, 0xAAAA, 0x7FC8, 0x7FC8, 0x7EEE, 0x2E55}; + std::initializer_list ref_vals = {0x2, 0x4044, 0x0, 0x7FC0, + 0x7FC0, 0x7FC0, 0x7F80, 0x2E55}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT22(sycl_bfloat16, uint16_t, sycl_imf::hmax_nan)); + } + + // hmin + { + std::initializer_list input_vals1 = { + 0x1, 0xAF84, 0x8000, 0x7FC0, 0x1123, 0x7FCC, 0x7F80, 0x2E05}; + std::initializer_list input_vals2 = { + 0x2, 0x4044, 0x0, 0xAAAA, 0x7FC8, 0x7FC8, 0x7EEE, 0x2E55}; + std::initializer_list ref_vals = {0x1, 0xAF84, 0x8000, 0xAAAA, + 0x1123, 0x7FC0, 0x7EEE, 0x2E05}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT22(sycl_bfloat16, uint16_t, sycl_imf::hmin)); + } + + // hmin_nan + { + std::initializer_list input_vals1 = { + 0x1, 0xAF84, 0x8000, 0x7FC0, 0x1123, 0x7FCC, 0x7F80, 0x2E05}; + std::initializer_list input_vals2 = { + 0x2, 0x4044, 0x0, 0xAAAA, 0x7FC8, 0x7FC8, 0x7EEE, 0x2E55}; + std::initializer_list ref_vals = {0x1, 0xAF84, 0x8000, 0x7FC0, + 0x7FC0, 0x7FC0, 0x7EEE, 0x2E05}; + test2(device_queue, input_vals1, input_vals2, ref_vals, + FT22(sycl_bfloat16, uint16_t, sycl_imf::hmin_nan)); + } + return 0; +} diff --git a/SYCL/DeviceLib/imf_bf16_to_fp32.cpp b/SYCL/DeviceLib/imf_bf16_to_fp32.cpp new file mode 100644 index 0000000000..6f0356216d --- /dev/null +++ b/SYCL/DeviceLib/imf_bf16_to_fp32.cpp @@ -0,0 +1,53 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fno-builtin -fsycl-device-lib-jit-link %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cuda || hip + +// All __imf_* bf16 functions are implemented via fp32 emulation, so we don't +// need to check whether underlying device supports bf16 or not. +#include "imf_utils.hpp" +#include +using sycl_bfloat16 = sycl::ext::oneapi::experimental::bfloat16; +namespace sycl_imf = sycl::ext::intel::math; +int main() { + sycl::queue device_queue(sycl::default_selector_v); + std::cout << "Running on " + << device_queue.get_device().get_info() + << "\n"; + + { + std::initializer_list input_vals = { + 0x0, // +0 + 0x8000, // -0 + 0x1, // min positive subnormal + 0x7F, // max positive subnormal + 0x5A, // positive subnormal + 0x8001, // max negative subnormal + 0x807F, // min negative subnormal + 0x805A, // negative subnormal + 0x7F80, // +inf + 0xFF80, // -inf + 0x2E05, // positive normal + 0x7E5A, // positive normal + 0xAE44, // negative normal + 0xFF84, // negative normal + 0x7F7F, // max positive normal + 0xFF7F, // min negative normal + }; + + std::initializer_list ref_vals = { + 0x0, 0x80000000, 0x10000, 0x7F0000, 0x5A0000, 0x80010000, + 0x807F0000, 0x805A0000, 0x7F800000, 0xFF800000, 0x2E050000, 0x7E5A0000, + 0xAE440000, 0xFF840000, 0x7F7F0000, 0xFF7F0000}; + + test(device_queue, input_vals, ref_vals, + FT12(sycl_bfloat16, uint32_t, sycl_imf::bfloat162float)); + } +} diff --git a/SYCL/DeviceLib/imf_fp32_to_bf16.cpp b/SYCL/DeviceLib/imf_fp32_to_bf16.cpp new file mode 100644 index 0000000000..eceb1b3b0b --- /dev/null +++ b/SYCL/DeviceLib/imf_fp32_to_bf16.cpp @@ -0,0 +1,74 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fno-builtin -fsycl-device-lib-jit-link %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cuda || hip + +// All __imf_* bf16 functions are implemented via fp32 emulation, so we don't +// need to check whether underlying device supports bf16 or not. +#include "imf_utils.hpp" +#include +using sycl_bfloat16 = sycl::ext::oneapi::experimental::bfloat16; +namespace sycl_imf = sycl::ext::intel::math; +int main() { + sycl::queue device_queue(sycl::default_selector_v); + std::cout << "Running on " + << device_queue.get_device().get_info() + << "\n"; + + { + std::initializer_list input_vals = { + 0x0, // +0 + 0x80000000, // -0 + 0x1, // min positive subnormal + 0x7FFFFF, // max positive subnormal + 0x5A6BFC, // positive subnormal + 0x80000001, // max negative subnormal + 0x807FFFFF, // min negative subnormal + 0x805A6FED, // negative subnormal + 0x7F800000, // +inf + 0xFF800000, // -inf + 0x2E05CBA9, // positive normal + 0x7E5A8935, // positive normal + 0xAE4411FC, // negative normal + 0xFA84C773, // negative normal + 0x7F7FFFFF, // max positive normal + 0x765FCEED, // positive normal + 0xFF7FFFFF, // min negative normal + 0xAC763561, // negative normal + }; + + std::initializer_list ref_vals = { + 0x0, 0x8000, 0x0, 0x80, 0x5a, 0x8000, 0x8080, 0x805A, 0x7F80, + 0xFF80, 0x2E06, 0x7E5B, 0xAE44, 0xFA85, 0x7F80, 0x7660, 0xFF80, 0xAC76}; + + std::initializer_list ref_vals_rd = { + 0x0, 0x8000, 0x0, 0x7F, 0x5A, 0x8001, 0x8080, 0x805B, 0x7F80, + 0xFF80, 0x2E05, 0x7E5A, 0xAE45, 0xFA85, 0x7F7F, 0x765F, 0xFF80, 0xAC77}; + + std::initializer_list ref_vals_ru = { + 0x0, 0x8000, 0x1, 0x80, 0x5B, 0x8000, 0x807F, 0x805A, 0x7F80, + 0xFF80, 0x2E06, 0x7E5B, 0xAE44, 0xFA84, 0x7F80, 0x7660, 0xFF7F, 0xAC76}; + + std::initializer_list ref_vals_rz = { + 0x0, 0x8000, 0x0, 0x7F, 0x5A, 0x8000, 0x807F, 0x805A, 0x7F80, + 0xFF80, 0x2E05, 0x7E5A, 0xAE44, 0xFA84, 0x7F7F, 0x765F, 0xFF7F, 0xAC76}; + + test(device_queue, input_vals, ref_vals, + FT12(float, uint16_t, sycl_imf::float2bfloat16)); + test(device_queue, input_vals, ref_vals_rd, + FT12(float, uint16_t, sycl_imf::float2bfloat16_rd)); + test(device_queue, input_vals, ref_vals, + FT12(float, uint16_t, sycl_imf::float2bfloat16_rn)); + test(device_queue, input_vals, ref_vals_ru, + FT12(float, uint16_t, sycl_imf::float2bfloat16_ru)); + test(device_queue, input_vals, ref_vals_rz, + FT12(float, uint16_t, sycl_imf::float2bfloat16_rz)); + } +} diff --git a/SYCL/DeviceLib/imf_utils.hpp b/SYCL/DeviceLib/imf_utils.hpp index 36c88ea11f..8ecf78d360 100644 --- a/SYCL/DeviceLib/imf_utils.hpp +++ b/SYCL/DeviceLib/imf_utils.hpp @@ -163,7 +163,23 @@ void test3(sycl::queue &q, std::initializer_list Input1, #define F(Name) [](auto x) { return (Name)(x); } #define FT(T, Name) [](auto x) { return __builtin_bit_cast(T, (Name)(x)); } +// Used for float2bf16 tests, all uint32_t input converted to float, then pass +// to __imf_float2bfloat16* utils. +#define FT1(T, Name) [](auto x) { return (Name)(__builtin_bit_cast(T, x)); } +#define FT12(T1, T2, Name) \ + [](auto x) { \ + return __builtin_bit_cast(T2, (Name)(__builtin_bit_cast(T1, x))); \ + } #define F2(Name) [](auto x, auto y) { return (Name)(x, y); } +#define FT2(T, Name) \ + [](auto x, auto y) { \ + return (Name)(__builtin_bit_cast(T, x), __builtin_bit_cast(T, y)); \ + } +#define FT22(T1, T2, Name) \ + [](auto x, auto y) { \ + return __builtin_bit_cast( \ + T2, (Name)(__builtin_bit_cast(T1, x), __builtin_bit_cast(T1, y))); \ + } #define F3(Name) [](auto x, auto y, auto z) { return (Name)(x, y, z); } #if defined(__SPIR__) #define F_Half1(Name) \