diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index b9f6b5a297943..08de07064d624 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5100,9 +5100,9 @@ def fno_sycl_dead_args_optimization : Flag<["-"], "fno-sycl-dead-args-optimizati Group, Flags<[NoArgumentUnused, CoreOption]>, HelpText<"Disables " "elimination of DPC++ dead kernel arguments">; def fsycl_device_lib_EQ : CommaJoined<["-"], "fsycl-device-lib=">, Group, Flags<[NoXarchOption, CoreOption]>, - Values<"libc, libm-fp32, libm-fp64, libimf-fp32, libimf-fp64, all">, HelpText<"Control inclusion of " + Values<"libc, libm-fp32, libm-fp64, libimf-fp32, libimf-fp64, libimf-bf16, all">, HelpText<"Control inclusion of " "device libraries into device binary linkage. Valid arguments " - "are libc, libm-fp32, libm-fp64, libimf-fp32, libimf-fp64, all">; + "are libc, libm-fp32, libm-fp64, libimf-fp32, libimf-fp64, libimf-bf16, all">; def fno_sycl_device_lib_EQ : CommaJoined<["-"], "fno-sycl-device-lib=">, Group, Flags<[NoXarchOption, CoreOption]>, Values<"libc, libm-fp32, libm-fp64, all">, HelpText<"Control exclusion of " "device libraries from device binary linkage. Valid arguments " diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 206aa34b14175..d864a6cf0a803 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -5143,7 +5143,8 @@ class OffloadingActionBuilder final { // of "internal" libraries cannot be affected via -fno-sycl-device-lib. 
llvm::StringMap devicelib_link_info = { {"libc", true}, {"libm-fp32", true}, {"libm-fp64", true}, - {"libimf-fp32", true}, {"libimf-fp64", true}, {"internal", true}}; + {"libimf-fp32", true}, {"libimf-fp64", true}, {"libimf-bf16", true}, + {"internal", true}}; if (Arg *A = Args.getLastArg(options::OPT_fsycl_device_lib_EQ, options::OPT_fno_sycl_device_lib_EQ)) { if (A->getValues().size() == 0) @@ -5189,7 +5190,8 @@ class OffloadingActionBuilder final { {"libsycl-msvc-math", "libm-fp32"}, #endif {"libsycl-imf", "libimf-fp32"}, - {"libsycl-imf-fp64", "libimf-fp64"} + {"libsycl-imf-fp64", "libimf-fp64"}, + {"libsycl-imf-bf16", "libimf-bf16"}, }; // For AOT compilation, we need to link sycl_device_fallback_libs as // default too. @@ -5201,7 +5203,8 @@ class OffloadingActionBuilder final { {"libsycl-fallback-cmath", "libm-fp32"}, {"libsycl-fallback-cmath-fp64", "libm-fp64"}, {"libsycl-fallback-imf", "libimf-fp32"}, - {"libsycl-fallback-imf-fp64", "libimf-fp64"}}; + {"libsycl-fallback-imf-fp64", "libimf-fp64"}, + {"libsycl-fallback-imf-bf16", "libimf-bf16"}}; // ITT annotation libraries are linked in separately whenever the device // code instrumentation is enabled. 
const SYCLDeviceLibsList sycl_device_annotation_libs = { diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index ae40b71f4c99b..54bafca712b47 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -138,7 +138,8 @@ static llvm::SmallVector SYCLDeviceLibList { "imf", "imf-fp64", "itt-compiler-wrappers", "itt-stubs", "itt-user-wrappers", "fallback-cassert", "fallback-cstring", "fallback-cmath", "fallback-cmath-fp64", "fallback-complex", - "fallback-complex-fp64", "fallback-imf", "fallback-imf-fp64" + "fallback-complex-fp64", "fallback-imf", "fallback-imf-fp64", + "fallback-imf-bf16" }; const char *SYCL::Linker::constructLLVMLinkCommand( diff --git a/clang/test/Driver/Inputs/SYCL-windows/lib/libsycl-fallback-imf-bf16.obj b/clang/test/Driver/Inputs/SYCL-windows/lib/libsycl-fallback-imf-bf16.obj new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/SYCL-windows/lib/libsycl-imf-bf16.obj b/clang/test/Driver/Inputs/SYCL-windows/lib/libsycl-imf-bf16.obj new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/SYCL/lib/libsycl-fallback-imf-bf16.o b/clang/test/Driver/Inputs/SYCL/lib/libsycl-fallback-imf-bf16.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/SYCL/lib/libsycl-imf-bf16.o b/clang/test/Driver/Inputs/SYCL/lib/libsycl-imf-bf16.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/sycl-cuda-tu-offload.cu b/clang/test/Driver/sycl-cuda-tu-offload.cu index 8340d71144cdb..4e3dd7859b4e3 100644 --- a/clang/test/Driver/sycl-cuda-tu-offload.cu +++ b/clang/test/Driver/sycl-cuda-tu-offload.cu @@ -93,15 +93,21 @@ // DEFAULT-PHASES2:| | +- 69: input, "{{.*}}", object // DEFAULT-PHASES2:| | +- 70: clang-offload-unbundler, {69}, object // DEFAULT-PHASES2:| |- 71: offload, " (nvptx64-nvidia-cuda)" {70}, object -// DEFAULT-PHASES2:| |- 72: input, 
"{{.*}}nvidiacl{{.*}}", ir, (device-sycl, sm_80) -// DEFAULT-PHASES2:| |- 73: input, "{{.*}}libdevice{{.*}}", ir, (device-sycl, sm_80) -// DEFAULT-PHASES2:| +- 74: linker, {17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, 53, 56, 59, 62, 65, 68, 71, 72, 73}, ir, (device-sycl, sm_80) -// DEFAULT-PHASES2:| +- 75: sycl-post-link, {74}, ir, (device-sycl, sm_80) -// DEFAULT-PHASES2:| | +- 76: file-table-tform, {75}, ir, (device-sycl, sm_80) -// DEFAULT-PHASES2:| | | +- 77: backend, {76}, assembler, (device-sycl, sm_80) -// DEFAULT-PHASES2:| | | |- 78: assembler, {77}, object, (device-sycl, sm_80) -// DEFAULT-PHASES2:| | |- 79: linker, {77, 78}, cuda-fatbin, (device-sycl, sm_80) -// DEFAULT-PHASES2:| |- 80: foreach, {76, 79}, cuda-fatbin, (device-sycl, sm_80) -// DEFAULT-PHASES2:| +- 81: file-table-tform, {75, 80}, tempfiletable, (device-sycl, sm_80) -// DEFAULT-PHASES2:|- 82: clang-offload-wrapper, {81}, object, (device-sycl, sm_80) -// DEFAULT-PHASES2:83: offload, "host-cuda-sycl (x86_64-unknown-linux-gnu)" {15}, "device-sycl (nvptx64-nvidia-cuda:sm_80)" {82}, image +// DEFAULT-PHASES2:| | +- 72: input, "{{.*}}", object +// DEFAULT-PHASES2:| | +- 73: clang-offload-unbundler, {72}, object +// DEFAULT-PHASES2:| |- 74: offload, " (nvptx64-nvidia-cuda)" {73}, object +// DEFAULT-PHASES2:| | +- 75: input, "{{.*}}", object +// DEFAULT-PHASES2:| | +- 76: clang-offload-unbundler, {75}, object +// DEFAULT-PHASES2:| |- 77: offload, " (nvptx64-nvidia-cuda)" {76}, object +// DEFAULT-PHASES2:| |- 78: input, "{{.*}}nvidiacl{{.*}}", ir, (device-sycl, sm_80) +// DEFAULT-PHASES2:| |- 79: input, "{{.*}}libdevice{{.*}}", ir, (device-sycl, sm_80) +// DEFAULT-PHASES2:| +- 80: linker, {17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, 53, 56, 59, 62, 65, 68, 71, 74, 77, 78, 79}, ir, (device-sycl, sm_80) +// DEFAULT-PHASES2:| +- 81: sycl-post-link, {80}, ir, (device-sycl, sm_80) +// DEFAULT-PHASES2:| | +- 82: file-table-tform, {81}, ir, (device-sycl, sm_80) +// DEFAULT-PHASES2:| | | +- 83: 
backend, {82}, assembler, (device-sycl, sm_80) +// DEFAULT-PHASES2:| | | |- 84: assembler, {83}, object, (device-sycl, sm_80) +// DEFAULT-PHASES2:| | |- 85: linker, {83, 84}, cuda-fatbin, (device-sycl, sm_80) +// DEFAULT-PHASES2:| |- 86: foreach, {82, 85}, cuda-fatbin, (device-sycl, sm_80) +// DEFAULT-PHASES2:| +- 87: file-table-tform, {81, 86}, tempfiletable, (device-sycl, sm_80) +// DEFAULT-PHASES2:|- 88: clang-offload-wrapper, {87}, object, (device-sycl, sm_80) +// DEFAULT-PHASES2:89: offload, "host-cuda-sycl (x86_64-unknown-linux-gnu)" {15}, "device-sycl (nvptx64-nvidia-cuda:sm_80)" {88}, image diff --git a/clang/test/Driver/sycl-device-lib.cpp b/clang/test/Driver/sycl-device-lib.cpp index fc27c5db0ff59..4a2ad0fc8bcbc 100644 --- a/clang/test/Driver/sycl-device-lib.cpp +++ b/clang/test/Driver/sycl-device-lib.cpp @@ -26,6 +26,7 @@ // SYCL_DEVICE_LIB_UNBUNDLE_DEFAULT-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_DEFAULT-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_DEFAULT-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle" +// SYCL_DEVICE_LIB_UNBUNDLE_DEFAULT-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-bf16.o" "-output={{.*}}libsycl-imf-bf16-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_DEFAULT-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_DEFAULT-NEXT: clang-offload-bundler{{.*}} "-type=o" 
"-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_DEFAULT-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle" @@ -34,6 +35,7 @@ // SYCL_DEVICE_LIB_UNBUNDLE_DEFAULT-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_DEFAULT-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_DEFAULT-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle" +// SYCL_DEVICE_LIB_UNBUNDLE_DEFAULT-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-bf16.o" "-output={{.*}}libsycl-fallback-imf-bf16-{{.*}}.o" "-unbundle" /// ########################################################################### /// test sycl fallback device libraries are not linked by default // RUN: %clangxx -fsycl -fsycl-device-lib-jit-link %s --sysroot=%S/Inputs/SYCL -### 2>&1 \ @@ -61,6 +63,7 @@ // SYCL_DEVICE_LIB_UNBUNDLE_WITH_FP64-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_WITH_FP64-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_WITH_FP64-NEXT: 
clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle" +// SYCL_DEVICE_LIB_UNBUNDLE_WITH_FP64-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-bf16.o" "-output={{.*}}libsycl-imf-bf16-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_WITH_FP64-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_WITH_FP64-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_WITH_FP64-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle" @@ -69,6 +72,7 @@ // SYCL_DEVICE_LIB_UNBUNDLE_WITH_FP64-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_WITH_FP64-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_WITH_FP64-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle" +// SYCL_DEVICE_LIB_UNBUNDLE_WITH_FP64-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-bf16.o" "-output={{.*}}libsycl-fallback-imf-bf16-{{.*}}.o" "-unbundle" /// 
########################################################################### /// test behavior of -fno-sycl-device-lib=libc @@ -80,12 +84,14 @@ // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBC-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBC-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBC-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle" +// SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBC-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-bf16.o" "-output={{.*}}libsycl-imf-bf16-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBC-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBC-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBC-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBC-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBC-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" 
"-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBC-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle" +// SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBC-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-bf16.o" "-output={{.*}}libsycl-fallback-imf-bf16-{{.*}}.o" "-unbundle" /// ########################################################################### /// test behavior of -fno-sycl-device-lib=libm-fp32,libm-fp64 @@ -94,10 +100,12 @@ // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBM: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBM: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBM: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle" +// SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBM: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-bf16.o" "-output={{.*}}libsycl-imf-bf16-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBM-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBM-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBM: 
clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle" // SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBM: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle" +// SYCL_DEVICE_LIB_UNBUNDLE_NO_LIBM: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-bf16.o" "-output={{.*}}libsycl-fallback-imf-bf16-{{.*}}.o" "-unbundle" /// ########################################################################### /// test behavior of disabling all device libraries @@ -145,6 +153,7 @@ // SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle" // SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle" // SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle" +// SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-bf16.o" "-output={{.*}}libsycl-imf-bf16-{{.*}}.o" "-unbundle" // SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle" // SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle" // 
SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle" @@ -153,6 +162,7 @@ // SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle" // SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle" // SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle" +// SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-bf16.o" "-output={{.*}}libsycl-fallback-imf-bf16-{{.*}}.o" "-unbundle" // SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-user-wrappers.o" "-output={{.*}}libsycl-itt-user-wrappers-{{.*}}.o" "-unbundle" // SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-compiler-wrappers.o" "-output={{.*}}libsycl-itt-compiler-wrappers-{{.*}}.o" "-unbundle" // SYCL_LLVM_LINK_DEVICE_LIB-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-stubs.o" "-output={{.*}}libsycl-itt-stubs-{{.*}}.o" "-unbundle" diff --git a/clang/test/Driver/sycl-offload-nvptx.cpp b/clang/test/Driver/sycl-offload-nvptx.cpp index 36ee0168225bf..84c717f7e340f 100644 --- a/clang/test/Driver/sycl-offload-nvptx.cpp +++ b/clang/test/Driver/sycl-offload-nvptx.cpp @@ -76,51 +76,57 @@ 
// CHK-PHASES-NO-CC: 30: input, "{{.*}}libsycl-imf-fp64.o", object // CHK-PHASES-NO-CC: 31: clang-offload-unbundler, {30}, object // CHK-PHASES-NO-CC: 32: offload, " (nvptx64-nvidia-cuda)" {31}, object -// CHK-PHASES-NO-CC: 33: input, "{{.*}}libsycl-fallback-cassert.o", object +// CHK-PHASES-NO-CC: 33: input, "{{.*}}libsycl-imf-bf16.o", object // CHK-PHASES-NO-CC: 34: clang-offload-unbundler, {33}, object // CHK-PHASES-NO-CC: 35: offload, " (nvptx64-nvidia-cuda)" {34}, object -// CHK-PHASES-NO-CC: 36: input, "{{.*}}libsycl-fallback-cstring.o", object +// CHK-PHASES-NO-CC: 36: input, "{{.*}}libsycl-fallback-cassert.o", object // CHK-PHASES-NO-CC: 37: clang-offload-unbundler, {36}, object // CHK-PHASES-NO-CC: 38: offload, " (nvptx64-nvidia-cuda)" {37}, object -// CHK-PHASES-NO-CC: 39: input, "{{.*}}libsycl-fallback-complex.o", object +// CHK-PHASES-NO-CC: 39: input, "{{.*}}libsycl-fallback-cstring.o", object // CHK-PHASES-NO-CC: 40: clang-offload-unbundler, {39}, object // CHK-PHASES-NO-CC: 41: offload, " (nvptx64-nvidia-cuda)" {40}, object -// CHK-PHASES-NO-CC: 42: input, "{{.*}}libsycl-fallback-complex-fp64.o", object +// CHK-PHASES-NO-CC: 42: input, "{{.*}}libsycl-fallback-complex.o", object // CHK-PHASES-NO-CC: 43: clang-offload-unbundler, {42}, object // CHK-PHASES-NO-CC: 44: offload, " (nvptx64-nvidia-cuda)" {43}, object -// CHK-PHASES-NO-CC: 45: input, "{{.*}}libsycl-fallback-cmath.o", object +// CHK-PHASES-NO-CC: 45: input, "{{.*}}libsycl-fallback-complex-fp64.o", object // CHK-PHASES-NO-CC: 46: clang-offload-unbundler, {45}, object // CHK-PHASES-NO-CC: 47: offload, " (nvptx64-nvidia-cuda)" {46}, object -// CHK-PHASES-NO-CC: 48: input, "{{.*}}libsycl-fallback-cmath-fp64.o", object +// CHK-PHASES-NO-CC: 48: input, "{{.*}}libsycl-fallback-cmath.o", object // CHK-PHASES-NO-CC: 49: clang-offload-unbundler, {48}, object // CHK-PHASES-NO-CC: 50: offload, " (nvptx64-nvidia-cuda)" {49}, object -// CHK-PHASES-NO-CC: 51: input, "{{.*}}libsycl-fallback-imf.o", object 
+// CHK-PHASES-NO-CC: 51: input, "{{.*}}libsycl-fallback-cmath-fp64.o", object // CHK-PHASES-NO-CC: 52: clang-offload-unbundler, {51}, object // CHK-PHASES-NO-CC: 53: offload, " (nvptx64-nvidia-cuda)" {52}, object -// CHK-PHASES-NO-CC: 54: input, "{{.*}}libsycl-fallback-imf-fp64.o", object +// CHK-PHASES-NO-CC: 54: input, "{{.*}}libsycl-fallback-imf.o", object // CHK-PHASES-NO-CC: 55: clang-offload-unbundler, {54}, object // CHK-PHASES-NO-CC: 56: offload, " (nvptx64-nvidia-cuda)" {55}, object -// CHK-PHASES-NO-CC: 57: input, "{{.*}}libsycl-itt-user-wrappers.o", object +// CHK-PHASES-NO-CC: 57: input, "{{.*}}libsycl-fallback-imf-fp64.o", object // CHK-PHASES-NO-CC: 58: clang-offload-unbundler, {57}, object // CHK-PHASES-NO-CC: 59: offload, " (nvptx64-nvidia-cuda)" {58}, object -// CHK-PHASES-NO-CC: 60: input, "{{.*}}libsycl-itt-compiler-wrappers.o", object +// CHK-PHASES-NO-CC: 60: input, "{{.*}}libsycl-fallback-imf-bf16.o", object // CHK-PHASES-NO-CC: 61: clang-offload-unbundler, {60}, object // CHK-PHASES-NO-CC: 62: offload, " (nvptx64-nvidia-cuda)" {61}, object -// CHK-PHASES-NO-CC: 63: input, "{{.*}}libsycl-itt-stubs.o", object +// CHK-PHASES-NO-CC: 63: input, "{{.*}}libsycl-itt-user-wrappers.o", object // CHK-PHASES-NO-CC: 64: clang-offload-unbundler, {63}, object // CHK-PHASES-NO-CC: 65: offload, " (nvptx64-nvidia-cuda)" {64}, object -// CHK-PHASES-NO-CC: 66: input, "{{.*}}nvidiacl{{.*}}", ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 67: input, "{{.*}}libdevice{{.*}}", ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 68: linker, {11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, 53, 56, 59, 62, 65, 66, 67}, ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 69: sycl-post-link, {68}, ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 70: file-table-tform, {69}, ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 71: backend, {70}, assembler, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 72: assembler, {71}, object, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 73: linker, 
{71, 72}, cuda-fatbin, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 74: foreach, {70, 73}, cuda-fatbin, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 75: file-table-tform, {69, 74}, tempfiletable, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 76: clang-offload-wrapper, {75}, object, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 77: offload, "host-sycl (x86_64-{{.*}})" {10}, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {76}, image +// CHK-PHASES-NO-CC: 66: input, "{{.*}}libsycl-itt-compiler-wrappers.o", object +// CHK-PHASES-NO-CC: 67: clang-offload-unbundler, {66}, object +// CHK-PHASES-NO-CC: 68: offload, " (nvptx64-nvidia-cuda)" {67}, object +// CHK-PHASES-NO-CC: 69: input, "{{.*}}libsycl-itt-stubs.o", object +// CHK-PHASES-NO-CC: 70: clang-offload-unbundler, {69}, object +// CHK-PHASES-NO-CC: 71: offload, " (nvptx64-nvidia-cuda)" {70}, object +// CHK-PHASES-NO-CC: 72: input, "{{.*}}nvidiacl{{.*}}", ir, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 73: input, "{{.*}}libdevice{{.*}}", ir, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 74: linker, {11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, 53, 56, 59, 62, 65, 68, 71, 72, 73}, ir, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 75: sycl-post-link, {74}, ir, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 76: file-table-tform, {75}, ir, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 77: backend, {76}, assembler, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 78: assembler, {77}, object, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 79: linker, {77, 78}, cuda-fatbin, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 80: foreach, {76, 79}, cuda-fatbin, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 81: file-table-tform, {75, 80}, tempfiletable, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 82: clang-offload-wrapper, {81}, object, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 83: offload, "host-sycl (x86_64-{{.*}})" {10}, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {82}, image // // /// Check phases specifying a compute capability. 
@@ -167,51 +173,57 @@ // CHK-PHASES: 30: input, "{{.*}}libsycl-imf-fp64.o", object // CHK-PHASES: 31: clang-offload-unbundler, {30}, object // CHK-PHASES: 32: offload, " (nvptx64-nvidia-cuda)" {31}, object -// CHK-PHASES: 33: input, "{{.*}}libsycl-fallback-cassert.o", object +// CHK-PHASES: 33: input, "{{.*}}libsycl-imf-bf16.o", object // CHK-PHASES: 34: clang-offload-unbundler, {33}, object // CHK-PHASES: 35: offload, " (nvptx64-nvidia-cuda)" {34}, object -// CHK-PHASES: 36: input, "{{.*}}libsycl-fallback-cstring.o", object +// CHK-PHASES: 36: input, "{{.*}}libsycl-fallback-cassert.o", object // CHK-PHASES: 37: clang-offload-unbundler, {36}, object // CHK-PHASES: 38: offload, " (nvptx64-nvidia-cuda)" {37}, object -// CHK-PHASES: 39: input, "{{.*}}libsycl-fallback-complex.o", object +// CHK-PHASES: 39: input, "{{.*}}libsycl-fallback-cstring.o", object // CHK-PHASES: 40: clang-offload-unbundler, {39}, object // CHK-PHASES: 41: offload, " (nvptx64-nvidia-cuda)" {40}, object -// CHK-PHASES: 42: input, "{{.*}}libsycl-fallback-complex-fp64.o", object +// CHK-PHASES: 42: input, "{{.*}}libsycl-fallback-complex.o", object // CHK-PHASES: 43: clang-offload-unbundler, {42}, object // CHK-PHASES: 44: offload, " (nvptx64-nvidia-cuda)" {43}, object -// CHK-PHASES: 45: input, "{{.*}}libsycl-fallback-cmath.o", object +// CHK-PHASES: 45: input, "{{.*}}libsycl-fallback-complex-fp64.o", object // CHK-PHASES: 46: clang-offload-unbundler, {45}, object // CHK-PHASES: 47: offload, " (nvptx64-nvidia-cuda)" {46}, object -// CHK-PHASES: 48: input, "{{.*}}libsycl-fallback-cmath-fp64.o", object +// CHK-PHASES: 48: input, "{{.*}}libsycl-fallback-cmath.o", object // CHK-PHASES: 49: clang-offload-unbundler, {48}, object // CHK-PHASES: 50: offload, " (nvptx64-nvidia-cuda)" {49}, object -// CHK-PHASES: 51: input, "{{.*}}libsycl-fallback-imf.o", object +// CHK-PHASES: 51: input, "{{.*}}libsycl-fallback-cmath-fp64.o", object // CHK-PHASES: 52: clang-offload-unbundler, {51}, object // CHK-PHASES: 53: 
offload, " (nvptx64-nvidia-cuda)" {52}, object -// CHK-PHASES: 54: input, "{{.*}}libsycl-fallback-imf-fp64.o", object +// CHK-PHASES: 54: input, "{{.*}}libsycl-fallback-imf.o", object // CHK-PHASES: 55: clang-offload-unbundler, {54}, object // CHK-PHASES: 56: offload, " (nvptx64-nvidia-cuda)" {55}, object -// CHK-PHASES: 57: input, "{{.*}}libsycl-itt-user-wrappers.o", object +// CHK-PHASES: 57: input, "{{.*}}libsycl-fallback-imf-fp64.o", object // CHK-PHASES: 58: clang-offload-unbundler, {57}, object // CHK-PHASES: 59: offload, " (nvptx64-nvidia-cuda)" {58}, object -// CHK-PHASES: 60: input, "{{.*}}libsycl-itt-compiler-wrappers.o", object +// CHK-PHASES: 60: input, "{{.*}}libsycl-fallback-imf-bf16.o", object // CHK-PHASES: 61: clang-offload-unbundler, {60}, object // CHK-PHASES: 62: offload, " (nvptx64-nvidia-cuda)" {61}, object -// CHK-PHASES: 63: input, "{{.*}}libsycl-itt-stubs.o", object +// CHK-PHASES: 63: input, "{{.*}}libsycl-itt-user-wrappers.o", object // CHK-PHASES: 64: clang-offload-unbundler, {63}, object // CHK-PHASES: 65: offload, " (nvptx64-nvidia-cuda)" {64}, object -// CHK-PHASES: 66: input, "{{.*}}nvidiacl{{.*}}", ir, (device-sycl, sm_35) -// CHK-PHASES: 67: input, "{{.*}}libdevice{{.*}}", ir, (device-sycl, sm_35) -// CHK-PHASES: 68: linker, {11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, 53, 56, 59, 62, 65, 66, 67}, ir, (device-sycl, sm_35) -// CHK-PHASES: 69: sycl-post-link, {68}, ir, (device-sycl, sm_35) -// CHK-PHASES: 70: file-table-tform, {69}, ir, (device-sycl, sm_35) -// CHK-PHASES: 71: backend, {70}, assembler, (device-sycl, sm_35) -// CHK-PHASES: 72: assembler, {71}, object, (device-sycl, sm_35) -// CHK-PHASES: 73: linker, {71, 72}, cuda-fatbin, (device-sycl, sm_35) -// CHK-PHASES: 74: foreach, {70, 73}, cuda-fatbin, (device-sycl, sm_35) -// CHK-PHASES: 75: file-table-tform, {69, 74}, tempfiletable, (device-sycl, sm_35) -// CHK-PHASES: 76: clang-offload-wrapper, {75}, object, (device-sycl, sm_35) -// CHK-PHASES: 77: offload, 
"host-sycl (x86_64-{{.*}})" {10}, "device-sycl (nvptx64-nvidia-cuda:sm_35)" {76}, image +// CHK-PHASES: 66: input, "{{.*}}libsycl-itt-compiler-wrappers.o", object +// CHK-PHASES: 67: clang-offload-unbundler, {66}, object +// CHK-PHASES: 68: offload, " (nvptx64-nvidia-cuda)" {67}, object +// CHK-PHASES: 69: input, "{{.*}}libsycl-itt-stubs.o", object +// CHK-PHASES: 70: clang-offload-unbundler, {69}, object +// CHK-PHASES: 71: offload, " (nvptx64-nvidia-cuda)" {70}, object +// CHK-PHASES: 72: input, "{{.*}}nvidiacl{{.*}}", ir, (device-sycl, sm_35) +// CHK-PHASES: 73: input, "{{.*}}libdevice{{.*}}", ir, (device-sycl, sm_35) +// CHK-PHASES: 74: linker, {11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, 53, 56, 59, 62, 65, 68, 71, 72, 73}, ir, (device-sycl, sm_35) +// CHK-PHASES: 75: sycl-post-link, {74}, ir, (device-sycl, sm_35) +// CHK-PHASES: 76: file-table-tform, {75}, ir, (device-sycl, sm_35) +// CHK-PHASES: 77: backend, {76}, assembler, (device-sycl, sm_35) +// CHK-PHASES: 78: assembler, {77}, object, (device-sycl, sm_35) +// CHK-PHASES: 79: linker, {77, 78}, cuda-fatbin, (device-sycl, sm_35) +// CHK-PHASES: 80: foreach, {76, 79}, cuda-fatbin, (device-sycl, sm_35) +// CHK-PHASES: 81: file-table-tform, {75, 80}, tempfiletable, (device-sycl, sm_35) +// CHK-PHASES: 82: clang-offload-wrapper, {81}, object, (device-sycl, sm_35) +// CHK-PHASES: 83: offload, "host-sycl (x86_64-{{.*}})" {10}, "device-sycl (nvptx64-nvidia-cuda:sm_35)" {82}, image /// Check calling preprocessor only // RUN: %clangxx -E -fsycl -fsycl-targets=nvptx64-nvidia-cuda -ccc-print-phases %s 2>&1 \ diff --git a/libdevice/cmake/modules/ImfSrcConcate.cmake b/libdevice/cmake/modules/ImfSrcConcate.cmake index 59e40736289f6..b6c9405948f3d 100644 --- a/libdevice/cmake/modules/ImfSrcConcate.cmake +++ b/libdevice/cmake/modules/ImfSrcConcate.cmake @@ -7,12 +7,18 @@ set(imf_fp32_fallback_src_list imf_utils/integer_misc.cpp set(imf_fp64_fallback_src_list imf_utils/double_convert.cpp 
imf/imf_inline_fp64.cpp) -if (FP64 STREQUAL 0) +set(imf_bf16_fallback_src_list imf_utils/bfloat16_convert.cpp + imf/imf_inline_bf16.cpp) + +if (IMF_TARGET STREQUAL "FP32") set(imf_fallback_src_list ${imf_fp32_fallback_src_list}) set(imf_fallback_dest ${DEST_DIR}/imf_fp32_fallback.cpp) -else() +elseif (IMF_TARGET STREQUAL "FP64") set(imf_fallback_src_list ${imf_fp64_fallback_src_list}) set(imf_fallback_dest ${DEST_DIR}/imf_fp64_fallback.cpp) +elseif (IMF_TARGET STREQUAL "BF16") + set(imf_fallback_src_list ${imf_bf16_fallback_src_list}) + set(imf_fallback_dest ${DEST_DIR}/imf_bf16_fallback.cpp) endif() set(flag 0) diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake index 8ad2fa8491874..7ae3543a789aa 100644 --- a/libdevice/cmake/modules/SYCLLibdevice.cmake +++ b/libdevice/cmake/modules/SYCLLibdevice.cmake @@ -99,7 +99,7 @@ endfunction() set(crt_obj_deps wrapper.h device.h spirv_vars.h sycl-compiler) set(complex_obj_deps device_complex.h device.h sycl-compiler) set(cmath_obj_deps device_math.h device.h sycl-compiler) -set(imf_obj_deps device_imf.hpp imf_half.hpp device.h sycl-compiler) +set(imf_obj_deps device_imf.hpp imf_half.hpp imf_bf16.hpp device.h sycl-compiler) set(itt_obj_deps device_itt.h spirv_vars.h device.h sycl-compiler) add_devicelib_obj(libsycl-itt-stubs SRC itt_stubs.cpp DEP ${itt_obj_deps}) @@ -113,6 +113,7 @@ add_devicelib_obj(libsycl-cmath SRC cmath_wrapper.cpp DEP ${cmath_obj_deps}) add_devicelib_obj(libsycl-cmath-fp64 SRC cmath_wrapper_fp64.cpp DEP ${cmath_obj_deps} ) add_devicelib_obj(libsycl-imf SRC imf_wrapper.cpp DEP ${imf_obj_deps}) add_devicelib_obj(libsycl-imf-fp64 SRC imf_wrapper_fp64.cpp DEP ${imf_obj_deps}) +add_devicelib_obj(libsycl-imf-bf16 SRC imf_wrapper_bf16.cpp DEP ${imf_obj_deps}) if(WIN32) add_devicelib_obj(libsycl-msvc-math SRC msvc_math.cpp DEP ${cmath_obj_deps}) endif() @@ -136,25 +137,38 @@ set(imf_fallback_fp32_deps device.h device_imf.hpp imf_half.hpp 
set(imf_fallback_fp64_deps device.h device_imf.hpp imf_half.hpp imf_utils/double_convert.cpp imf/imf_inline_fp64.cpp) +set(imf_fallback_bf16_deps device.h device_imf.hpp imf_bf16.hpp + imf_utils/bfloat16_convert.cpp + imf/imf_inline_bf16.cpp) + set(imf_fp32_fallback_src ${imf_fallback_src_dir}/imf_fp32_fallback.cpp) set(imf_fp64_fallback_src ${imf_fallback_src_dir}/imf_fp64_fallback.cpp) +set(imf_bf16_fallback_src ${imf_fallback_src_dir}/imf_bf16_fallback.cpp) + set(imf_host_cxx_flags -c -D__LIBDEVICE_HOST_IMPL__ ) add_custom_command(OUTPUT ${imf_fp32_fallback_src} COMMAND ${CMAKE_COMMAND} -D SRC_DIR=${imf_src_dir} -D DEST_DIR=${imf_fallback_src_dir} - -D FP64=0 + -D IMF_TARGET=FP32 -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/ImfSrcConcate.cmake DEPENDS ${imf_fallback_fp32_deps}) add_custom_command(OUTPUT ${imf_fp64_fallback_src} COMMAND ${CMAKE_COMMAND} -D SRC_DIR=${imf_src_dir} -D DEST_DIR=${imf_fallback_src_dir} - -D FP64=1 + -D IMF_TARGET=FP64 -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/ImfSrcConcate.cmake DEPENDS ${imf_fallback_fp64_deps}) +add_custom_command(OUTPUT ${imf_bf16_fallback_src} + COMMAND ${CMAKE_COMMAND} -D SRC_DIR=${imf_src_dir} + -D DEST_DIR=${imf_fallback_src_dir} + -D IMF_TARGET=BF16 + -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/ImfSrcConcate.cmake + DEPENDS ${imf_fallback_bf16_deps}) + add_custom_target(get_imf_fallback_fp32 DEPENDS ${imf_fp32_fallback_src}) add_custom_command(OUTPUT ${spv_binary_dir}/libsycl-fallback-imf.spv COMMAND ${clang} -fsycl-device-only -fno-sycl-use-bitcode @@ -205,6 +219,31 @@ add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp64-host.${lib-suffix} DEPENDS ${imf_fallback_fp64_deps} get_imf_fallback_fp64 sycl-compiler VERBATIM) +add_custom_target(get_imf_fallback_bf16 DEPENDS ${imf_bf16_fallback_src}) +add_custom_command(OUTPUT ${spv_binary_dir}/libsycl-fallback-imf-bf16.spv + COMMAND ${clang} -fsycl-device-only -fno-sycl-use-bitcode + ${compile_opts} -I ${CMAKE_CURRENT_SOURCE_DIR}/imf + 
${imf_bf16_fallback_src} + -o ${spv_binary_dir}/libsycl-fallback-imf-bf16.spv + DEPENDS ${imf_fallback_bf16_deps} get_imf_fallback_bf16 sycl-compiler + VERBATIM) + +add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-bf16.${lib-suffix} + COMMAND ${clang} -fsycl -c -I ${CMAKE_CURRENT_SOURCE_DIR}/imf + ${compile_opts} ${sycl_targets_opt} + ${imf_bf16_fallback_src} + -o ${obj_binary_dir}/libsycl-fallback-imf-bf16.${lib-suffix} + DEPENDS ${imf_fallback_bf16_deps} get_imf_fallback_bf16 sycl-compiler + VERBATIM) + +add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-bf16-host.${lib-suffix} + COMMAND ${clang} ${imf_host_cxx_flags} + -I ${CMAKE_CURRENT_SOURCE_DIR}/imf + ${imf_bf16_fallback_src} + -o ${obj_binary_dir}/fallback-imf-bf16-host.${lib-suffix} + DEPENDS ${imf_fallback_bf16_deps} get_imf_fallback_bf16 sycl-compiler + VERBATIM) + add_custom_target(imf_fallback_fp32_spv DEPENDS ${spv_binary_dir}/libsycl-fallback-imf.spv) add_custom_target(imf_fallback_fp32_obj DEPENDS ${obj_binary_dir}/libsycl-fallback-imf.${lib-suffix}) add_custom_target(imf_fallback_fp32_host_obj DEPENDS ${obj_binary_dir}/fallback-imf-fp32-host.${lib-suffix}) @@ -217,6 +256,12 @@ add_custom_target(imf_fallback_fp64_host_obj DEPENDS ${obj_binary_dir}/fallback- add_dependencies(libsycldevice-spv imf_fallback_fp64_spv) add_dependencies(libsycldevice-obj imf_fallback_fp64_obj) +add_custom_target(imf_fallback_bf16_spv DEPENDS ${spv_binary_dir}/libsycl-fallback-imf-bf16.spv) +add_custom_target(imf_fallback_bf16_obj DEPENDS ${obj_binary_dir}/libsycl-fallback-imf-bf16.${lib-suffix}) +add_custom_target(imf_fallback_bf16_host_obj DEPENDS ${obj_binary_dir}/fallback-imf-bf16-host.${lib-suffix}) +add_dependencies(libsycldevice-spv imf_fallback_bf16_spv) +add_dependencies(libsycldevice-obj imf_fallback_bf16_obj) + add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp32-host.${lib-suffix} COMMAND ${clang} ${imf_host_cxx_flags} ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper.cpp @@ -233,24 +278,41 @@ 
add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp64-host.${lib-suffix} DEPENDS ${imf_obj_deps} VERBATIM) +add_custom_command(OUTPUT ${obj_binary_dir}/imf-bf16-host.${lib-suffix} + COMMAND ${clang} ${imf_host_cxx_flags} + ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_bf16.cpp + -o ${obj_binary_dir}/imf-bf16-host.${lib-suffix} + MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_bf16.cpp + DEPENDS ${imf_obj_deps} + VERBATIM) + add_custom_target(imf_fp32_host_obj DEPENDS ${obj_binary_dir}/imf-fp32-host.${lib-suffix}) add_custom_target(imf_fp64_host_obj DEPENDS ${obj_binary_dir}/imf-fp64-host.${lib-suffix}) +add_custom_target(imf_bf16_host_obj DEPENDS ${obj_binary_dir}/imf-bf16-host.${lib-suffix}) + add_custom_target(imf_host_obj COMMAND ${llvm-ar} rcs ${obj_binary_dir}/${devicelib_host_static} ${obj_binary_dir}/imf-fp32-host.${lib-suffix} ${obj_binary_dir}/fallback-imf-fp32-host.${lib-suffix} ${obj_binary_dir}/imf-fp64-host.${lib-suffix} ${obj_binary_dir}/fallback-imf-fp64-host.${lib-suffix} - DEPENDS imf_fp32_host_obj imf_fallback_fp32_host_obj imf_fp64_host_obj imf_fallback_fp64_host_obj sycl-compiler + ${obj_binary_dir}/imf-bf16-host.${lib-suffix} + ${obj_binary_dir}/fallback-imf-bf16-host.${lib-suffix} + DEPENDS imf_fp32_host_obj imf_fallback_fp32_host_obj + DEPENDS imf_fp64_host_obj imf_fallback_fp64_host_obj + DEPENDS imf_bf16_host_obj imf_fallback_bf16_host_obj + DEPENDS sycl-compiler VERBATIM) add_dependencies(libsycldevice-obj imf_host_obj) install(FILES ${spv_binary_dir}/libsycl-fallback-imf.spv ${spv_binary_dir}/libsycl-fallback-imf-fp64.spv + ${spv_binary_dir}/libsycl-fallback-imf-bf16.spv DESTINATION ${install_dest_spv} COMPONENT libsycldevice) install(FILES ${obj_binary_dir}/libsycl-fallback-imf.${lib-suffix} ${obj_binary_dir}/libsycl-fallback-imf-fp64.${lib-suffix} + ${obj_binary_dir}/libsycl-fallback-imf-bf16.${lib-suffix} ${obj_binary_dir}/${devicelib_host_static} DESTINATION ${install_dest_lib} COMPONENT libsycldevice) diff --git 
a/libdevice/device_imf.hpp b/libdevice/device_imf.hpp index afc914892c52d..30873e5a531fb 100644 --- a/libdevice/device_imf.hpp +++ b/libdevice/device_imf.hpp @@ -10,6 +10,7 @@ #define __LIBDEVICE_DEVICE_IMF_H__ #include "device.h" +#include "imf_bf16.hpp" #include "imf_half.hpp" #include #include @@ -113,7 +114,7 @@ static inline float __fclamp(float x, float y, float z) { #endif } -// fma for float, double, half math, covers both device and host. +// fma for float, double, half, bf16 math, covers both device and host. static inline float __fma(float x, float y, float z) { #if defined(__LIBDEVICE_HOST_IMPL__) return __builtin_fmaf(x, y, z); @@ -145,7 +146,16 @@ static inline _iml_half __fma(_iml_half x, _iml_half y, _iml_half z) { #endif } -// sqrt for float, double, half math, covers both device and host. +// Currently, we used fp32 to emulate all bf16 arithmetic +static inline _iml_bf16 __fma(_iml_bf16 x, _iml_bf16 y, _iml_bf16 z) { + float tmp_x = __bfloat162float(x.get_internal()); + float tmp_y = __bfloat162float(y.get_internal()); + float tmp_z = __bfloat162float(z.get_internal()); + float res = __fma(tmp_x, tmp_y, tmp_z); + return _iml_bf16(res); +} + +// sqrt for float, double, half, bf16 math, covers both device and host. static inline float __sqrt(float x) { #if defined(__LIBDEVICE_HOST_IMPL__) return __builtin_sqrtf(x); @@ -173,7 +183,13 @@ static inline _iml_half __sqrt(_iml_half x) { #endif } -// rsqrt for float, double, half math, covers both device and host. +static inline _iml_bf16 __sqrt(_iml_bf16 x) { + float tmp_x = __bfloat162float(x.get_internal()); + float res = __sqrt(tmp_x); + return _iml_bf16(res); +} + +// rsqrt for float, double, half, bf16 math, covers both device and host. static inline float __rsqrt(float x) { #if defined(__LIBDEVICE_HOST_IMPL__) return 1.f / __builtin_sqrtf(x); @@ -201,7 +217,13 @@ static inline _iml_half __rsqrt(_iml_half x) { #endif } -// fmin for float, double, half math, covers both device and host. 
+static inline _iml_bf16 __rsqrt(_iml_bf16 x) { + float tmp_x = __bfloat162float(x.get_internal()); + float res = __rsqrt(tmp_x); + return _iml_bf16(res); +} + +// fmin for float, double, half, bf16 math, covers both device and host. static inline float __fmin(float x, float y) { #if defined(__LIBDEVICE_HOST_IMPL__) return __builtin_fminf(x, y); @@ -231,7 +253,14 @@ static inline _iml_half __fmin(_iml_half x, _iml_half y) { #endif } -// fmax for float, double, half math, covers both device and host. +static inline _iml_bf16 __fmin(_iml_bf16 x, _iml_bf16 y) { + float tmp_x = __bfloat162float(x.get_internal()); + float tmp_y = __bfloat162float(y.get_internal()); + float res = __fmin(tmp_x, tmp_y); + return _iml_bf16(res); +} + +// fmax for float, double, half, bf16 math, covers both device and host. static inline float __fmax(float x, float y) { #if defined(__LIBDEVICE_HOST_IMPL__) return __builtin_fmaxf(x, y); @@ -261,7 +290,14 @@ static inline _iml_half __fmax(_iml_half x, _iml_half y) { #endif } -// copysign for float, double, half math, covers both device and host. +static inline _iml_bf16 __fmax(_iml_bf16 x, _iml_bf16 y) { + float tmp_x = __bfloat162float(x.get_internal()); + float tmp_y = __bfloat162float(y.get_internal()); + float res = __fmax(tmp_x, tmp_y); + return _iml_bf16(res); +} + +// copysign for float, double, half, bf16 math, covers both device and host. static inline float __copysign(float x, float y) { #if defined(__LIBDEVICE_HOST_IMPL__) return __builtin_copysignf(x, y); @@ -291,7 +327,14 @@ static inline _iml_half __copysign(_iml_half x, _iml_half y) { #endif } -// fabs for float, double, half math, covers both device and host. +static inline _iml_bf16 __copysign(_iml_bf16 x, _iml_bf16 y) { + float tmp_x = __bfloat162float(x.get_internal()); + float tmp_y = __bfloat162float(y.get_internal()); + float res = __copysign(tmp_x, tmp_y); + return _iml_bf16(res); +} + +// fabs for float, double, half, bf16 math, covers both device and host. 
static inline float __fabs(float x) { #if defined(__LIBDEVICE_HOST_IMPL__) return __builtin_fabsf(x); @@ -319,7 +362,13 @@ static inline _iml_half __fabs(_iml_half x) { #endif } -// rint for float, double, half math, covers both device and host. +static inline _iml_bf16 __fabs(_iml_bf16 x) { + float tmp_x = __bfloat162float(x.get_internal()); + float res = __fabs(tmp_x); + return _iml_bf16(res); +} + +// rint for float, double, half, bf16 math, covers both device and host. static inline float __rint(float x) { #if defined(__LIBDEVICE_HOST_IMPL__) return __builtin_rintf(x); @@ -347,7 +396,13 @@ static inline _iml_half __rint(_iml_half x) { #endif } -// floor for float, double, half math, covers both device and host. +static inline _iml_bf16 __rint(_iml_bf16 x) { + float tmp_x = __bfloat162float(x.get_internal()); + float res = __rint(tmp_x); + return _iml_bf16(res); +} + +// floor for float, double, half, bf16 math, covers both device and host. static inline float __floor(float x) { #if defined(__LIBDEVICE_HOST_IMPL__) return __builtin_floorf(x); @@ -375,7 +430,13 @@ static inline _iml_half __floor(_iml_half x) { #endif } -// ceil for float, double, half math, covers both device and host. +static inline _iml_bf16 __floor(_iml_bf16 x) { + float tmp_x = __bfloat162float(x.get_internal()); + float res = __floor(tmp_x); + return _iml_bf16(res); +} + +// ceil for float, double, half, bf16 math, covers both device and host. static inline float __ceil(float x) { #if defined(__LIBDEVICE_HOST_IMPL__) return __builtin_ceilf(x); @@ -403,7 +464,13 @@ static inline _iml_half __ceil(_iml_half x) { #endif } -// trunc for float, double, half math, covers both device and host. +static inline _iml_bf16 __ceil(_iml_bf16 x) { + float tmp_x = __bfloat162float(x.get_internal()); + float res = __ceil(tmp_x); + return _iml_bf16(res); +} + +// trunc for float, double, half, bf16 math, covers both device and host. 
static inline float __trunc(float x) { #if defined(__LIBDEVICE_HOST_IMPL__) return __builtin_truncf(x); @@ -431,6 +498,12 @@ static inline _iml_half __trunc(_iml_half x) { #endif } +static inline _iml_bf16 __trunc(_iml_bf16 x) { + float tmp_x = __bfloat162float(x.get_internal()); + float res = __trunc(tmp_x); + return _iml_bf16(res); +} + static inline int __clz(int x) { if (x == 0) return 32; diff --git a/libdevice/imf/imf_inline_bf16.cpp b/libdevice/imf/imf_inline_bf16.cpp new file mode 100644 index 0000000000000..c7165a1ee0183 --- /dev/null +++ b/libdevice/imf/imf_inline_bf16.cpp @@ -0,0 +1,70 @@ +//==----- imf_inline_bf16.cpp - some bf16 trivial intel math functions -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "../device_imf.hpp" + +#ifdef __LIBDEVICE_IMF_ENABLED__ +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_fmabf16(_iml_bf16_internal a, + _iml_bf16_internal b, + _iml_bf16_internal c) { + return __fma(_iml_bf16(a), _iml_bf16(b), _iml_bf16(c)).get_internal(); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_sqrtbf16(_iml_bf16_internal a) { + return __sqrt(_iml_bf16(a)).get_internal(); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_rsqrtbf16(_iml_bf16_internal a) { + return __rsqrt(_iml_bf16(a)).get_internal(); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_fminbf16(_iml_bf16_internal a, + _iml_bf16_internal b) { + return __fmin(_iml_bf16(a), _iml_bf16(b)).get_internal(); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_fmaxbf16(_iml_bf16_internal a, + _iml_bf16_internal b) { + return __fmax(_iml_bf16(a), _iml_bf16(b)).get_internal(); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal 
__devicelib_imf_copysignbf16(_iml_bf16_internal a, + _iml_bf16_internal b) { + return __copysign(_iml_bf16(a), _iml_bf16(b)).get_internal(); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_fabsbf16(_iml_bf16_internal a) { + return __fabs(_iml_bf16(a)).get_internal(); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_rintbf16(_iml_bf16_internal a) { + return __rint(_iml_bf16(a)).get_internal(); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_floorbf16(_iml_bf16_internal a) { + return __floor(_iml_bf16(a)).get_internal(); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_ceilbf16(_iml_bf16_internal a) { + return __ceil(_iml_bf16(a)).get_internal(); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_truncbf16(_iml_bf16_internal a) { + return __trunc(_iml_bf16(a)).get_internal(); +} +#endif diff --git a/libdevice/imf_bf16.hpp b/libdevice/imf_bf16.hpp new file mode 100644 index 0000000000000..84c94def05837 --- /dev/null +++ b/libdevice/imf_bf16.hpp @@ -0,0 +1,182 @@ +//==------- imf_bf16.hpp - BFloat16 emulation for intel math functions -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//==------------------------------------------------------------------------==// + +#ifndef __LIBDEVICE_BF16_EMUL_H__ +#define __LIBDEVICE_BF16_EMUL_H__ + +#include "device.h" +#include + +// Currently, we use uint16_t to emulate BFloat16 for all device. 
+typedef uint16_t _iml_bf16_internal;
+
+// Widen a bf16 bit pattern to fp32. The bf16 pattern is the upper 16 bits
+// of the fp32 encoding, so shifting it into the high half (low mantissa
+// bits zero) converts exactly.
+static inline float __bfloat162float(_iml_bf16_internal b) {
+  return __builtin_bit_cast(float, static_cast<uint32_t>(b) << 16);
+}
+
+// Narrow fp32 to a bf16 bit pattern using the requested rounding mode.
+// Infinities, NaNs and signed zeros are passed through without rounding.
+static inline _iml_bf16_internal
+__float2bfloat16(float f, __iml_rounding_mode rounding_mode) {
+  uint32_t f32_bits = __builtin_bit_cast(uint32_t, f);
+  uint16_t bf16_sign = static_cast<uint16_t>((f32_bits & 0x80000000) >> 31);
+  uint16_t bf16_exp = static_cast<uint16_t>((f32_bits & 0x7F800000) >> 23);
+  uint32_t f_mant = f32_bits & 0x7FFFFF;
+  uint16_t bf16_mant = static_cast<uint16_t>(f_mant >> 16);
+
+  // +/-infinity and NaN
+  if (bf16_exp == 0xFF) {
+    if (!f_mant)
+      return bf16_sign ? 0xFF80 : 0x7F80;
+    // A NaN whose payload sits only in the 16 discarded mantissa bits would
+    // truncate to mantissa 0, which encodes infinity; force the top
+    // mantissa bit so the result stays a NaN.
+    if (!bf16_mant)
+      bf16_mant = 0x40;
+    return (bf16_sign << 15) | (bf16_exp << 7) | bf16_mant;
+  }
+
+  // +/-0
+  if (!bf16_exp && !f_mant)
+    return bf16_sign ? 0x8000 : 0x0;
+
+  // The low 16 mantissa bits are dropped; they decide whether the kept
+  // 7-bit mantissa is bumped by one ulp.
+  uint16_t mant_discard = static_cast<uint16_t>(f_mant & 0xFFFF);
+  switch (rounding_mode) {
+  case __IML_RTN: // toward -inf: only negative values grow in magnitude
+    if (bf16_sign && mant_discard)
+      bf16_mant++;
+    break;
+  case __IML_RTZ: // toward zero: plain truncation
+    break;
+  case __IML_RTP: // toward +inf: only positive values grow in magnitude
+    if (!bf16_sign && mant_discard)
+      bf16_mant++;
+    break;
+  case __IML_RTE: // to nearest, ties to even
+    if ((mant_discard > 0x8000) ||
+        ((mant_discard == 0x8000) && ((bf16_mant & 0x1) == 0x1)))
+      bf16_mant++;
+    break;
+  }
+
+  // A mantissa carry (0x7F + 1) moves into the exponent. If the exponent
+  // was 0xFE this yields exponent 0xFF with mantissa 0, i.e. infinity.
+  if (bf16_mant == 0x80) {
+    bf16_mant = 0;
+    bf16_exp++;
+  }
+
+  return (bf16_sign << 15) | (bf16_exp << 7) | bf16_mant;
+}
+
+// We convert bf16 to fp32 and do all arithmetic operations, then convert back.
+class _iml_bf16 {
+public:
+  // Built from a raw bit pattern as-is, or from a float rounded with RTE.
+  _iml_bf16(_iml_bf16_internal b) : _bf16_internal(b) {}
+  _iml_bf16() = default;
+  _iml_bf16(const _iml_bf16 &) = default;
+  _iml_bf16 &operator=(const _iml_bf16 &rh) = default;
+  _iml_bf16 &operator=(float fval) {
+    _bf16_internal = __float2bfloat16(fval, __IML_RTE);
+    return *this;
+  }
+  _iml_bf16(float fval) : _bf16_internal(__float2bfloat16(fval, __IML_RTE)) {}
+  explicit operator float() const { return __bfloat162float(_bf16_internal); }
+
+  _iml_bf16_internal get_internal() const { return _bf16_internal; }
+  // Equality compares raw bit patterns; the ordering operators below compare
+  // the widened float values instead.
+  // NOTE(review): so +0 != -0 and identical NaNs compare equal - confirm.
+  bool operator==(const _iml_bf16 &rh) const {
+    return _bf16_internal == rh._bf16_internal;
+  }
+  bool operator!=(const _iml_bf16 &rh) const { return !operator==(rh); }
+
+  // Compound assignment: compute in fp32, then round back to bf16 (RTE).
+  _iml_bf16 &operator+=(const _iml_bf16 &rh) {
+    *this = (operator float() + static_cast<float>(rh));
+    return *this;
+  }
+  _iml_bf16 &operator-=(const _iml_bf16 &rh) {
+    *this = (operator float() - static_cast<float>(rh));
+    return *this;
+  }
+  _iml_bf16 &operator*=(const _iml_bf16 &rh) {
+    *this = (operator float() * static_cast<float>(rh));
+    return *this;
+  }
+  _iml_bf16 &operator/=(const _iml_bf16 &rh) {
+    *this = (operator float() / static_cast<float>(rh));
+    return *this;
+  }
+  _iml_bf16 &operator++() {
+    *this = operator float() + 1.f;
+    return *this;
+  }
+  _iml_bf16 operator++(int) {
+    _iml_bf16 res(*this);
+    operator++();
+    return res;
+  }
+  _iml_bf16 &operator--() {
+    *this = operator float() - 1.f;
+    return *this;
+  }
+  _iml_bf16 operator--(int) {
+    _iml_bf16 res(*this);
+    operator--();
+    return res;
+  }
+
+  _iml_bf16 operator-() const { return _iml_bf16(-operator float()); }
+
+  bool operator<(const _iml_bf16 &rh) const {
+    return operator float() < static_cast<float>(rh);
+  }
+  bool operator>(const _iml_bf16 &rh) const {
+    return operator float() > static_cast<float>(rh);
+  }
+
+  // Binary arithmetic: copy the left operand, then reuse compound assignment.
+  _iml_bf16 operator+(const _iml_bf16 &rh) const {
+    _iml_bf16 res(*this);
+    res += rh;
+    return res;
+  }
+  _iml_bf16 operator-(const _iml_bf16 &rh) const {
+    _iml_bf16 res(*this);
+    res -= rh;
+    return res;
+  }
+  _iml_bf16 operator*(const _iml_bf16 &rh) const {
+    _iml_bf16 res(*this);
+    res *= rh;
+    return res;
+  }
+  _iml_bf16 operator/(const _iml_bf16 &rh) const {
+    _iml_bf16 res(*this);
+    res /= rh;
+    return res;
+  }
+  bool operator<=(const _iml_bf16 &rh) const {
+    return operator<(rh) || operator==(rh);
+  }
+  bool operator>=(const _iml_bf16 &rh) const {
+    return operator>(rh) || operator==(rh);
+  }
+
+private:
+  _iml_bf16_internal _bf16_internal;
+};
+#endif
diff --git a/libdevice/imf_utils/bfloat16_convert.cpp b/libdevice/imf_utils/bfloat16_convert.cpp
new file mode 100644
index 0000000000000..31637dae768fa
--- /dev/null
+++ b/libdevice/imf_utils/bfloat16_convert.cpp
@@ -0,0 +1,42 @@
+//==-- bfloat16_convert.cpp - fallback implementation of bfloat16 to other type
+// convert--==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../device_imf.hpp"
+
+#ifdef __LIBDEVICE_IMF_ENABLED__
+DEVICE_EXTERN_C_INLINE
+float __devicelib_imf_bfloat162float(_iml_bf16_internal b) {
+  return __bfloat162float(b);
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_bf16_internal __devicelib_imf_float2bfloat16(float f) {
+  return __float2bfloat16(f, __IML_RTE);
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_bf16_internal __devicelib_imf_float2bfloat16_rd(float f) {
+  return __float2bfloat16(f, __IML_RTN);
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_bf16_internal __devicelib_imf_float2bfloat16_rn(float f) {
+  return __float2bfloat16(f, __IML_RTE);
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_bf16_internal __devicelib_imf_float2bfloat16_ru(float f) {
+  return __float2bfloat16(f, __IML_RTP);
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_bf16_internal __devicelib_imf_float2bfloat16_rz(float f) {
+  return __float2bfloat16(f, __IML_RTZ);
+}
+#endif
diff --git a/libdevice/imf_wrapper_bf16.cpp b/libdevice/imf_wrapper_bf16.cpp
new
file mode 100644 index 0000000000000..21c9a2b3406a4 --- /dev/null +++ b/libdevice/imf_wrapper_bf16.cpp @@ -0,0 +1,156 @@ +//==----- imf_wrapper_bf16.cpp - wrappers for BFloat16 intel math library +// functions ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "imf_bf16.hpp" + +#ifdef __LIBDEVICE_IMF_ENABLED__ + +DEVICE_EXTERN_C_INLINE +float __devicelib_imf_bfloat162float(_iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE +float __imf_bfloat162float(_iml_bf16_internal b) { + return __devicelib_imf_bfloat162float(b); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_float2bfloat16(float); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_float2bfloat16(float f) { + return __devicelib_imf_float2bfloat16(f); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_float2bfloat16_rd(float); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_float2bfloat16_rd(float f) { + return __devicelib_imf_float2bfloat16_rd(f); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_float2bfloat16_rn(float); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_float2bfloat16_rn(float f) { + return __devicelib_imf_float2bfloat16_rn(f); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_float2bfloat16_ru(float); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_float2bfloat16_ru(float f) { + return __devicelib_imf_float2bfloat16_ru(f); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_float2bfloat16_rz(float); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_float2bfloat16_rz(float f) { + return __devicelib_imf_float2bfloat16_rz(f); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_fmabf16(_iml_bf16_internal, + _iml_bf16_internal, + 
_iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_fmabf16(_iml_bf16_internal a, _iml_bf16_internal b, + _iml_bf16_internal c) { + return __devicelib_imf_fmabf16(a, b, c); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_sqrtbf16(_iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_sqrtbf16(_iml_bf16_internal a) { + return __devicelib_imf_sqrtbf16(a); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_rsqrtbf16(_iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_rsqrtbf16(_iml_bf16_internal a) { + return __devicelib_imf_rsqrtbf16(a); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_fminbf16(_iml_bf16_internal, + _iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_fminbf16(_iml_bf16_internal a, _iml_bf16_internal b) { + return __devicelib_imf_fminbf16(a, b); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_fmaxbf16(_iml_bf16_internal, + _iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_fmaxbf16(_iml_bf16_internal a, _iml_bf16_internal b) { + return __devicelib_imf_fmaxbf16(a, b); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_fabsbf16(_iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_fabsbf16(_iml_bf16_internal a) { + return __devicelib_imf_fabsbf16(a); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_copysignbf16(_iml_bf16_internal, + _iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_copysignbf16(_iml_bf16_internal a, + _iml_bf16_internal b) { + return __devicelib_imf_copysignbf16(a, b); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_rintbf16(_iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_rintbf16(_iml_bf16_internal a) { + return __devicelib_imf_rintbf16(a); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_floorbf16(_iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE 
+_iml_bf16_internal __imf_floorbf16(_iml_bf16_internal a) { + return __devicelib_imf_floorbf16(a); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_ceilbf16(_iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_ceilbf16(_iml_bf16_internal a) { + return __devicelib_imf_ceilbf16(a); +} + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __devicelib_imf_truncbf16(_iml_bf16_internal); + +DEVICE_EXTERN_C_INLINE +_iml_bf16_internal __imf_truncbf16(_iml_bf16_internal a) { + return __devicelib_imf_truncbf16(a); +} +#endif // __LIBDEVICE_IMF_ENABLED__ diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index f03f9d489dcc7..83312731f6d18 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -474,6 +474,29 @@ SYCLDeviceLibFuncMap SDLMap = { DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_longlong_as_double", DeviceLibExt::cl_intel_devicelib_imf_fp64}, + {"__devicelib_imf_bfloat162float", + DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_float2bfloat16", + DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_float2bfloat16_rd", + DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_float2bfloat16_rn", + DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_float2bfloat16_ru", + DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_float2bfloat16_rz", + DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_fmabf16", DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_fmaxbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_fminbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_copysignbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_sqrtbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_rsqrtbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16}, + 
{"__devicelib_imf_fabsbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_rintbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_floorbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_ceilbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16}, + {"__devicelib_imf_truncbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16}, }; // Each fallback device library corresponds to one bit in "require mask" which @@ -488,6 +511,7 @@ SYCLDeviceLibFuncMap SDLMap = { // fallback-cstring: 0x20 // fallback-imf: 0x40 // fallback-imf-fp64: 0x80 +// fallback-imf-bf16: 0x100 uint32_t getDeviceLibBits(const std::string &FuncName) { auto DeviceLibFuncIter = SDLMap.find(FuncName); return ((DeviceLibFuncIter == SDLMap.end()) diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h index 15cae43da0779..4e340e9be6b19 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h @@ -34,6 +34,7 @@ enum class DeviceLibExt : std::uint32_t { cl_intel_devicelib_cstring, cl_intel_devicelib_imf, cl_intel_devicelib_imf_fp64, + cl_intel_devicelib_imf_bf16, }; uint32_t getSYCLDeviceLibReqMask(const Module &M); diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp index 4dbbec61409c8..d1fab63474931 100644 --- a/sycl/include/sycl/builtins.hpp +++ b/sycl/include/sycl/builtins.hpp @@ -1896,6 +1896,23 @@ extern SYCL_EXTERNAL _Float16 __imf_fmaxf16(_Float16 x, _Float16 y); extern SYCL_EXTERNAL _Float16 __imf_fminf16(_Float16 x, _Float16 y); extern SYCL_EXTERNAL _Float16 __imf_copysignf16(_Float16 x, _Float16 y); extern SYCL_EXTERNAL float __imf_half2float(_Float16 x); +extern SYCL_EXTERNAL float __imf_bfloat162float(uint16_t x); +extern SYCL_EXTERNAL uint16_t __imf_float2bfloat16(float x); +extern SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rd(float x); +extern SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rn(float x); +extern 
SYCL_EXTERNAL uint16_t __imf_float2bfloat16_ru(float x); +extern SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rz(float x); +extern SYCL_EXTERNAL uint16_t __imf_fmabf16(uint16_t x, uint16_t y, uint16_t z); +extern SYCL_EXTERNAL uint16_t __imf_fmaxbf16(uint16_t x, uint16_t y); +extern SYCL_EXTERNAL uint16_t __imf_fminbf16(uint16_t x, uint16_t y); +extern SYCL_EXTERNAL uint16_t __imf_fabsbf16(uint16_t x); +extern SYCL_EXTERNAL uint16_t __imf_rintbf16(uint16_t x); +extern SYCL_EXTERNAL uint16_t __imf_floorbf16(uint16_t x); +extern SYCL_EXTERNAL uint16_t __imf_ceilbf16(uint16_t x); +extern SYCL_EXTERNAL uint16_t __imf_truncbf16(uint16_t x); +extern SYCL_EXTERNAL uint16_t __imf_copysignbf16(uint16_t x, uint16_t y); +extern SYCL_EXTERNAL uint16_t __imf_sqrtbf16(uint16_t x); +extern SYCL_EXTERNAL uint16_t __imf_rsqrtbf16(uint16_t x); extern SYCL_EXTERNAL double __imf_fma(double x, double y, double z); extern SYCL_EXTERNAL double __imf_fabs(double x); extern SYCL_EXTERNAL double __imf_floor(double x); diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 7bea37764cd4f..45162fe108b0f 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -818,6 +818,8 @@ static const char *getDeviceLibFilename(DeviceLibExt Extension) { return "libsycl-fallback-imf.spv"; case DeviceLibExt::cl_intel_devicelib_imf_fp64: return "libsycl-fallback-imf-fp64.spv"; + case DeviceLibExt::cl_intel_devicelib_imf_bf16: + return "libsycl-fallback-imf-bf16.spv"; } throw compile_program_error("Unhandled (new?) 
device library extension", PI_ERROR_INVALID_OPERATION); @@ -841,6 +843,8 @@ static const char *getDeviceLibExtensionStr(DeviceLibExt Extension) { return "cl_intel_devicelib_imf"; case DeviceLibExt::cl_intel_devicelib_imf_fp64: return "cl_intel_devicelib_imf_fp64"; + case DeviceLibExt::cl_intel_devicelib_imf_bf16: + return "cl_intel_devicelib_imf_bf16"; } throw compile_program_error("Unhandled (new?) device library extension", PI_ERROR_INVALID_OPERATION); @@ -1005,7 +1009,8 @@ getDeviceLibPrograms(const ContextImplPtr Context, const RT::PiDevice &Device, {DeviceLibExt::cl_intel_devicelib_complex_fp64, false}, {DeviceLibExt::cl_intel_devicelib_cstring, false}, {DeviceLibExt::cl_intel_devicelib_imf, false}, - {DeviceLibExt::cl_intel_devicelib_imf_fp64, false}}; + {DeviceLibExt::cl_intel_devicelib_imf_fp64, false}, + {DeviceLibExt::cl_intel_devicelib_imf_bf16, false}}; // Disable all devicelib extensions requiring fp64 support if at least // one underlying device doesn't support cl_khr_fp64. diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 0a66a576abed7..522ee861f482c 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -66,6 +66,7 @@ enum class DeviceLibExt : std::uint32_t { cl_intel_devicelib_cstring, cl_intel_devicelib_imf, cl_intel_devicelib_imf_fp64, + cl_intel_devicelib_imf_bf16, }; // Provides single loading and building OpenCL programs with unique contexts