diff --git a/doc/math.qbk b/doc/math.qbk
index d6b90efb0b..67375da114 100644
--- a/doc/math.qbk
+++ b/doc/math.qbk
@@ -313,6 +313,7 @@ and use the function's name as the link text.]
 [def __sqrt1pm1 [link math_toolkit.powers.sqrt1pm1 sqrt1pm1]]
 [def __rsqrt [link math_toolkit.powers.rsqrt rsqrt]]
 [def __powm1 [link math_toolkit.powers.powm1 powm1]]
+[def __pow1p [link math_toolkit.powers.pow1p pow1p]]
 [def __hypot [link math_toolkit.powers.hypot hypot]]
 [def __pow [link math_toolkit.powers.ct_pow pow]]
diff --git a/doc/sf/pow1p.qbk b/doc/sf/pow1p.qbk
new file mode 100644
index 0000000000..a2201d77ae
--- /dev/null
+++ b/doc/sf/pow1p.qbk
@@ -0,0 +1,34 @@
+[/
+  Copyright Matt Borland 2024
+  Distributed under the Boost Software License, Version 1.0.
+  (See accompanying file LICENSE_1_0.txt or copy at
+  http://www.boost.org/LICENSE_1_0.txt).
+]
+
+[section:pow1p pow1p]
+
+[h4 Synopsis]
+
+   #include <boost/math/special_functions/pow1p.hpp>
+
+   namespace boost {
+   namespace math {
+
+   template <typename T1, typename T2, typename Policy>
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+   pow1p(const T1 x, const T2 y, const Policy& pol)
+
+   template <typename T1, typename T2>
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+   pow1p(const T1 x, const T2 y)
+
+   }} // namespaces
+
+The function `pow1p` computes (1 + x)[super y ] where x and y are real numbers.
+It is useful whenever 1 must be added to x before raising the sum to the power y,
+and it performs that computation more accurately than the naive expression,
+especially for small values of x. When x is very small, evaluating 1 + x directly
+rounds away the low-order bits of x, so `pow(1 + x, y)` can lose precision;
+`pow1p` mitigates this by handling such cases internally with extra care.
+
+[endsect]
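A quick usage sketch (illustrative only, not part of the patch; it assumes nothing beyond the header added above): for tiny x the naive expression `pow(1.0 + x, y)` rounds 1.0 + x to exactly 1.0 and so loses x entirely, while pow1p recovers (1 + x)^y ~= exp(y*x).

    #include <boost/math/special_functions/pow1p.hpp>
    #include <cmath>
    #include <iostream>

    int main()
    {
        const double x = 1e-17;  // so small that 1.0 + x rounds to exactly 1.0
        const double y = 1e18;   // (1 + x)^y == exp(y * log1p(x)) ~= exp(10)

        std::cout << std::pow(1.0 + x, y) << '\n';      // prints 1: x was rounded away
        std::cout << boost::math::pow1p(x, y) << '\n';  // ~22026.5, i.e. exp(10)
    }

The three-argument overload additionally accepts a Boost.Math policy, which controls, for example, how the domain error raised by pow1p(-1.0, -2.0) (a division by zero) is reported.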
diff --git a/include/boost/math/special_functions.hpp b/include/boost/math/special_functions.hpp
index d3aa95c5c1..3275ab3d2b 100644
--- a/include/boost/math/special_functions.hpp
+++ b/include/boost/math/special_functions.hpp
@@ -79,6 +79,7 @@
 #include <boost/math/special_functions/polygamma.hpp>
 #include <boost/math/special_functions/pow.hpp>
 #include <boost/math/special_functions/powm1.hpp>
+#include <boost/math/special_functions/pow1p.hpp>
 #ifndef BOOST_MATH_NO_EXCEPTIONS
 #include <boost/math/special_functions/nonfinite_num_facets.hpp>
 #endif
diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp
new file mode 100644
index 0000000000..1b718f99bf
--- /dev/null
+++ b/include/boost/math/special_functions/pow1p.hpp
@@ -0,0 +1,269 @@
+// (C) Copyright Matt Borland 2024.
+// (C) Copyright Fancidev 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef BOOST_MATH_SF_POW1P_HPP
+#define BOOST_MATH_SF_POW1P_HPP
+
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/promotion.hpp>
+#include <boost/math/tools/type_traits.hpp>
+#include <boost/math/tools/numeric_limits.hpp>
+#include <boost/math/policies/policy.hpp>
+#include <boost/math/policies/error_handling.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/sign.hpp>
+
+// For cuda we would rather use the builtin nextafter than the unsupported
+// boost::math::nextafter
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
+#  include <boost/math/special_functions/next.hpp>
+#  include <boost/math/tools/is_detected.hpp>
+#  include <cmath>
+#endif // BOOST_MATH_HAS_GPU_SUPPORT
+
+namespace boost {
+namespace math {
+
+namespace detail {
+
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
+
+template <typename T>
+using fma_t = decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()));
+
+template <typename T>
+constexpr bool is_fma_detected_v = boost::math::tools::is_detected<fma_t, T>::value;
+
+template <typename T, std::enable_if_t<is_fma_detected_v<T>, bool> = true>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z)
+{
+    using std::fma;
+    return fma(x, y, z);
+}
+
+template <typename T, std::enable_if_t<!is_fma_detected_v<T>, bool> = true>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z)
+{
+    return x * y + z;
+}
+
+#else
+
+template <typename T>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z)
+{
+    return ::fma(x, y, z);
+}
+
+template <>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED float local_fma(const float x, const float y, const float z)
+{
+    return ::fmaf(x, y, z);
+}
+
+#endif // BOOST_MATH_HAS_GPU_SUPPORT
+
+template <typename T, typename Policy>
+BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
+{
+    BOOST_MATH_STD_USING
+    constexpr auto function = "boost::math::pow1p<%1%>(%1%, %1%)";
+
+    // The special values follow the spec of the pow() function defined in
+    // IEEE-754, Section 9.2.1. The order of the `if` statements matters.
+    if (abs(y) == T(0))
+    {
+        // pow(x, +/-0)
+        return T(1);
+    }
+
+    if (x == T(-1))
+    {
+        // pow(+/-0, y)
+        if ((boost::math::isfinite)(y) && y < 0)
+        {
+            return boost::math::policies::raise_domain_error<T>(function, "Division by 0", x, pol);
+        }
+
+        // Gets correct special handling
+        return pow(T(0), y);
+    }
+
+    if (x == T(-2) && (boost::math::isinf)(y))
+    {
+        // pow(-1, +/-inf)
+        return T(1);
+    }
+
+    if (x == T(0))
+    {
+        // pow(+1, y)
+        return T(1);
+    }
+
+    if ((boost::math::isinf)(y))
+    {
+        if ((boost::math::signbit)(y) == 1)
+        {
+            // pow(x, -inf)
+            return T(0);
+        }
+        else
+        {
+            // pow(x, +inf)
+            return y;
+        }
+    }
+
+    if ((boost::math::isinf)(x))
+    {
+        // pow(+/-inf, y)
+        return pow(x, y);
+    }
+
+    // Up to this point, (1+x) = {+/-0, +/-1, +/-inf} have been handled,
+    // and y = {+/-0, +/-inf} have been handled. Next, we handle `nan`
+    // and y = {+/-1}.
+    if ((boost::math::isnan)(x))
+    {
+        return x;
+    }
+    else if ((boost::math::isnan)(y))
+    {
+        return y;
+    }
+
+    if (y == T(1))
+    {
+        return T(1) + x;
+    }
+    if (y == T(-1))
+    {
+        return T(1) / (T(1) + x); // guaranteed no overflow
+    }
+
+    // Handle (1+x) < 0
+    if (x < -1)
+    {
+        // TODO(fancidev): maybe use (1+x)^y == [(1+x)^2]^(y/2)
+        return pow(1 + x, y);
+    }
+
+    // Now x, y are finite and not equal to 0 or +/-1, and x > -1.
+    // To compute (1+x)^y, write (1+x) == (s+t) where s is equal to (1+x)
+    // rounded toward 1, and t is the (exact) rounding error.
+    T s, t;
+    if (x < 0)
+    {
+        s = T(1) + x;
+        t = x - (s - T(1));
+        if (t > 0)
+        {
+            #ifdef BOOST_MATH_HAS_GPU_SUPPORT
+            s = ::nextafter(s, T(1));
+            #else
+            s = boost::math::nextafter(s, T(1));
+            #endif
+
+            t = x - (s - T(1));
+        }
+    }
+    else if (x < 1)
+    {
+        s = T(1) + x;
+        t = x - (s - T(1));
+        if (t < 0)
+        {
+            #ifdef BOOST_MATH_HAS_GPU_SUPPORT
+            s = ::nextafter(s, T(0));
+            #else
+            s = boost::math::nextafter(s, T(0));
+            #endif
+
+            t = x - (s - T(1));
+        }
+    }
+    else
+    {
+        s = x + T(1);
+        t = T(1) - (s - x);
+        if (t < 0)
+        {
+            #ifdef BOOST_MATH_HAS_GPU_SUPPORT
+            s = ::nextafter(s, T(0));
+            #else
+            s = boost::math::nextafter(s, T(0));
+            #endif
+
+            t = T(1) - (s - x);
+        }
+    }
+
+    // Because x > -1 and s is rounded toward 1, s is guaranteed to be > 0.
+    // Write (1+x)^y == (s+t)^y == (s^y)*((1+t/s)^y) == term1*term2.
+    // It can be shown that either both terms <= 1 or both >= 1; so
+    // if the first term over/underflows, then the result over/underflows.
+    // And of course, term2 == 1 if t == 0.
+    T term1 = pow(s, y);
+    if (t == T(0) || term1 == T(0) || (boost::math::isinf)(term1))
+    {
+        return term1;
+    }
+
+    // (1+t/s)^y == exp(y*log(1+t/s)). The relative error of the result is
+    // equal to the absolute error of the exponent (plus the relative error
+    // of 'exp'). Therefore, when the exponent is small, it is accurately
+    // evaluated to machine epsilon using T arithmetic. In addition, since
+    // t/s <= epsilon, log(1+t/s) is well approximated by t/s to first order.
+    T u = t / s;
+    T w = y * u;
+    if (abs(w) <= T(0.5))
+    {
+        T term2 = exp(w);
+        return term1 * term2;
+    }
+
+    // Now y*log(1+t/s) is large, and its relative error is "magnified" by
+    // the exponent. To reduce the error, we use double-T arithmetic, and
+    // expand log(1+t/s) to second order.
+
+    // (u + uu) ~= t/s.
+    T r1 = local_fma(-u, s, t);
+    T uu = r1 / s;
+
+    // (u + vv) ~= log(1+(u+uu)) ~= log(1+t/s).
+    T vv = local_fma(T(-0.5) * u, u, uu);
+
+    // (w + ww) ~= y*(u+vv) ~= y*log(1+t/s).
+    T r2 = local_fma(y, u, -w);
+    T ww = local_fma(y, vv, r2);
+
+    // TODO: maybe ww is small enough such that exp(ww) ~= 1+ww.
+    T term2 = exp(w) * exp(ww);
+    return term1 * term2;
+}
+
+} // namespace detail
+
+template <typename T1, typename T2, typename Policy>
+BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+pow1p(const T1 x, const T2 y, const Policy& pol)
+{
+    using result_type = tools::promote_args_t<T1, T2>;
+    return detail::pow1p_imp(static_cast<result_type>(x), static_cast<result_type>(y), pol);
+}
+
+template <typename T1, typename T2>
+BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+pow1p(const T1 x, const T2 y)
+{
+    using result_type = tools::promote_args_t<T1, T2>;
+    return detail::pow1p_imp(static_cast<result_type>(x), static_cast<result_type>(y), policies::policy<>());
+}
+
+} // namespace math
+} // namespace boost
+
+#endif // BOOST_MATH_SF_POW1P_HPP
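The core of pow1p_imp above is the exact splitting 1 + x == s + t followed by the factorization (s + t)^y == s^y * (1 + t/s)^y. The following standalone sketch (illustrative only, not part of the patch) shows that idea in bare double arithmetic; it deliberately omits the nextafter adjustment and the second-order double-T correction that the real implementation adds for large exponents.

    #include <cmath>
    #include <iostream>

    // Split 1 + x into s (the rounded sum) and t (the exact rounding error),
    // then evaluate (s + t)^y as s^y * exp(y * t/s). For |x| < 1 the error
    // term t = x - (s - 1) is exact (a Fast2Sum step with operands 1 and x).
    double pow1p_sketch(double x, double y)
    {
        const double s = 1.0 + x;
        const double t = x - (s - 1.0);
        const double term1 = std::pow(s, y);
        return term1 * std::exp(y * (t / s));
    }

    int main()
    {
        const double x = 3e-17, y = 1e17;
        std::cout << pow1p_sketch(x, y) << '\n';    // ~20.0855, i.e. exp(3)
        std::cout << std::pow(1.0 + x, y) << '\n';  // 1: x was lost in 1.0 + x
    }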
diff --git a/test/Jamfile.v2 b/test/Jamfile.v2
index 85ddf38f9b..18018d63af 100644
--- a/test/Jamfile.v2
+++ b/test/Jamfile.v2
@@ -128,6 +128,7 @@ test-suite special_fun :
    [ run hypot_test.cpp test_instances//test_instances pch_light ../../test/build//boost_unit_test_framework ]
    [ run pow_test.cpp ../../test/build//boost_unit_test_framework ]
+   [ run test_pow1p.cpp ]
    [ run logaddexp_test.cpp ../../test/build//boost_unit_test_framework ]
    [ run logsumexp_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx11_hdr_initializer_list cxx11_variadic_templates ] ]
    [ run ccmath_sqrt_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr ] [ check-target-builds ../config//has_float128 "GCC libquadmath and __float128 support" : <define>BOOST_MATH_TEST_FLOAT128 <linkflags>"-Bstatic -lquadmath -Bdynamic" ] ]
diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 64fac76fba..2829aa19ee 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -171,3 +171,5 @@ run test_trigamma_float.cu ;
 run test_trunc_double.cu ;
 run test_trunc_float.cu ;
+run test_pow1p_double.cu ;
+run test_pow1p_float.cu ;
diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index de235822ef..d636fc67be 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -168,3 +168,4 @@ run test_trigamma_nvrtc_double.cpp ;
 run test_trigamma_nvrtc_float.cpp ;
 run test_trunc_nvrtc_double.cpp ;
+run test_pow1p_nvrtc_double.cpp ;
diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index 97c48474ca..3dd7d054af 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -38,3 +38,4 @@ run test_digamma_simple.cpp ;
 run test_trigamma.cpp ;
 run test_erf.cpp ;
 run test_gamma.cpp ;
+run test_pow1p.cpp ;
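One note on the header above: on host builds local_fma picks a fused fma when one is callable for T and otherwise falls back to x * y + z, using the detection idiom. A minimal self-contained sketch of that dispatch follows (illustrative only; the stand-in detector plays the role of boost::math::tools::is_detected, and my_real is a hypothetical type with no fma overload).

    #include <cmath>
    #include <iostream>
    #include <type_traits>
    #include <utility>

    using std::fma;  // make the std overloads visible to the detector below

    // Stand-in for boost::math::tools::is_detected (the usual detection idiom).
    template <typename, template <typename...> class Op, typename... Args>
    struct detector : std::false_type {};
    template <template <typename...> class Op, typename... Args>
    struct detector<std::void_t<Op<Args...>>, Op, Args...> : std::true_type {};
    template <template <typename...> class Op, typename... Args>
    using is_detected = detector<void, Op, Args...>;

    // Well formed only if an unqualified fma(T, T, T) call exists.
    template <typename T>
    using fma_t = decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()));

    template <typename T, std::enable_if_t<is_detected<fma_t, T>::value, bool> = true>
    T local_fma(T x, T y, T z) { return fma(x, y, z); }  // fused, rounds once

    template <typename T, std::enable_if_t<!is_detected<fma_t, T>::value, bool> = true>
    T local_fma(T x, T y, T z) { return x * y + z; }     // fallback, rounds twice

    struct my_real { double v; };  // hypothetical type without an fma overload
    my_real operator*(my_real a, my_real b) { return {a.v * b.v}; }
    my_real operator+(my_real a, my_real b) { return {a.v + b.v}; }

    int main()
    {
        std::cout << local_fma(2.0, 3.0, 1.0) << '\n';                        // 7, fused path
        std::cout << local_fma(my_real{2}, my_real{3}, my_real{1}).v << '\n'; // 7, fallback
    }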
diff --git a/test/test_pow1p.cpp b/test/test_pow1p.cpp
new file mode 100644
index 0000000000..9b5ba7c8e7
--- /dev/null
+++ b/test/test_pow1p.cpp
@@ -0,0 +1,122 @@
+// (C) Copyright Matt Borland 2024.
+// (C) Copyright Fancidev 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/pow1p.hpp>
+#include <cmath>
+#include <random>
+#include <stdexcept>
+
+#if __has_include(<stdfloat>) && !defined(BOOST_MATH_HAS_GPU_SUPPORT)
+#  include <stdfloat>
+#endif
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
+#  include <boost/math/concepts/real_concept.hpp>
+#endif
+
+#include "math_unit_test.hpp"
+
+template <typename T>
+void test()
+{
+    using std::pow;
+
+    // First we hit all the special cases
+    // pow(x, +/-0)
+    CHECK_EQUAL(boost::math::pow1p(T(1), T(0)), T(1));
+
+    // pow(0, y)
+    #ifndef BOOST_MATH_NO_EXCEPTIONS
+    CHECK_THROW(boost::math::pow1p(T(-1), T(-1)), std::domain_error);
+    #endif
+    CHECK_EQUAL(boost::math::pow1p(T(-1), T(1)), T(0));
+
+    // pow(-1, inf)
+    CHECK_EQUAL(boost::math::pow1p(T(-2), boost::math::numeric_limits<T>::infinity()), T(1));
+
+    // pow(1, y)
+    CHECK_EQUAL(boost::math::pow1p(T(0), T(2)), T(1));
+
+    // pow(x, +/-inf)
+    BOOST_MATH_IF_CONSTEXPR (boost::math::numeric_limits<T>::has_infinity)
+    {
+        CHECK_EQUAL(boost::math::pow1p(T(5), -boost::math::numeric_limits<T>::infinity()), T(0));
+        CHECK_EQUAL(boost::math::pow1p(T(5), boost::math::numeric_limits<T>::infinity()), boost::math::numeric_limits<T>::infinity());
+
+        // pow(+/-inf, y)
+        CHECK_EQUAL(boost::math::pow1p(boost::math::numeric_limits<T>::infinity(), T(2)), boost::math::numeric_limits<T>::infinity());
+        CHECK_EQUAL(boost::math::pow1p(-boost::math::numeric_limits<T>::infinity(), T(2)), boost::math::numeric_limits<T>::infinity());
+    }
+
+    // NANs for x and y
+    BOOST_MATH_IF_CONSTEXPR (boost::math::numeric_limits<T>::has_quiet_NaN)
+    {
+        CHECK_EQUAL(boost::math::isnan(boost::math::pow1p(boost::math::numeric_limits<T>::quiet_NaN(), T(1))), true);
+        CHECK_EQUAL(boost::math::isnan(boost::math::pow1p(T(1), boost::math::numeric_limits<T>::quiet_NaN())), true);
+    }
+
+    // pow(x, +/-1)
+    CHECK_ULP_CLOSE(boost::math::pow1p(T(2), T(1)), pow(T(3), T(1)), 10);
+    CHECK_ULP_CLOSE(boost::math::pow1p(T(2), T(-1)), pow(T(3), T(-1)), 10);
+
+    // (1+x) < 0
+    CHECK_ULP_CLOSE(boost::math::pow1p(T(-3), T(2)), pow(T(-2), T(2)), 10);
+
+    // x < 0
+    std::mt19937_64 rng;
+    std::uniform_real_distribution<double> dist(-1, 0);
+    std::uniform_real_distribution<double> dist_y(0, 10);
+    constexpr int N = 1024;
+    for (int i = 0; i < N; ++i)
+    {
+        const auto x = static_cast<T>(dist(rng));
+        const auto y = static_cast<T>(dist_y(rng));
+
+        CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100);
+    }
+
+    // 0 < x < 1
+    std::uniform_real_distribution<double> dist_x_1(0, 1);
+    for (int i = 0; i < N; ++i)
+    {
+        const auto x = static_cast<T>(dist_x_1(rng));
+        const auto y = static_cast<T>(dist_y(rng));
+
+        CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100);
+    }
+
+    // Else
+    std::uniform_real_distribution<double> dist_other_x(1, 1000);
+    for (int i = 0; i < N; ++i)
+    {
+        const auto x = static_cast<T>(dist_other_x(rng));
+        const auto y = static_cast<T>(dist_y(rng));
+
+        CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100);
+    }
+}
+
+int main()
+{
+    #ifdef __STDCPP_FLOAT32_T__
+    test<std::float32_t>();
+    #else
+    test<float>();
+    #endif
+
+    #ifdef __STDCPP_FLOAT64_T__
+    test<std::float64_t>();
+    #else
+    test<double>();
+    #endif
+
+    #ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS
+    test<long double>();
+    #endif
+
+    #ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
+    test<boost::math::concepts::real_concept>();
+    #endif
+
+    return boost::math::test::report_errors();
+}
diff --git a/test/test_pow1p_double.cu b/test/test_pow1p_double.cu
new file mode 100644
index 0000000000..c7746bd3e7
--- /dev/null
+++ b/test/test_pow1p_double.cu
@@ -0,0 +1,111 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/pow1p.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <random>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    std::mt19937_64 rng;
+    std::uniform_real_distribution<float_type> x_vals(-1, 1);
+    std::uniform_real_distribution<float_type> y_vals(-10, 10);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = x_vals(rng);
+        input_vector2[i] = y_vals(rng);
+    }
+
+    // Launch the pow1p CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::math::pow1p(input_vector1[i], input_vector2[i]));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_pow1p_float.cu b/test/test_pow1p_float.cu
new file mode 100644
index 0000000000..0ecdf0606c
--- /dev/null
+++ b/test/test_pow1p_float.cu
@@ -0,0 +1,111 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/pow1p.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <random>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    std::mt19937_64 rng;
+    std::uniform_real_distribution<float_type> x_vals(-1, 1);
+    std::uniform_real_distribution<float_type> y_vals(-10, 10);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = x_vals(rng);
+        input_vector2[i] = y_vals(rng);
+    }
+
+    // Launch the pow1p CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::math::pow1p(input_vector1[i], input_vector2[i]));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_pow1p_nvrtc_double.cpp b/test/test_pow1p_nvrtc_double.cpp
new file mode 100644
index 0000000000..d9f97d94c0
--- /dev/null
+++ b/test/test_pow1p_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/pow1p.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <cmath>
+#include <cstdlib>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/pow1p.hpp>
+extern "C" __global__
+void test_trunc_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trunc_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_trunc_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_trunc_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> x_vals(-1, 1);
+        std::uniform_real_distribution<float_type> y_vals(-10, 10);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(x_vals(rng));
+            h_in2[i] = static_cast<float_type>(y_vals(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::pow1p(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_pow1p_nvrtc_float.cpp b/test/test_pow1p_nvrtc_float.cpp
new file mode 100644
index 0000000000..d9f97d94c0
--- /dev/null
+++ b/test/test_pow1p_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/pow1p.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <cmath>
+#include <cstdlib>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/pow1p.hpp>
+extern "C" __global__
+void test_trunc_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trunc_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_trunc_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_trunc_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> x_vals(-1, 1);
+        std::uniform_real_distribution<float_type> y_vals(-10, 10);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(x_vals(rng));
+            h_in2[i] = static_cast<float_type>(y_vals(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::pow1p(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}