diff --git a/doc/math.qbk b/doc/math.qbk
index d6b90efb0b..67375da114 100644
--- a/doc/math.qbk
+++ b/doc/math.qbk
@@ -313,6 +313,7 @@ and use the function's name as the link text.]
 [def __sqrt1pm1 [link math_toolkit.powers.sqrt1pm1 sqrt1pm1]]
 [def __rsqrt [link math_toolkit.powers.rsqrt rsqrt]]
 [def __powm1 [link math_toolkit.powers.powm1 powm1]]
+[def __pow1p [link math_toolkit.powers.pow1p pow1p]]
 [def __hypot [link math_toolkit.powers.hypot hypot]]
 [def __pow [link math_toolkit.powers.ct_pow pow]]
diff --git a/doc/sf/pow1p.qbk b/doc/sf/pow1p.qbk
new file mode 100644
index 0000000000..a2201d77ae
--- /dev/null
+++ b/doc/sf/pow1p.qbk
@@ -0,0 +1,34 @@
+[/
+  Copyright Matt Borland 2024
+  Distributed under the Boost Software License, Version 1.0.
+  (See accompanying file LICENSE_1_0.txt or copy at
+  http://www.boost.org/LICENSE_1_0.txt).
+]
+
+[section:pow1p pow1p]
+
+[h4 Synopsis]
+
+   #include <boost/math/special_functions/pow1p.hpp>
+
+   namespace boost {
+   namespace math {
+
+   template <typename T1, typename T2, typename Policy>
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+   pow1p(const T1 x, const T2 y, const Policy& pol)
+
+   template <typename T1, typename T2>
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+   pow1p(const T1 x, const T2 y)
+
+   }} // namespaces
+
+The function `pow1p` computes (1 + x)[super y ] where x and y are real numbers.
+It is useful whenever 1 must be added to x before raising the sum to the power y,
+and it performs that computation more accurately than the naive expression,
+especially for small values of x. When x is very small, evaluating 1 + x directly
+rounds away the low-order bits of x, so `pow(1 + x, y)` can lose precision;
+`pow1p` mitigates this by handling such cases internally with extra care.
+
+[endsect]
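A quick usage sketch (illustrative only, not part of the patch; it assumes nothing beyond the header added above): for tiny x the naive expression `pow(1.0 + x, y)` rounds 1.0 + x to exactly 1.0 and so loses x entirely, while pow1p recovers (1 + x)^y ~= exp(y*x).

    #include <boost/math/special_functions/pow1p.hpp>
    #include <cmath>
    #include <iostream>

    int main()
    {
        const double x = 1e-17;  // so small that 1.0 + x rounds to exactly 1.0
        const double y = 1e18;   // (1 + x)^y == exp(y * log1p(x)) ~= exp(10)

        std::cout << std::pow(1.0 + x, y) << '\n';      // prints 1: x was rounded away
        std::cout << boost::math::pow1p(x, y) << '\n';  // ~22026.5, i.e. exp(10)
    }

The three-argument overload additionally accepts a Boost.Math policy, which controls, for example, how the domain error raised by pow1p(-1.0, -2.0) (a division by zero) is reported.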
diff --git a/include/boost/math/special_functions.hpp b/include/boost/math/special_functions.hpp
index d3aa95c5c1..3275ab3d2b 100644
--- a/include/boost/math/special_functions.hpp
+++ b/include/boost/math/special_functions.hpp
@@ -79,6 +79,7 @@
 #include <boost/math/special_functions/polygamma.hpp>
 #include <boost/math/special_functions/pow.hpp>
 #include <boost/math/special_functions/powm1.hpp>
+#include <boost/math/special_functions/pow1p.hpp>
 #ifndef BOOST_MATH_NO_EXCEPTIONS
 #include <boost/math/special_functions/nonfinite_num_facets.hpp>
 #endif
diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp
new file mode 100644
index 0000000000..1b718f99bf
--- /dev/null
+++ b/include/boost/math/special_functions/pow1p.hpp
@@ -0,0 +1,269 @@
+// (C) Copyright Matt Borland 2024.
+// (C) Copyright Fancidev 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef BOOST_MATH_SF_POW1P_HPP
+#define BOOST_MATH_SF_POW1P_HPP
+
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/promotion.hpp>
+#include <boost/math/tools/type_traits.hpp>
+#include <boost/math/tools/numeric_limits.hpp>
+#include <boost/math/policies/policy.hpp>
+#include <boost/math/policies/error_handling.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/sign.hpp>
+
+// For cuda we would rather use the builtin nextafter than the unsupported
+// boost::math::nextafter
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
+#  include <boost/math/special_functions/next.hpp>
+#  include <boost/math/tools/is_detected.hpp>
+#  include <cmath>
+#endif // BOOST_MATH_HAS_GPU_SUPPORT
+
+namespace boost {
+namespace math {
+
+namespace detail {
+
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
+
+template <typename T>
+using fma_t = decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()));
+
+template <typename T>
+constexpr bool is_fma_detected_v = boost::math::tools::is_detected<fma_t, T>::value;
+
+template <typename T, std::enable_if_t<is_fma_detected_v<T>, bool> = true>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z)
+{
+    using std::fma;
+    return fma(x, y, z);
+}
+
+template <typename T, std::enable_if_t<!is_fma_detected_v<T>, bool> = true>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z)
+{
+    return x * y + z;
+}
+
+#else
+
+template <typename T>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z)
+{
+    return ::fma(x, y, z);
+}
+
+template <>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED float local_fma(const float x, const float y, const float z)
+{
+    return ::fmaf(x, y, z);
+}
+
+#endif // BOOST_MATH_HAS_GPU_SUPPORT
+
+template <typename T, typename Policy>
+BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
+{
+    BOOST_MATH_STD_USING
+    constexpr auto function = "boost::math::pow1p<%1%>(%1%, %1%)";
+
+    // The special values follow the spec of the pow() function defined in
+    // IEEE-754, Section 9.2.1. The order of the `if` statements matters.
+    if (abs(y) == T(0))
+    {
+        // pow(x, +/-0)
+        return T(1);
+    }
+
+    if (x == T(-1))
+    {
+        // pow(+/-0, y)
+        if ((boost::math::isfinite)(y) && y < 0)
+        {
+            return boost::math::policies::raise_domain_error<T>(function, "Division by 0", x, pol);
+        }
+
+        // Gets correct special handling
+        return pow(T(0), y);
+    }
+
+    if (x == T(-2) && (boost::math::isinf)(y))
+    {
+        // pow(-1, +/-inf)
+        return T(1);
+    }
+
+    if (x == T(0))
+    {
+        // pow(+1, y)
+        return T(1);
+    }
+
+    if ((boost::math::isinf)(y))
+    {
+        if ((boost::math::signbit)(y) == 1)
+        {
+            // pow(x, -inf)
+            return T(0);
+        }
+        else
+        {
+            // pow(x, +inf)
+            return y;
+        }
+    }
+
+    if ((boost::math::isinf)(x))
+    {
+        // pow(+/-inf, y)
+        return pow(x, y);
+    }
+
+    // Up to this point, (1+x) = {+/-0, +/-1, +/-inf} have been handled,
+    // and y = {+/-0, +/-inf} have been handled. Next, we handle `nan`
+    // and y = {+/-1}.
+    if ((boost::math::isnan)(x))
+    {
+        return x;
+    }
+    else if ((boost::math::isnan)(y))
+    {
+        return y;
+    }
+
+    if (y == T(1))
+    {
+        return T(1) + x;
+    }
+    if (y == T(-1))
+    {
+        return T(1) / (T(1) + x); // guaranteed no overflow
+    }
+
+    // Handle (1+x) < 0
+    if (x < -1)
+    {
+        // TODO(fancidev): maybe use (1+x)^y == [(1+x)^2]^(y/2)
+        return pow(1 + x, y);
+    }
+
+    // Now x, y are finite and not equal to 0 or +/-1, and x > -1.
+    // To compute (1+x)^y, write (1+x) == (s+t) where s is equal to (1+x)
+    // rounded toward 1, and t is the (exact) rounding error.
+    T s, t;
+    if (x < 0)
+    {
+        s = T(1) + x;
+        t = x - (s - T(1));
+        if (t > 0)
+        {
+            #ifdef BOOST_MATH_HAS_GPU_SUPPORT
+            s = ::nextafter(s, T(1));
+            #else
+            s = boost::math::nextafter(s, T(1));
+            #endif
+
+            t = x - (s - T(1));
+        }
+    }
+    else if (x < 1)
+    {
+        s = T(1) + x;
+        t = x - (s - T(1));
+        if (t < 0)
+        {
+            #ifdef BOOST_MATH_HAS_GPU_SUPPORT
+            s = ::nextafter(s, T(0));
+            #else
+            s = boost::math::nextafter(s, T(0));
+            #endif
+
+            t = x - (s - T(1));
+        }
+    }
+    else
+    {
+        s = x + T(1);
+        t = T(1) - (s - x);
+        if (t < 0)
+        {
+            #ifdef BOOST_MATH_HAS_GPU_SUPPORT
+            s = ::nextafter(s, T(0));
+            #else
+            s = boost::math::nextafter(s, T(0));
+            #endif
+
+            t = T(1) - (s - x);
+        }
+    }
+
+    // Because x > -1 and s is rounded toward 1, s is guaranteed to be > 0.
+    // Write (1+x)^y == (s+t)^y == (s^y)*((1+t/s)^y) == term1*term2.
+    // It can be shown that either both terms <= 1 or both >= 1; so
+    // if the first term over/underflows, then the result over/underflows.
+    // And of course, term2 == 1 if t == 0.
+    T term1 = pow(s, y);
+    if (t == T(0) || term1 == T(0) || (boost::math::isinf)(term1))
+    {
+        return term1;
+    }
+
+    // (1+t/s)^y == exp(y*log(1+t/s)). The relative error of the result is
+    // equal to the absolute error of the exponent (plus the relative error
+    // of 'exp'). Therefore, when the exponent is small, it is accurately
+    // evaluated to machine epsilon using T arithmetic. In addition, since
+    // t/s <= epsilon, log(1+t/s) is well approximated by t/s to first order.
+    T u = t / s;
+    T w = y * u;
+    if (abs(w) <= T(0.5))
+    {
+        T term2 = exp(w);
+        return term1 * term2;
+    }
+
+    // Now y*log(1+t/s) is large, and its relative error is "magnified" by
+    // the exponent. To reduce the error, we use double-T arithmetic, and
+    // expand log(1+t/s) to second order.
+
+    // (u + uu) ~= t/s.
+    T r1 = local_fma(-u, s, t);
+    T uu = r1 / s;
+
+    // (u + vv) ~= log(1+(u+uu)) ~= log(1+t/s).
+    T vv = local_fma(T(-0.5) * u, u, uu);
+
+    // (w + ww) ~= y*(u+vv) ~= y*log(1+t/s).
+    T r2 = local_fma(y, u, -w);
+    T ww = local_fma(y, vv, r2);
+
+    // TODO: maybe ww is small enough such that exp(ww) ~= 1+ww.
+    T term2 = exp(w) * exp(ww);
+    return term1 * term2;
+}
+
+} // namespace detail
+
+template <typename T1, typename T2, typename Policy>
+BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+pow1p(const T1 x, const T2 y, const Policy& pol)
+{
+    using result_type = tools::promote_args_t<T1, T2>;
+    return detail::pow1p_imp(static_cast<result_type>(x), static_cast<result_type>(y), pol);
+}
+
+template <typename T1, typename T2>
+BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+pow1p(const T1 x, const T2 y)
+{
+    using result_type = tools::promote_args_t<T1, T2>;
+    return detail::pow1p_imp(static_cast<result_type>(x), static_cast<result_type>(y), policies::policy<>());
+}
+
+} // namespace math
+} // namespace boost
+
+#endif // BOOST_MATH_SF_POW1P_HPP
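The core of pow1p_imp above is the exact splitting 1 + x == s + t followed by the factorization (s + t)^y == s^y * (1 + t/s)^y. The following standalone sketch (illustrative only, not part of the patch) shows that idea in bare double arithmetic; it deliberately omits the nextafter adjustment and the second-order double-T correction that the real implementation adds for large exponents.

    #include <cmath>
    #include <iostream>

    // Split 1 + x into s (the rounded sum) and t (the exact rounding error),
    // then evaluate (s + t)^y as s^y * exp(y * t/s). For |x| < 1 the error
    // term t = x - (s - 1) is exact (a Fast2Sum step with operands 1 and x).
    double pow1p_sketch(double x, double y)
    {
        const double s = 1.0 + x;
        const double t = x - (s - 1.0);
        const double term1 = std::pow(s, y);
        return term1 * std::exp(y * (t / s));
    }

    int main()
    {
        const double x = 3e-17, y = 1e17;
        std::cout << pow1p_sketch(x, y) << '\n';    // ~20.0855, i.e. exp(3)
        std::cout << std::pow(1.0 + x, y) << '\n';  // 1: x was lost in 1.0 + x
    }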
diff --git a/test/Jamfile.v2 b/test/Jamfile.v2
index 85ddf38f9b..18018d63af 100644
--- a/test/Jamfile.v2
+++ b/test/Jamfile.v2
@@ -128,6 +128,7 @@ test-suite special_fun :
    [ run hypot_test.cpp test_instances//test_instances pch_light ../../test/build//boost_unit_test_framework ]
    [ run pow_test.cpp ../../test/build//boost_unit_test_framework ]
+   [ run test_pow1p.cpp ]
    [ run logaddexp_test.cpp ../../test/build//boost_unit_test_framework ]
    [ run logsumexp_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx11_hdr_initializer_list cxx11_variadic_templates ] ]
    [ run ccmath_sqrt_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr ] [ check-target-builds ../config//has_float128 "GCC libquadmath and __float128 support" : <define>BOOST_MATH_TEST_FLOAT128 <linkflags>"-Bstatic -lquadmath -Bdynamic" ] ]
diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 64fac76fba..2829aa19ee 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -171,3 +171,5 @@ run test_trigamma_float.cu ;
 run test_trunc_double.cu ;
 run test_trunc_float.cu ;
+run test_pow1p_double.cu ;
+run test_pow1p_float.cu ;
diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index de235822ef..d636fc67be 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -168,3 +168,4 @@ run test_trigamma_nvrtc_double.cpp ;
 run test_trigamma_nvrtc_float.cpp ;
 run test_trunc_nvrtc_double.cpp ;
+run test_pow1p_nvrtc_double.cpp ;
diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index 97c48474ca..3dd7d054af 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -38,3 +38,4 @@ run test_digamma_simple.cpp ;
 run test_trigamma.cpp ;
 run test_erf.cpp ;
 run test_gamma.cpp ;
+run test_pow1p.cpp ;
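One note on the header above: on host builds local_fma picks a fused fma when one is callable for T and otherwise falls back to x * y + z, using the detection idiom. A minimal self-contained sketch of that dispatch follows (illustrative only; the stand-in detector plays the role of boost::math::tools::is_detected, and my_real is a hypothetical type with no fma overload).

    #include <cmath>
    #include <iostream>
    #include <type_traits>
    #include <utility>

    using std::fma;  // make the std overloads visible to the detector below

    // Stand-in for boost::math::tools::is_detected (the usual detection idiom).
    template <typename, template <typename...> class Op, typename... Args>
    struct detector : std::false_type {};
    template <template <typename...> class Op, typename... Args>
    struct detector<std::void_t<Op<Args...>>, Op, Args...> : std::true_type {};
    template <template <typename...> class Op, typename... Args>
    using is_detected = detector<void, Op, Args...>;

    // Well formed only if an unqualified fma(T, T, T) call exists.
    template <typename T>
    using fma_t = decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()));

    template <typename T, std::enable_if_t<is_detected<fma_t, T>::value, bool> = true>
    T local_fma(T x, T y, T z) { return fma(x, y, z); }  // fused, rounds once

    template <typename T, std::enable_if_t<!is_detected<fma_t, T>::value, bool> = true>
    T local_fma(T x, T y, T z) { return x * y + z; }     // fallback, rounds twice

    struct my_real { double v; };  // hypothetical type without an fma overload
    my_real operator*(my_real a, my_real b) { return {a.v * b.v}; }
    my_real operator+(my_real a, my_real b) { return {a.v + b.v}; }

    int main()
    {
        std::cout << local_fma(2.0, 3.0, 1.0) << '\n';                        // 7, fused path
        std::cout << local_fma(my_real{2}, my_real{3}, my_real{1}).v << '\n'; // 7, fallback
    }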
diff --git a/test/test_pow1p.cpp b/test/test_pow1p.cpp
new file mode 100644
index 0000000000..9b5ba7c8e7
--- /dev/null
+++ b/test/test_pow1p.cpp
@@ -0,0 +1,122 @@
+// (C) Copyright Matt Borland 2024.
+// (C) Copyright Fancidev 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/pow1p.hpp>
+#include <cmath>
+#include <random>
+#include <stdexcept>
+
+#if __has_include(<stdfloat>) && !defined(BOOST_MATH_HAS_GPU_SUPPORT)
+#  include <stdfloat>
+#endif
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
+#  include <boost/math/concepts/real_concept.hpp>
+#endif
+
+#include "math_unit_test.hpp"
+
+template <typename T>
+void test()
+{
+    using std::pow;
+
+    // First we hit all the special cases
+    // pow(x, +/-0)
+    CHECK_EQUAL(boost::math::pow1p(T(1), T(0)), T(1));
+
+    // pow(0, y)
+    #ifndef BOOST_MATH_NO_EXCEPTIONS
+    CHECK_THROW(boost::math::pow1p(T(-1), T(-1)), std::domain_error);
+    #endif
+    CHECK_EQUAL(boost::math::pow1p(T(-1), T(1)), T(0));
+
+    // pow(-1, inf)
+    CHECK_EQUAL(boost::math::pow1p(T(-2), boost::math::numeric_limits<T>::infinity()), T(1));
+
+    // pow(1, y)
+    CHECK_EQUAL(boost::math::pow1p(T(0), T(2)), T(1));
+
+    // pow(x, +/-inf)
+    BOOST_MATH_IF_CONSTEXPR (boost::math::numeric_limits<T>::has_infinity)
+    {
+        CHECK_EQUAL(boost::math::pow1p(T(5), -boost::math::numeric_limits<T>::infinity()), T(0));
+        CHECK_EQUAL(boost::math::pow1p(T(5), boost::math::numeric_limits<T>::infinity()), boost::math::numeric_limits<T>::infinity());
+
+        // pow(+/-inf, y)
+        CHECK_EQUAL(boost::math::pow1p(boost::math::numeric_limits<T>::infinity(), T(2)), boost::math::numeric_limits<T>::infinity());
+        CHECK_EQUAL(boost::math::pow1p(-boost::math::numeric_limits<T>::infinity(), T(2)), boost::math::numeric_limits<T>::infinity());
+    }
+
+    // NANs for x and y
+    BOOST_MATH_IF_CONSTEXPR (boost::math::numeric_limits<T>::has_quiet_NaN)
+    {
+        CHECK_EQUAL(boost::math::isnan(boost::math::pow1p(boost::math::numeric_limits<T>::quiet_NaN(), T(1))), true);
+        CHECK_EQUAL(boost::math::isnan(boost::math::pow1p(T(1), boost::math::numeric_limits<T>::quiet_NaN())), true);
+    }
+
+    // pow(x, +/-1)
+    CHECK_ULP_CLOSE(boost::math::pow1p(T(2), T(1)), pow(T(3), T(1)), 10);
+    CHECK_ULP_CLOSE(boost::math::pow1p(T(2), T(-1)), pow(T(3), T(-1)), 10);
+
+    // (1+x) < 0
+    CHECK_ULP_CLOSE(boost::math::pow1p(T(-3), T(2)), pow(T(-2), T(2)), 10);
+
+    // x < 0
+    std::mt19937_64 rng;
+    std::uniform_real_distribution<double> dist(-1, 0);
+    std::uniform_real_distribution<double> dist_y(0, 10);
+    constexpr int N = 1024;
+    for (int i = 0; i < N; ++i)
+    {
+        const auto x = static_cast<T>(dist(rng));
+        const auto y = static_cast<T>(dist_y(rng));
+
+        CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100);
+    }
+
+    // 0 < x < 1
+    std::uniform_real_distribution<double> dist_x_1(0, 1);
+    for (int i = 0; i < N; ++i)
+    {
+        const auto x = static_cast<T>(dist_x_1(rng));
+        const auto y = static_cast<T>(dist_y(rng));
+
+        CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100);
+    }
+
+    // Else
+    std::uniform_real_distribution<double> dist_other_x(1, 1000);
+    for (int i = 0; i < N; ++i)
+    {
+        const auto x = static_cast<T>(dist_other_x(rng));
+        const auto y = static_cast<T>(dist_y(rng));
+
+        CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100);
+    }
+}
+
+int main()
+{
+    #ifdef __STDCPP_FLOAT32_T__
+    test<std::float32_t>();
+    #else
+    test<float>();
+    #endif
+
+    #ifdef __STDCPP_FLOAT64_T__
+    test<std::float64_t>();
+    #else
+    test<double>();
+    #endif
+
+    #ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS
+    test<long double>();
+    #endif
+
+    #ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
+    test<boost::math::concepts::real_concept>();
+    #endif
+
+    return boost::math::test::report_errors();
+}
diff --git a/test/test_pow1p_double.cu b/test/test_pow1p_double.cu
new file mode 100644
index 0000000000..c7746bd3e7
--- /dev/null
+++ b/test/test_pow1p_double.cu
@@ -0,0 +1,111 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/pow1p.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <random>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    std::mt19937_64 rng;
+    std::uniform_real_distribution<float_type> x_vals(-1, 1);
+    std::uniform_real_distribution<float_type> y_vals(-10, 10);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = x_vals(rng);
+        input_vector2[i] = y_vals(rng);
+    }
+
+    // Launch the pow1p CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::math::pow1p(input_vector1[i], input_vector2[i]));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_pow1p_float.cu b/test/test_pow1p_float.cu
new file mode 100644
index 0000000000..0ecdf0606c
--- /dev/null
+++ b/test/test_pow1p_float.cu
@@ -0,0 +1,111 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/pow1p.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <random>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    std::mt19937_64 rng;
+    std::uniform_real_distribution<float_type> x_vals(-1, 1);
+    std::uniform_real_distribution<float_type> y_vals(-10, 10);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = x_vals(rng);
+        input_vector2[i] = y_vals(rng);
+    }
+
+    // Launch the pow1p CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::math::pow1p(input_vector1[i], input_vector2[i]));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_pow1p_nvrtc_double.cpp b/test/test_pow1p_nvrtc_double.cpp
new file mode 100644
index 0000000000..d9f97d94c0
--- /dev/null
+++ b/test/test_pow1p_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/pow1p.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <cmath>
+#include <cstdlib>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/pow1p.hpp>
+extern "C" __global__
+void test_trunc_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trunc_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_trunc_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_trunc_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> x_vals(-1, 1);
+        std::uniform_real_distribution<float_type> y_vals(-10, 10);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(x_vals(rng));
+            h_in2[i] = static_cast<float_type>(y_vals(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::pow1p(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_pow1p_nvrtc_float.cpp b/test/test_pow1p_nvrtc_float.cpp
new file mode 100644
index 0000000000..d9f97d94c0
--- /dev/null
+++ b/test/test_pow1p_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/pow1p.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <cmath>
+#include <cstdlib>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/pow1p.hpp>
+extern "C" __global__
+void test_trunc_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trunc_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_trunc_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_trunc_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> x_vals(-1, 1);
+        std::uniform_real_distribution<float_type> y_vals(-10, 10);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(x_vals(rng));
+            h_in2[i] = static_cast<float_type>(y_vals(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::pow1p(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}