Extracted Segmented Reduce kernels into NVRTC compilable header (#3727)

oleksandr-pavlyk · web-flow · commit a067839b1d66 · 2025-02-07T14:27:28.000-05:00
* Moved definition of DeviceSegmentedReduceKernel out to NVRTC compilable header file

Moved the definition from cub/device/dispatch/dispatch_reduce.cuh to
new file cub/device/dispatch/kernels/segmented_reduce.cuh

Added compilation of the new file to cub/test/catch2_test_nvrtc.cu

Needed to remove the friend operator<< overload for std::ostream from
arg_index_input_iterator.cuh, since this header is needed by
DeviceSegmentedReduceKernel implementation, and use of std::ostream
breaks compilation by nvrtc

* Include cub/util_type.cuh for value_t and non_void_value_t

* Include cub/detail/type_traits.cuh

* Restore friend operator ostream conditionally for nvcc/host compiler
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -44,19 +44,18 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cub/agent/agent_reduce.cuh>
 #include <cub/detail/launcher/cuda_runtime.cuh>
+#include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
 #include <cub/device/dispatch/kernels/reduce.cuh>
+#include <cub/device/dispatch/kernels/segmented_reduce.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
 #include <cub/grid/grid_even_share.cuh>
-#include <cub/iterator/arg_index_input_iterator.cuh>
 #include <cub/thread/thread_operators.cuh>
 #include <cub/thread/thread_store.cuh>
 #include <cub/util_debug.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_temporary_storage.cuh>
-
-#include <iterator>
+#include <cub/util_type.cuh> // for cub::detail::non_void_value_t, cub::detail::value_t
 
 _CCCL_SUPPRESS_DEPRECATED_PUSH
 #include <cuda/std/functional>
@@ -69,131 +68,6 @@ CUB_NAMESPACE_BEGIN
 namespace detail::reduce
 {
 
-/// Normalize input iterator to segment offset
-template <typename T, typename OffsetT, typename IteratorT>
-_CCCL_DEVICE _CCCL_FORCEINLINE void NormalizeReductionOutput(T& /*val*/, OffsetT /*base_offset*/, IteratorT /*itr*/)
-{}
-
-/// Normalize input iterator to segment offset (specialized for arg-index)
-template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
-_CCCL_DEVICE _CCCL_FORCEINLINE void NormalizeReductionOutput(
-  KeyValuePairT& val, OffsetT base_offset, ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
-{
-  val.key -= base_offset;
-}
-
-/**
- * Segmented reduction (one block per segment)
- * @tparam ChainedPolicyT
- *   Chained tuning policy
- *
- * @tparam InputIteratorT
- *   Random-access input iterator type for reading input items @iterator
- *
- * @tparam OutputIteratorT
- *   Output iterator type for recording the reduced aggregate @iterator
- *
- * @tparam BeginOffsetIteratorT
- *   Random-access input iterator type for reading segment beginning offsets
- *   @iterator
- *
- * @tparam EndOffsetIteratorT
- *   Random-access input iterator type for reading segment ending offsets
- *   @iterator
- *
- * @tparam OffsetT
- *   Signed integer type for global offsets
- *
- * @tparam ReductionOpT
- *   Binary reduction functor type having member
- *   `T operator()(const T &a, const U &b)`
- *
- * @tparam InitT
- *   Initial value type
- *
- * @param[in] d_in
- *   Pointer to the input sequence of data items
- *
- * @param[out] d_out
- *   Pointer to the output aggregate
- *
- * @param[in] d_begin_offsets
- *   Random-access input iterator to the sequence of beginning offsets of
- *   length `num_segments`, such that `d_begin_offsets[i]` is the first element
- *   of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
- *
- * @param[in] d_end_offsets
- *   Random-access input iterator to the sequence of ending offsets of length
- *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
- *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
- *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is
- *   considered empty.
- *
- * @param[in] num_segments
- *   The number of segments that comprise the sorting data
- *
- * @param[in] reduction_op
- *   Binary reduction functor
- *
- * @param[in] init
- *   The initial value of the reduction
- */
-template <typename ChainedPolicyT,
-          typename InputIteratorT,
-          typename OutputIteratorT,
-          typename BeginOffsetIteratorT,
-          typename EndOffsetIteratorT,
-          typename OffsetT,
-          typename ReductionOpT,
-          typename InitT,
-          typename AccumT>
-CUB_DETAIL_KERNEL_ATTRIBUTES
-__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceSegmentedReduceKernel(
-  InputIteratorT d_in,
-  OutputIteratorT d_out,
-  BeginOffsetIteratorT d_begin_offsets,
-  EndOffsetIteratorT d_end_offsets,
-  int /*num_segments*/,
-  ReductionOpT reduction_op,
-  InitT init)
-{
-  // Thread block type for reducing input tiles
-  using AgentReduceT =
-    AgentReduce<typename ChainedPolicyT::ActivePolicy::ReducePolicy,
-                InputIteratorT,
-                OutputIteratorT,
-                OffsetT,
-                ReductionOpT,
-                AccumT>;
-
-  // Shared memory storage
-  __shared__ typename AgentReduceT::TempStorage temp_storage;
-
-  OffsetT segment_begin = d_begin_offsets[blockIdx.x];
-  OffsetT segment_end   = d_end_offsets[blockIdx.x];
-
-  // Check if empty problem
-  if (segment_begin == segment_end)
-  {
-    if (threadIdx.x == 0)
-    {
-      *(d_out + blockIdx.x) = init;
-    }
-    return;
-  }
-
-  // Consume input tiles
-  AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(segment_begin, segment_end);
-
-  // Normalize as needed
-  NormalizeReductionOutput(block_aggregate, segment_begin, d_in);
-
-  if (threadIdx.x == 0)
-  {
-    finalize_and_store_aggregate(d_out + blockIdx.x, reduction_op, init, block_aggregate);
-  }
-}
-
 template <typename MaxPolicyT,
           typename InputIteratorT,
           typename OutputIteratorT,
diff --git a/cub/cub/device/dispatch/kernels/segmented_reduce.cuh b/cub/cub/device/dispatch/kernels/segmented_reduce.cuh
@@ -0,0 +1,178 @@
+/******************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/agent/agent_reduce.cuh>
+#include <cub/iterator/arg_index_input_iterator.cuh>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail
+{
+namespace reduce
+{
+
+/// Normalize the reduction output to the segment offset (generic overload: intentionally a no-op)
+template <typename T, typename OffsetT, typename IteratorT>
+_CCCL_DEVICE _CCCL_FORCEINLINE void NormalizeReductionOutput(T& /*val*/, OffsetT /*base_offset*/, IteratorT /*itr*/)
+{}
+
+/// Normalize the reduction output to the segment offset (arg-index overload: subtracts `base_offset` from `val.key`)
+template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
+_CCCL_DEVICE _CCCL_FORCEINLINE void NormalizeReductionOutput(
+  KeyValuePairT& val, OffsetT base_offset, ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
+{
+  val.key -= base_offset;
+}
+
+/**
+ * Segmented reduction (one block per segment)
+ * @tparam ChainedPolicyT
+ *   Chained tuning policy
+ *
+ * @tparam InputIteratorT
+ *   Random-access input iterator type for reading input items @iterator
+ *
+ * @tparam OutputIteratorT
+ *   Output iterator type for recording the reduced aggregate @iterator
+ *
+ * @tparam BeginOffsetIteratorT
+ *   Random-access input iterator type for reading segment beginning offsets
+ *   @iterator
+ *
+ * @tparam EndOffsetIteratorT
+ *   Random-access input iterator type for reading segment ending offsets
+ *   @iterator
+ *
+ * @tparam OffsetT
+ *   Signed integer type for global offsets
+ *
+ * @tparam ReductionOpT
+ *   Binary reduction functor type having member
+ *   `T operator()(const T &a, const U &b)`
+ *
+ * @tparam InitT
+ *   Initial value type
+ *
+ * @param[in] d_in
+ *   Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ *   Pointer to the output aggregate
+ *
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first element
+ *   of the *i*<sup>th</sup> data segment in `d_in`
+ *
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*<sup>th</sup> data segment in `d_in`.
+ *   If `d_end_offsets[i] == d_begin_offsets[i]`, the *i*<sup>th</sup> segment is
+ *   considered empty and `init` is written as its result.
+ *
+ * @param[in] num_segments
+ *   The number of segments that comprise the input data (unused by this kernel)
+ *
+ * @param[in] reduction_op
+ *   Binary reduction functor
+ *
+ * @param[in] init
+ *   The initial value of the reduction
+ */
+template <typename ChainedPolicyT,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename OffsetT,
+          typename ReductionOpT,
+          typename InitT,
+          typename AccumT>
+CUB_DETAIL_KERNEL_ATTRIBUTES
+__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceSegmentedReduceKernel(
+  InputIteratorT d_in,
+  OutputIteratorT d_out,
+  BeginOffsetIteratorT d_begin_offsets,
+  EndOffsetIteratorT d_end_offsets,
+  int /*num_segments*/,
+  ReductionOpT reduction_op,
+  InitT init)
+{
+  // Block-wide reduction agent, parameterized by the active tuning policy's ReducePolicy
+  using AgentReduceT =
+    AgentReduce<typename ChainedPolicyT::ActivePolicy::ReducePolicy,
+                InputIteratorT,
+                OutputIteratorT,
+                OffsetT,
+                ReductionOpT,
+                AccumT>;
+
+  // Shared memory storage used by the reduction agent
+  __shared__ typename AgentReduceT::TempStorage temp_storage;
+
+  OffsetT segment_begin = d_begin_offsets[blockIdx.x];
+  OffsetT segment_end   = d_end_offsets[blockIdx.x];
+
+  // Empty segment (begin == end): thread 0 stores `init` as this segment's result and the block exits
+  if (segment_begin == segment_end)
+  {
+    if (threadIdx.x == 0)
+    {
+      *(d_out + blockIdx.x) = init;
+    }
+    return;
+  }
+
+  // Reduce this block's segment of `d_in`, delimited by segment_begin and segment_end
+  AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(segment_begin, segment_end);
+
+  // For ArgIndexInputIterator inputs, rebase the result's key to the segment start; no-op otherwise
+  NormalizeReductionOutput(block_aggregate, segment_begin, d_in);
+
+  if (threadIdx.x == 0)
+  {
+    finalize_and_store_aggregate(d_out + blockIdx.x, reduction_op, init, block_aggregate);
+  }
+}
+
+} // namespace reduce
+} // namespace detail
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/iterator/arg_index_input_iterator.cuh b/cub/cub/iterator/arg_index_input_iterator.cuh
@@ -43,14 +43,14 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cub/thread/thread_load.cuh>
-#include <cub/thread/thread_store.cuh>
+#include <cub/util_type.cuh>
 
 #include <thrust/iterator/iterator_facade.h>
 #include <thrust/iterator/iterator_traits.h>
-#include <thrust/version.h>
 
-#include <ostream>
+#if !_CCCL_COMPILER(NVRTC)
+#  include <ostream>
+#endif // !_CCCL_COMPILER(NVRTC)
 
 CUB_NAMESPACE_BEGIN
 
@@ -246,11 +246,12 @@ public:
     offset = 0;
   }
 
-  /// ostream operator
+#if !_CCCL_COMPILER(NVRTC)
   friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
   {
     return os;
   }
+#endif // !_CCCL_COMPILER(NVRTC)
 };
 
 CUB_NAMESPACE_END
diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu
@@ -59,6 +59,7 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
     #include <cub/device/dispatch/kernels/for_each.cuh>
     #include <cub/device/dispatch/kernels/scan.cuh>
     #include <cub/device/dispatch/kernels/merge_sort.cuh>
+    #include <cub/device/dispatch/kernels/segmented_reduce.cuh>
 
     #include <thrust/iterator/constant_iterator.h>
     #include <thrust/iterator/counting_iterator.h>