diff --git a/c++/mpi/array.hpp b/c++/mpi/array.hpp index af6a3f69..cd4a7b5a 100644 --- a/c++/mpi/array.hpp +++ b/c++/mpi/array.hpp @@ -16,7 +16,7 @@ /** * @file - * @brief Provides an MPI broadcast, reduce, scatter and gather for std::vector. + * @brief Provides an MPI broadcast and reduce for `std::array`. */ #pragma once @@ -29,6 +29,8 @@ #include #include +#include +#include namespace mpi { @@ -38,55 +40,65 @@ namespace mpi { */ /** - * @brief Implementation of an MPI broadcast for a std::arr. + * @brief Implementation of an MPI broadcast for a `std::array`. * - * @details It simply calls mpi::broadcast_range with the input array. + * @details It calls mpi::broadcast_range with the given array. * * @tparam T Value type of the array. * @tparam N Size of the array. - * @param arr std::array to broadcast. + * @param arr `std::array` to broadcast (into). * @param c mpi::communicator. * @param root Rank of the root process. */ template void mpi_broadcast(std::array &arr, communicator c = {}, int root = 0) { broadcast_range(arr, c, root); } /** - * @brief Implementation of an in-place MPI reduce for a std::array. + * @brief Implementation of an MPI reduce for a `std::array`. * - * @details It simply calls mpi::reduce_in_place_range with the given input array. + * @details It constructs the output array with its value type equal to the return type of `reduce(std::declval())` + * and calls mpi::reduce_range with the input and constructed output array. + * + * Note that the output array will always have the same size as the input array, no matter if the rank receives the + * reduced data or not. * * @tparam T Value type of the array. * @tparam N Size of the array. - * @param arr std::array to reduce. + * @param arr `std::array` to reduce. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. 
+ * @return `std::array` containing the result of the reduction. */ template - void mpi_reduce_in_place(std::array &arr, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - reduce_in_place_range(arr, c, root, all, op); + auto mpi_reduce(std::array const &arr, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { + using value_t = std::remove_cvref_t()))>; + std::array res{}; + reduce_range(arr, res, c, root, all, op); + return res; } /** - * @brief Implementation of an MPI reduce for a std::array. + * @brief Implementation of an MPI reduce for a `std::array` that reduces directly into an existing output array. * - * @details It simply calls mpi::reduce_range with the given input array and an empty array of the same size. + * @details It calls mpi::reduce_range with the input and output array. The output array must be the same size as the + * input array on receiving ranks. * - * @tparam T Value type of the array. - * @tparam N Size of the array. - * @param arr std::array to reduce. + * @tparam T1 Value type of the array to be reduced. + * @tparam N1 Size of the array to be reduced. + * @tparam T2 Value type of the array to be reduced into. + * @tparam N2 Size of the array to be reduced into. + * @param arr_in `std::array` to reduce. + * @param arr_out `std::array` to reduce into. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. - * @return std::array containing the result of each individual reduction. 
*/ - template - auto mpi_reduce(std::array const &arr, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - std::array, N> res{}; - reduce_range(arr, res, c, root, all, op); - return res; + template + void mpi_reduce_into(std::array const &arr_in, std::array &arr_out, communicator c = {}, int root = 0, bool all = false, + MPI_Op op = MPI_SUM) { + reduce_range(arr_in, arr_out, c, root, all, op); } /** @} */ diff --git a/c++/mpi/chunk.hpp b/c++/mpi/chunk.hpp index 2c275646..85466cd1 100644 --- a/c++/mpi/chunk.hpp +++ b/c++/mpi/chunk.hpp @@ -22,11 +22,11 @@ #pragma once #include "./communicator.hpp" +#include "./macros.hpp" #include #include -#include #include namespace mpi { @@ -39,7 +39,7 @@ namespace mpi { * @details The optional parameter `min_size` can be used to first divide the range into equal parts of size * `min_size` before distributing them as evenly as possible across the number of specified subranges. * - * It throws an exception if `min_size < 1` or if it is not a divisor of `end`. + * It is expected that `min_size > 0` and that `min_size` is a divisor of `end`. * * @param end End of the integer range `[0, end)`. * @param nranges Number of subranges. @@ -48,7 +48,7 @@ namespace mpi { * @return Length of the ith subrange. 
*/ [[nodiscard]] inline long chunk_length(long end, int nranges, int i, long min_size = 1) { - if (min_size < 1 || end % min_size != 0) throw std::runtime_error("Error in mpi::chunk_length: min_size must be a divisor of end"); + EXPECTS_WITH_MESSAGE(min_size > 0 && end % min_size == 0, "Error in mpi::chunk_length: min_size must be a divisor of end"); auto [node_begin, node_end] = itertools::chunk_range(0, end / min_size, nranges, i); return (node_end - node_begin) * min_size; } diff --git a/c++/mpi/datatypes.hpp b/c++/mpi/datatypes.hpp index 1857d752..69f3bba7 100644 --- a/c++/mpi/datatypes.hpp +++ b/c++/mpi/datatypes.hpp @@ -91,8 +91,9 @@ namespace mpi { template struct mpi_type : mpi_type {}; /** - * @brief Type trait to check if a type T has a corresponding MPI datatype, i.e. if mpi::mpi_type has been specialized. - * @tparam T Type to be checked. + * @brief Type trait to check if a type `T` has a corresponding MPI datatype, i.e. if mpi::mpi_type has been + * specialized. + * @tparam `T` Type to be checked. */ template constexpr bool has_mpi_type = false; diff --git a/c++/mpi/generic_communication.hpp b/c++/mpi/generic_communication.hpp index 244d19e1..e0d613c7 100644 --- a/c++/mpi/generic_communication.hpp +++ b/c++/mpi/generic_communication.hpp @@ -24,66 +24,59 @@ #pragma once +#include "./communicator.hpp" #include "./datatypes.hpp" -#include "./lazy.hpp" +#include "./macros.hpp" #include "./utils.hpp" #include +#include +#include +#include #include -#include #include namespace mpi { + /** + * @ingroup utilities + * @brief A concept that checks if a range type is contiguous and sized and has an MPI compatible value type. + * @tparam R Range type. + */ + template + concept MPICompatibleRange = std::ranges::contiguous_range && std::ranges::sized_range && has_mpi_type>; + /** * @addtogroup coll_comm * @{ */ - namespace detail { - - // Type trait to check if a type is a std::vector. 
- template inline constexpr bool is_std_vector = false; - - // Spezialization of is_std_vector for std::vector. - template inline constexpr bool is_std_vector> = true; - - // Convert an object of type V to an object of type T. - template T convert(V v) { - if constexpr (is_std_vector) { - T res; - res.reserve(v.size()); - for (auto &x : v) res.emplace_back(convert(std::move(x))); - return res; - } else - return T{std::move(v)}; - } - - } // namespace detail - /** * @brief Generic MPI broadcast. * - * @details If mpi::has_env is true, this function calls the specialized `mpi_broadcast` function for the given - * object, otherwise it does nothing. + * @details It calls the specialized `mpi_broadcast` function. + * + * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the + * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library. * * @tparam T Type to be broadcasted. - * @param x Object to be broadcasted. + * @param x Object to be broadcasted (into). * @param c mpi::communicator. * @param root Rank of the root process. */ - template [[gnu::always_inline]] void broadcast(T &&x, communicator c = {}, int root = 0) { - static_assert(not std::is_const_v, "mpi::broadcast cannot be called on const objects"); - if (has_env) mpi_broadcast(std::forward(x), c, root); + template [[gnu::always_inline]] void broadcast(T &&x, communicator c = {}, int root = 0) { // NOLINT (forwarding is not needed) + mpi_broadcast(x, c, root); } /** * @brief Generic MPI reduce. * - * @details If mpi::has_env is true or if the return type of the specialized `mpi_reduce` is lazy, this function calls - * the specialized `mpi_reduce` function for the given object. Otherwise, it simply converts the input object to the - * output type `mpi_reduce` would return. + * @details If there is a specialized `mpi_reduce` for the given type, we call it. 
Otherwise, we call mpi::reduce_into + * with the given input object and a default constructed output object of type `T`. + * + * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the + * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library. * * @tparam T Type to be reduced. * @param x Object to be reduced. @@ -91,140 +84,253 @@ namespace mpi { * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. - * @return The result of the specialized `mpi_reduce` call. + * @return Result of the specialized `mpi_reduce` call. */ template - [[gnu::always_inline]] inline decltype(auto) reduce(T &&x, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - // return type of mpi_reduce - using r_t = decltype(mpi_reduce(std::forward(x), c, root, all, op)); - if constexpr (is_mpi_lazy) { - return mpi_reduce(std::forward(x), c, root, all, op); + [[gnu::always_inline]] decltype(auto) reduce(T &&x, communicator c = {}, int root = 0, bool all = false, // NOLINT (forwarding is not needed) + MPI_Op op = MPI_SUM) { + if constexpr (requires { mpi_reduce(x, c, root, all, op); }) { + return mpi_reduce(x, c, root, all, op); } else { - if (has_env) - return mpi_reduce(std::forward(x), c, root, all, op); - else - return detail::convert(std::forward(x)); + std::remove_cvref_t res; + reduce_into(x, res, c, root, all, op); + return res; } } /** - * @brief Generic in-place MPI reduce. + * @brief Generic in place MPI reduce. + * + * @details We call mpi::reduce_into with the given object as the input and output argument. * - * @details If mpi::has_env is true, this functions calls the specialized `mpi_reduce_in_place` function for the given - * object. Otherwise, it does nothing. + * @note We do not check if an MPI runtime environment is being used, i.e. 
if mpi::has_env is true. It is the + * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library. * * @tparam T Type to be reduced. - * @param x Object to be reduced. + * @param x Object to be reduced (into). * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. */ template - [[gnu::always_inline]] inline void reduce_in_place(T &&x, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - static_assert(not std::is_const_v, "In-place mpi functions cannot be called on const objects"); - if (has_env) mpi_reduce_in_place(std::forward(x), c, root, all, op); + [[gnu::always_inline]] void reduce_in_place(T &&x, communicator c = {}, int root = 0, bool all = false, // NOLINT (forwarding is not needed) + MPI_Op op = MPI_SUM) { + mpi_reduce_into(x, x, c, root, all, op); + } + + /** + * @brief Generic MPI reduce that reduces directly into an existing output object. + * + * @details It calls the specialized `mpi_reduce_into` function. + * + * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the + * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library. + * + * @tparam T1 Type to be reduced. + * @tparam T2 Type to be reduced into. + * @param x_in Object to be reduced. + * @param x_out Object to be reduced into. + * @param c mpi::communicator. + * @param root Rank of the root process. + * @param all Should all processes receive the result of the reduction. + * @param op `MPI_Op` used in the reduction. 
+ */ + template + [[gnu::always_inline]] void reduce_into(T1 &&x_in, T2 &&x_out, communicator c = {}, int root = 0, // NOLINT (forwarding is not needed) + bool all = false, MPI_Op op = MPI_SUM) { + mpi_reduce_into(x_in, x_out, c, root, all, op); } /** * @brief Generic MPI scatter. * - * @details If mpi::has_env is true or if the return type of the specialized `mpi_scatter` is lazy, this function - * calls the specialized `mpi_scatter` function for the given object. Otherwise, it simply converts the input object - * to the output type `mpi_scatter` would return. + * @details If there is a specialized `mpi_scatter` for the given type, we call it. Otherwise, we call + * mpi::scatter_into with the given input object and a default constructed output object of type `T`. + * + * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the + * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library. * * @tparam T Type to be scattered. * @param x Object to be scattered. * @param c mpi::communicator. * @param root Rank of the root process. - * @return The result of the specialized `mpi_scatter` call. + * @return Result of the specialized `mpi_scatter` call. 
*/ - template [[gnu::always_inline]] inline decltype(auto) scatter(T &&x, mpi::communicator c = {}, int root = 0) { - // return type of mpi_scatter - using r_t = decltype(mpi_scatter(std::forward(x), c, root)); - if constexpr (is_mpi_lazy) { - return mpi_scatter(std::forward(x), c, root); + template + [[gnu::always_inline]] decltype(auto) scatter(T &&x, mpi::communicator c = {}, int root = 0) { // NOLINT (forwarding is not needed) + if constexpr (requires { mpi_scatter(x, c, root); }) { + return mpi_scatter(x, c, root); } else { - if (has_env) - return mpi_scatter(std::forward(x), c, root); - else - return detail::convert(std::forward(x)); + std::remove_cvref_t res; + scatter_into(x, res, c, root); + return res; } } + /** + * @brief Generic MPI scatter that scatters directly into an existing output object. + * + * @details It calls the specialized `mpi_scatter_into` function. + * + * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the + * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library. + * + * @tparam T1 Type to be scattered. + * @tparam T2 Type to be scattered into. + * @param x_in Object to be scattered. + * @param x_out Object to be scattered into. + * @param c mpi::communicator. + * @param root Rank of the root process. + */ + template + [[gnu::always_inline]] void scatter_into(T1 &&x_in, T2 &&x_out, communicator c = {}, int root = 0) { // NOLINT (forwarding is not needed) + mpi_scatter_into(x_in, x_out, c, root); + } + /** * @brief Generic MPI gather. * - * @details If mpi::has_env is true or if the return type of the specialized `mpi_gather` is lazy, this function - * calls the specialized `mpi_gather` function for the given object. Otherwise, it simply converts the input object to - * the output type `mpi_gather` would return. + * @details If there is a specialized `mpi_gather` for the given type, we call it. 
Otherwise, we call mpi::gather_into + * with the given input object and a default constructed output object of type `T`. + * + * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the + * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library. * * @tparam T Type to be gathered. * @param x Object to be gathered. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the gather. - * @return The result of the specialized `mpi_gather` call. + * @return Result of the specialized `mpi_gather` call. */ - template [[gnu::always_inline]] inline decltype(auto) gather(T &&x, mpi::communicator c = {}, int root = 0, bool all = false) { - // return type of mpi_gather - using r_t = decltype(mpi_gather(std::forward(x), c, root, all)); - if constexpr (is_mpi_lazy) { - return mpi_gather(std::forward(x), c, root, all); + template + [[gnu::always_inline]] decltype(auto) gather(T &&x, communicator c = {}, int root = 0, bool all = false) { // NOLINT (forwarding is not needed) + if constexpr (requires { mpi_gather(x, c, root, all); }) { + return mpi_gather(x, c, root, all); } else { - if (has_env) - return mpi_gather(std::forward(x), c, root, all); - else - return detail::convert(std::forward(x)); + std::remove_cvref_t res; + gather_into(x, res, c, root, all); + return res; } } + /** + * @brief Generic MPI gather that gathers directly into an existing output object. + * + * @details It calls the specialized `mpi_gather_into` function. + * + * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the + * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library. + * + * @tparam T1 Type to be gathered. + * @tparam T2 Type to be gathered into. + * @param x_in Object to be gathered. 
+ * @param x_out Object to be gathered into. + * @param c mpi::communicator. + * @param root Rank of the root process. + * @param all Should all processes receive the result of the gather. + */ + template + [[gnu::always_inline]] void gather_into(T1 &&x_in, T2 &&x_out, communicator c = {}, int root = 0, // NOLINT (forwarding is not needed) + bool all = false) { + mpi_gather_into(x_in, x_out, c, root, all); + } + /** * @brief Generic MPI all-reduce. * @details It simply calls mpi::reduce with `all = true`. */ - template [[gnu::always_inline]] inline decltype(auto) all_reduce(T &&x, communicator c = {}, MPI_Op op = MPI_SUM) { - return reduce(std::forward(x), c, 0, true, op); + template + [[gnu::always_inline]] decltype(auto) all_reduce(T &&x, communicator c = {}, MPI_Op op = MPI_SUM) { // NOLINT (forwarding is not needed) + return reduce(x, c, 0, true, op); } /** - * @brief Generic MPI all-reduce in-place. + * @brief Generic MPI all-reduce in place. * @details It simply calls mpi::reduce_in_place with `all = true`. */ - template [[gnu::always_inline]] inline void all_reduce_in_place(T &&x, communicator c = {}, MPI_Op op = MPI_SUM) { - reduce_in_place(std::forward(x), c, 0, true, op); + template + [[gnu::always_inline]] void all_reduce_in_place(T &&x, communicator c = {}, MPI_Op op = MPI_SUM) { // NOLINT (forwarding is not needed) + reduce_in_place(x, c, 0, true, op); + } + + /** + * @brief Generic MPI all-reduce that reduces directly into an existing output object. + * @details It simply calls mpi::reduce_into with `all = true`. + */ + template + [[gnu::always_inline]] void all_reduce_into(T1 &&x_in, T2 &&x_out, communicator c = {}, MPI_Op op = MPI_SUM) { // NOLINT (forwarding is not needed) + return reduce_into(x_in, x_out, c, 0, true, op); } /** * @brief Generic MPI all-gather. * @details It simply calls mpi::gather with `all = true`. 
*/ - template [[gnu::always_inline]] inline decltype(auto) all_gather(T &&x, communicator c = {}) { - return gather(std::forward(x), c, 0, true); + template [[gnu::always_inline]] decltype(auto) all_gather(T &&x, communicator c = {}) { // NOLINT (forwarding is not needed) + return gather(x, c, 0, true); + } + + /** + * @brief Generic MPI all-gather that gathers directly into an existing output object. + * @details It simply calls mpi::gather_into with `all = true`. + */ + template + [[gnu::always_inline]] void all_gather_into(T1 &&x_in, T2 &&x_out, communicator c = {}) { // NOLINT (forwarding is not needed) + return gather_into(x_in, x_out, c, 0, true); + } + + /** + * @brief Checks if a given object is equal across all ranks in the given communicator. + * + * @details It makes two calls to mpi::all_reduce, one with `MPI_MIN` and the other with `MPI_MAX`, and compares their + * results. + * + * @note `MPI_MIN` and `MPI_MAX` need to make sense for the given type `T`. + * + * @tparam T Type to be checked. + * @param x Object to be equality compared. + * @param c mpi::communicator. + * @return If the given object is equal on all ranks, it returns true. Otherwise, it returns false. + */ + template bool all_equal(T const &x, communicator c = {}) { + if (!has_env || c.size() < 2) return true; + auto min_obj = all_reduce(x, c, MPI_MIN); + auto max_obj = all_reduce(x, c, MPI_MAX); + return min_obj == max_obj; } /** - * @brief Implementation of an MPI broadcast for types that have a corresponding MPI datatype, i.e. for which a - * specialization of mpi::mpi_type has been defined. + * @brief Implementation of an MPI broadcast for types that have a corresponding MPI datatype. + * + * @details If mpi::has_env is false or if the communicator size is < 2, it does nothing. Otherwise, it calls + * `MPI_Bcast`. * - * @details It throws an exception in case a call to the MPI C library fails. + * It throws an exception in case the call to the MPI C library fails. 
* * @tparam T Type to be broadcasted. - * @param x Object to be broadcasted. + * @param x Object to be broadcasted (into). * @param c mpi::communicator. * @param root Rank of the root process. */ template requires(has_mpi_type) void mpi_broadcast(T &x, communicator c = {}, int root = 0) { + // in case there is no active MPI environment or if the communicator size is < 2, do nothing + if (!has_env || c.size() < 2) return; + + // make the MPI C library call check_mpi_call(MPI_Bcast(&x, 1, mpi_type::get(), root, c.get()), "MPI_Bcast"); } /** - * @brief Implementation of an MPI reduce for types that have a corresponding MPI datatype, i.e. for which a - * specialization of mpi::mpi_type has been defined. + * @brief Implementation of an MPI reduce for types that have a corresponding MPI datatype. + * + * @details If mpi::has_env is false or if the communicator size is < 2, it returns a copy of the input object. + * Otherwise, it calls `MPI_Allreduce` or `MPI_Reduce` with a default constructed output object. * - * @details It throws an exception in case a call to the MPI C library fails. + * It throws an exception in case the call to the MPI C library fails. * * @tparam T Type to be reduced. * @param x Object to be reduced. @@ -232,29 +338,39 @@ namespace mpi { * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. - * @return The result of the reduction. + * @return Result of the reduction. 
*/ template requires(has_mpi_type) T mpi_reduce(T const &x, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - T b; - auto d = mpi_type::get(); - if (!all) - // old MPI implementations may require a non-const send buffer - check_mpi_call(MPI_Reduce(const_cast(&x), &b, 1, d, op, root, c.get()), "MPI_Reduce"); // NOLINT - else - check_mpi_call(MPI_Allreduce(const_cast(&x), &b, 1, d, op, c.get()), "MPI_Allreduce"); // NOLINT - return b; + // in case there is no active MPI environment or if the communicator size is < 2, return the input object + if (!has_env || c.size() < 2) return x; + + // make the MPI C library call with a default constructed output object + T res; + if (all) { + check_mpi_call(MPI_Allreduce(&x, &res, 1, mpi_type::get(), op, c.get()), "MPI_Allreduce"); + } else { + check_mpi_call(MPI_Reduce(&x, &res, 1, mpi_type::get(), op, root, c.get()), "MPI_Reduce"); + } + return res; } /** - * @brief Implementation of an in-place MPI reduce for types that have a corresponding MPI datatype, i.e. for which - * a specialization of mpi::mpi_type has been defined. + * @brief Implementation of an MPI reduce that reduces directly into an existing output object for types that have a + * corresponding MPI datatype. + * + * @details If the addresses of the input and output objects are equal, the reduction is done in place. * - * @details It throws an exception in case a call to the MPI C library fails. + * If mpi::has_env is false or if the communicator size is < 2, it either does nothing (in place) or copies the input + * into the output object. Otherwise, it calls `MPI_Allreduce` or `MPI_Reduce` (with `MPI_IN_PLACE`). + * + * It throws an exception in case the call to the MPI C library fails and it is expected that either all or none of + * the receiving processes choose the in place option. * * @tparam T Type to be reduced. - * @param x Object to be reduced. + * @param x_in Object to be reduced. + * @param x_out Object to be reduced into. 
* @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. @@ -262,33 +378,91 @@ namespace mpi { */ template requires(has_mpi_type) - void mpi_reduce_in_place(T &x, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - if (!all) - check_mpi_call(MPI_Reduce((c.rank() == root ? MPI_IN_PLACE : &x), &x, 1, mpi_type::get(), op, root, c.get()), "MPI_Reduce"); - else - check_mpi_call(MPI_Allreduce(MPI_IN_PLACE, &x, 1, mpi_type::get(), op, c.get()), "MPI_Allreduce"); + void mpi_reduce_into(T const &x_in, T &x_out, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { + // check if the reduction is in place + auto in_ptr = static_cast(&x_in); + auto out_ptr = static_cast(&x_out); + bool const in_place = (in_ptr == out_ptr); + if (all) { + EXPECTS_WITH_MESSAGE(all_equal(static_cast(in_place), c), + "Either zero or all receiving processes have to choose the in place option in mpi_reduce_into"); + } + + // in case there is no active MPI environment or if the communicator size is < 2, do nothing (in place) or copy + if (!has_env || c.size() < 2) { + if (!in_place) x_out = x_in; + return; + } + + // make the MPI C library call + if (in_place && (c.rank() == root || all)) in_ptr = MPI_IN_PLACE; + if (all) { + check_mpi_call(MPI_Allreduce(in_ptr, out_ptr, 1, mpi_type::get(), op, c.get()), "MPI_Allreduce"); + } else { + check_mpi_call(MPI_Reduce(in_ptr, out_ptr, 1, mpi_type::get(), op, root, c.get()), "MPI_Reduce"); + } } /** - * @brief Checks if a given object is equal across all ranks in the given communicator. + * @brief Implementation of an MPI gather for types that have a corresponding MPI datatype. * - * @details It requires that there is a specialized `mpi_reduce` for the given type `T` and that it is equality - * comparable as well as default constructible. 
+ * @details It constructs an output vector, resizes it on receiving ranks to the size of the communicator and calls + * mpi::mpi_gather_into. On non-receiving ranks the output vector is empty. * - * It makes two calls to mpi::all_reduce, one with `MPI_MIN` and the other with `MPI_MAX`, and compares their results. + * @tparam T Type to be gathered. + * @param x Object to be gathered. + * @param c mpi::communicator. + * @param root Rank of the root process. + * @param all Should all processes receive the result of the gather. + * @return `std::vector` containing the gathered objects. + */ + template + requires(has_mpi_type) + std::vector mpi_gather(T const &x, communicator c = {}, int root = 0, bool all = false) { + std::vector res(c.rank() == root || all ? c.size() : 0); + mpi_gather_into(x, res, c, root, all); + return res; + } + + /** + * @brief Implementation of an MPI gather that gathers directly into an existing output range for types that have a + * corresponding MPI datatype. * - * @note `MPI_MIN` and `MPI_MAX` need to make sense for the given type `T`. + * @details If mpi::has_env is false or if the communicator size is < 2, it copies the input object into the range. + * Otherwise, it calls `MPI_Allgather` or `MPI_Gather. * - * @tparam T Type to be checked. - * @param x Object to be equality compared. + * It throws an exception in case a call to the MPI C library fails and it expects that the range size on receiving + * processes is equal the communicator size. + * + * @tparam T Type to be gathered. + * @tparam R MPICompatibleRange type to be gathered into. + * @param x Object to be gathered. + * @param rg Range to be gathered into. * @param c mpi::communicator. - * @return If the given object is equal on all ranks, it returns true. Otherwise, it returns false. + * @param root Rank of the root process. + * @param all Should all processes receive the result of the gather. 
*/ - template bool all_equal(T const &x, communicator c = {}) { - if (!has_env) return true; - auto min_obj = all_reduce(x, c, MPI_MIN); - auto max_obj = all_reduce(x, c, MPI_MAX); - return min_obj == max_obj; + template + requires(has_mpi_type && std::same_as>>) + void mpi_gather_into(T const &x, R &&rg, communicator c = {}, int root = 0, bool all = false) { // NOLINT (ranges need not be forwarded) + // check the size of the output range + if (c.rank() == root || all) { + EXPECTS_WITH_MESSAGE(c.size() == std::ranges::size(rg), "Output range size is not equal the number of ranks in mpi_gather_into"); + } + + // in case there is no active MPI environment or if the communicator size is < 2, copy the input into the range + if (!has_env || c.size() < 2) { + std::ranges::copy(std::views::single(x), std::ranges::begin(rg)); + return; + } + + // make the MPI C library call + using value_t = std::ranges::range_value_t; + if (all) { + check_mpi_call(MPI_Allgather(&x, 1, mpi_type::get(), std::ranges::data(rg), 1, mpi_type::get(), c.get()), "MPI_Allgather"); + } else { + check_mpi_call(MPI_Gather(&x, 1, mpi_type::get(), std::ranges::data(rg), 1, mpi_type::get(), root, c.get()), "MPI_Gather"); + } } /** @} */ diff --git a/c++/mpi/lazy.hpp b/c++/mpi/lazy.hpp deleted file mode 100644 index e142ca56..00000000 --- a/c++/mpi/lazy.hpp +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2024 Simons Foundation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0.txt -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -// Authors: Thomas Hahn, Alexander Hampel, Olivier Parcollet, Nils Wentzell - -/** - * @file - * @brief Provides a struct and tags to represent lazy MPI communication. - */ - -#pragma once - -#include "./communicator.hpp" - -#include - -namespace mpi { - - namespace tag { - - /** - * @ingroup mpi_lazy - * @brief Tag to specify a lazy MPI reduce call. - */ - struct reduce {}; - - /** - * @ingroup mpi_lazy - * @brief Tag to specify a lazy MPI scatter call. - */ - struct scatter {}; - - /** - * @ingroup mpi_lazy - * @brief Tag to specify a lazy MPI gather call. - */ - struct gather {}; - - } // namespace tag - - /** - * @addtogroup mpi_lazy - * @{ - */ - - /** - * @brief Represents a lazy MPI communication. - * - * @tparam Tag An mpi::tag to specify the kind of MPI communication. - * @tparam T Type to be communicated. - */ - template struct lazy { - /// Object to be communicated. - T rhs; - - /// mpi::communicator used in the lazy communication. - communicator c; - - /// Rank of the root process. - int root{}; - - /// Whether to use the `MPI_Allxxx` operation - bool all{}; - - /// `MPI_Op` used in the lazy communication (only relevant if mpi::tag::reduce is used). - MPI_Op op{}; - }; - - /** - * @brief Type trait to check if a type is mpi::lazy. - * @tparam T Type to be checked. - */ - template inline constexpr bool is_mpi_lazy = false; - - /** - * @brief Spezialization of mpi::is_mpi_lazy. - * - * @tparam Tag Type to specify the kind of MPI call. - * @tparam T Type to be checked. - */ - template inline constexpr bool is_mpi_lazy> = true; - - /** @} */ - -} // namespace mpi diff --git a/c++/mpi/macros.hpp b/c++/mpi/macros.hpp index 1dec58d4..cd566580 100644 --- a/c++/mpi/macros.hpp +++ b/c++/mpi/macros.hpp @@ -87,12 +87,12 @@ #ifdef NDEBUG -#define EXPECTS(X) -#define ASSERT(X) -#define ENSURES(X) -#define EXPECTS_WITH_MESSAGE(X, ...) -#define ASSERT_WITH_MESSAGE(X, ...) -#define ENSURES_WITH_MESSAGE(X, ...) 
+#define EXPECTS(X) {} +#define ASSERT(X) {} +#define ENSURES(X) {} +#define EXPECTS_WITH_MESSAGE(X, ...) {} +#define ASSERT_WITH_MESSAGE(X, ...) {} +#define ENSURES_WITH_MESSAGE(X, ...) {} #else diff --git a/c++/mpi/mpi.hpp b/c++/mpi/mpi.hpp index 592214c3..46ac1be6 100644 --- a/c++/mpi/mpi.hpp +++ b/c++/mpi/mpi.hpp @@ -27,7 +27,6 @@ #include "./datatypes.hpp" #include "./environment.hpp" #include "./generic_communication.hpp" -#include "./lazy.hpp" #include "./monitor.hpp" #include "./operators.hpp" #include "./pair.hpp" diff --git a/c++/mpi/pair.hpp b/c++/mpi/pair.hpp index e8900ea8..115d6ed1 100644 --- a/c++/mpi/pair.hpp +++ b/c++/mpi/pair.hpp @@ -16,7 +16,7 @@ /** * @file - * @brief Provides an MPI broadcast and reduce for std::pair. + * @brief Provides an MPI broadcast and reduce for `std::pair`. */ #pragma once @@ -35,13 +35,13 @@ namespace mpi { */ /** - * @brief Implementation of an MPI broadcast for a std::pair. + * @brief Implementation of an MPI broadcast for a `std::pair`. * - * @details Simply calls the generic mpi::broadcast for the first and second element of the pair. + * @details It calls the generic mpi::broadcast for the first and second element of the pair. * * @tparam T1 Type of the first element of the pair. * @tparam T2 Type of the second element of the pair. - * @param p std::pair to broadcast. + * @param p `std::pair` to broadcast. * @param c mpi::communicator. * @param root Rank of the root process. */ @@ -51,18 +51,18 @@ namespace mpi { } /** - * @brief Implementation of an MPI reduce for a std::pair. + * @brief Implementation of an MPI reduce for a `std::pair`. * - * @details Simply calls the generic mpi::reduce for the first and second element of the pair. + * @details It calls the generic mpi::reduce for the first and second element of the pair separately. * * @tparam T1 Type of the first element of the pair. * @tparam T2 Type of the second element of the pair. - * @param p std::pair to be reduced. 
+ * @param p `std::pair` to be reduced. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. - * @return std::pair containing the result of each individual reduction. + * @return `std::pair` containing the results of the two reductions. */ template auto mpi_reduce(std::pair const &p, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { diff --git a/c++/mpi/ranges.hpp b/c++/mpi/ranges.hpp index 5ca79124..63ff0d11 100644 --- a/c++/mpi/ranges.hpp +++ b/c++/mpi/ranges.hpp @@ -16,7 +16,7 @@ /** * @file - * @brief Provides an MPI broadcast, reduce, scatter and gather for contiguous ranges. + * @brief Provides an MPI broadcast, reduce, scatter and gather for generic ranges. */ #pragma once @@ -33,8 +33,13 @@ #include #include +#include +#include +#include #include #include +#include +#include #include namespace mpi { @@ -45,425 +50,262 @@ namespace mpi { */ /** - * @brief Implementation of an MPI broadcast for an mpi::contiguous_sized_range object. + * @brief Implementation of an MPI broadcast for `std::ranges::sized_range` objects. * - * @details If mpi::has_mpi_type is true for the value type of the range, then the range is broadcasted using a simple - * `MPI_Bcast`. Otherwise, the generic mpi::broadcast is called for each element of the range. + * @details The behaviour of this function is as follows: + * - If the number of elements to be broadcasted is zero, it does nothing. + * - If the range is contiguous with an MPI compatible value type, it calls `MPI_Bcast` and broadcasts the elements + * from the input range on the root process to all other processes. + * - Otherwise, it calls mpi::broadcast for each element separately. * - * It throws an exception in case a call to the MPI C library fails and it expects that the sizes of the ranges are - * equal across all processes. 
+ * It throws an exception in case a call to the MPI C library fails and it expects that the input range size is equal + * on all processes. * - * If the ranges are empty or if mpi::has_env is false or if the communicator size is < 2, it does nothing. - * - * @note It is recommended to use the generic mpi::broadcast for supported types, e.g. `std::vector`, `std::array` or - * `std::string`. It is the user's responsibility to ensure that ranges have the correct sizes. - * - * @code{.cpp} - * // create a vector on all ranks - * auto vec = std::vector(5); - * - * if (comm.rank() == 0) { - * // on rank 0, initialize the vector and broadcast the first 3 elements - * vec = {1, 2, 3, 0, 0}; - * mpi::broadcast_range(std::span{vec.data(), 3}, comm); - * } else { - * // on other ranks, broadcast to the last 3 elements of the vector - * mpi::broadcast_range(std::span{vec.data() + 2, 3}, comm); - * } - * - * // output result - * for (auto x : vec) std::cout << x << " "; - * std::cout << std::endl; - * @endcode - * - * Output (with 4 processes): - * - * ``` - * 1 2 3 0 0 - * 0 0 1 2 3 - * 0 0 1 2 3 - * 0 0 1 2 3 - * ``` - * - * @tparam R mpi::contiguous_sized_range type. - * @param rg Range to broadcast. + * @tparam R `std::ranges::sized_range` type. + * @param rg Range to be broadcasted (into). * @param c mpi::communicator. * @param root Rank of the root process. 
*/ - template void broadcast_range(R &&rg, communicator c = {}, int root = 0) { // NOLINT (ranges need not be forwarded) - // check the sizes of all ranges - using value_t = std::ranges::range_value_t; - auto const size = std::ranges::size(rg); - EXPECTS_WITH_MESSAGE(all_equal(size, c), "Range sizes are not equal across all processes in mpi::broadcast_range"); - - // do nothing if the range is empty, if MPI is not initialized or if the communicator size is < 2 - if (size == 0 || !has_env || c.size() < 2) return; - - // broadcast the range - if constexpr (has_mpi_type) - // make an MPI C library call for MPI compatible value types - check_mpi_call(MPI_Bcast(std::ranges::data(rg), size, mpi_type::get(), root, c.get()), "MPI_Bcast"); - else - // otherwise call the specialized mpi_broadcast for each element - for (auto &val : rg) broadcast(val, c, root); - } + template void broadcast_range(R &&rg, communicator c = {}, int root = 0) { // NOLINT (ranges need not be forwarded) + // check the size of the range + auto size = static_cast(std::ranges::size(rg)); + EXPECTS_WITH_MESSAGE(all_equal(size, c), "Range sizes are not equal on all processes in mpi::broadcast_range"); - /** - * @brief Implementation of an in-place MPI reduce for an mpi::contiguous_sized_range object. - * - * @details If mpi::has_mpi_type is true for the value type of the range, then the range is reduced using a simple - * `MPI_Reduce` or `MPI_Allreduce` with `MPI_IN_PLACE`. Otherwise, the specialized `mpi_reduce_in_place` is called - * for each element in the range. - * - * It throws an exception in case a call to the MPI C library fails and it expects that the sizes of the ranges are - * equal across all processes. - * - * If the ranges are empty or if mpi::has_env is false or if the communicator size is < 2, it does nothing. - * - * @note It is recommended to use the generic mpi::reduce_in_place and mpi::all_reduce_in_place for supported types, - * e.g. `std::vector` or `std::array`. 
It is the user's responsibility to ensure that ranges have the correct sizes. - * - * @code{.cpp} - * // create a vector on all ranks - * auto vec = std::vector{0, 1, 2, 3, 4}; - * - * // in-place reduce the middle elements only on rank 0 - * mpi::reduce_in_place_range(std::span{vec.data() + 1, 3}, comm); - * - * // output result - * for (auto x : vec) std::cout << x << " "; - * std::cout << std::endl; - * @endcode - * - * Output (with 4 processes): - * - * ``` - * 0 1 2 3 4 - * 0 1 2 3 4 - * 0 1 2 3 4 - * 0 4 8 12 4 - * ``` - * - * @tparam R mpi::contiguous_sized_range type. - * @param rg Range to reduce. - * @param c mpi::communicator. - * @param root Rank of the root process. - * @param all Should all processes receive the result of the reduction. - * @param op `MPI_Op` used in the reduction. - */ - template - void reduce_in_place_range(R &&rg, communicator c = {}, int root = 0, bool all = false, // NOLINT (ranges need not be forwarded) - MPI_Op op = MPI_SUM) { - // check the sizes of all ranges - using value_t = std::ranges::range_value_t; - auto const size = std::ranges::size(rg); - EXPECTS_WITH_MESSAGE(all_equal(size, c), "Range sizes are not equal across all processes in mpi::reduce_in_place_range"); - - // do nothing if the range is empty, if MPI is not initialized or if the communicator size is < 2 - if (size == 0 || !has_env || c.size() < 2) return; - - // reduce the ranges - if constexpr (has_mpi_type) { - // make an MPI C library call for MPI compatible value types - auto data = std::ranges::data(rg); - if (!all) - check_mpi_call(MPI_Reduce((c.rank() == root ? 
MPI_IN_PLACE : data), data, size, mpi_type::get(), op, root, c.get()), "MPI_Reduce"); - else - check_mpi_call(MPI_Allreduce(MPI_IN_PLACE, data, size, mpi_type::get(), op, c.get()), "MPI_Allreduce"); + // do nothing if no elements are broadcasted + if (size <= 0) return; + + // call the MPI C library if the ranges are contiguous with MPI compatible value types, otherwise do element-wise + // broadcasts + if constexpr (MPICompatibleRange) { + // in case there is no active MPI environment or if the communicator size is < 2, do nothing + if (!has_env || c.size() < 2) return; + + // make the MPI C library call (allow the number of elements to larger than INT_MAX) + constexpr long max_int = std::numeric_limits::max(); + for (long offset = 0; size > 0; offset += max_int, size -= max_int) { + auto const count = static_cast(std::min(size, max_int)); + check_mpi_call(MPI_Bcast(std::ranges::data(rg) + offset, count, mpi_type>::get(), root, c.get()), "MPI_Bcast"); + } } else { - // otherwise call the specialized mpi_reduce_in_place for each element - for (auto &val : rg) mpi_reduce_in_place(val, c, root, all, op); + // otherwise call the generic broadcast for each element separately + for (auto &x : rg) broadcast(x, c, root); } } /** - * @brief Implementation of an MPI reduce for an mpi::contiguous_sized_range. - * - * @details If mpi::has_mpi_type is true for the value type of the range, then the range is reduced using a simple - * `MPI_Reduce` or `MPI_Allreduce`. Otherwise, the specialized `mpi_reduce` is called for each element in the range. - * - * It throws an exception in case a call to the MPI C library fails and it expects that the sizes of the input ranges - * are equal across all processes and that they are equal to the size of the output range on receiving processes. - * - * If the input ranges are empty, it does nothing. If mpi::has_env is false or if the communicator size is < 2, it - * simply copies the input range to the output range. 
- * - * @note It is recommended to use the generic mpi::reduce and mpi::all_reduce for supported types, e.g. `std::vector` - * or `std::array`. It is the user's responsibility to ensure that ranges have the correct sizes. - * - * @code{.cpp} - * // create input and output vectors on all ranks - * auto in_vec = std::vector{0, 1, 2, 3, 4}; - * auto out_vec = std::vector(in_vec.size(), 0); - * - * // allreduce the middle elements of the input vector to the last elements of the output vector - * mpi::reduce_range(std::span{in_vec.data() + 1, 3}, std::span{out_vec.data() + 2, 3}, comm, 0, true); - * - * // output result - * for (auto x : out_vec) std::cout << x << " "; - * std::cout << std::endl; - * @endcode - * - * Output (with 4 processes): - * - * ``` - * 0 0 4 8 12 - * 0 0 4 8 12 - * 0 0 4 8 12 - * 0 0 4 8 12 - * ``` - * - * @tparam R1 mpi::contiguous_sized_range type. - * @tparam R2 mpi::contiguous_sized_range type. - * @param in_rg Range to reduce. - * @param out_rg Range to reduce into. + * @brief Implementation of an MPI reduce for `std::ranges::sized_range` objects. + * + * @details The behaviour of this function is as follows: + * - If the number of elements to be reduced is zero, it does nothing. + * - If the range is contiguous with an MPI compatible value type, it calls `MPI_Reduce` or `MPI_Allreduce` to reduce + * the elements in the input ranges into the output ranges on receiving ranks. + * - If the input and output ranges point to the same data, the reduction is done in place. + * - Otherwise, it calls mpi::reduce_into for each input-output element pair separately. + * + * It throws an exception in case a call to the MPI C library fails and it expects + * - that the input range size on all processes and the output range size on receiving processes are equal and + * - that either all or none of the receiving processes choose the in place option. + * + * @tparam R1 `std::ranges::sized_range` type. + * @tparam R2 `std::ranges::sized_range` type. 
+ * @param in_rg Range to be reduced. + * @param out_rg Range to be reduced into. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. */ - template + template void reduce_range(R1 &&in_rg, R2 &&out_rg, communicator c = {}, int root = 0, bool all = false, // NOLINT (ranges need not be forwarded) MPI_Op op = MPI_SUM) { - // check input and output ranges - auto const in_size = std::ranges::size(in_rg); - EXPECTS_WITH_MESSAGE(all_equal(in_size, c), "Input range sizes are not equal across all processes in mpi::reduce_range"); - if (c.rank() == root || all) { - EXPECTS_WITH_MESSAGE(in_size == std::ranges::size(out_rg), "Input and output range sizes are not equal in mpi::reduce_range"); - } + // check the size of the input range + auto size = static_cast(std::ranges::size(in_rg)); + EXPECTS_WITH_MESSAGE(all_equal(size, c), "Input range sizes are not equal on all processes in mpi::reduce_range"); - // do nothing if the input range is empty - if (in_size == 0) return; + // do nothing if no elements are reduced + if (size <= 0) return; - // simply copy if there is no active MPI environment or if the communicator size is < 2 - if (!has_env || c.size() < 2) { - std::ranges::copy(std::forward(in_rg), std::ranges::data(out_rg)); - return; - } + // check the size of the output range + bool const receives = (c.rank() == root || all); + if (receives) EXPECTS_WITH_MESSAGE(size == std::ranges::size(out_rg), "Input and output range sizes are not equal in mpi::reduce_range"); + + // call the MPI C library if the ranges are contiguous with MPI compatible value types + if constexpr (MPICompatibleRange && MPICompatibleRange) { + static_assert(std::same_as>, std::remove_cvref_t>>, + "Value types of input and output ranges not compatible in mpi::reduce_range"); - // reduce the ranges - using in_value_t = std::ranges::range_value_t; - using out_value_t = 
std::ranges::range_value_t; - if constexpr (has_mpi_type && std::same_as) { - // make an MPI C library call for MPI compatible value types - auto const in_data = std::ranges::data(in_rg); - auto out_data = std::ranges::data(out_rg); - if (!all) - check_mpi_call(MPI_Reduce(in_data, out_data, in_size, mpi_type::get(), op, root, c.get()), "MPI_Reduce"); - else - check_mpi_call(MPI_Allreduce(in_data, out_data, in_size, mpi_type::get(), op, c.get()), "MPI_Allreduce"); + // check if the reduction is in place + bool const in_place = (static_cast(std::ranges::data(in_rg)) == static_cast(std::ranges::data(out_rg))); + if (all) { + EXPECTS_WITH_MESSAGE(all_equal(static_cast(in_place), c), + "Either zero or all receiving processes have to choose the in place option in mpi::reduce_range"); + } + + // in case there is no active MPI environment or if the communicator size is < 2, copy to the output range + if (!has_env || c.size() < 2) { + std::ranges::copy(std::forward(in_rg), std::ranges::data(out_rg)); + return; + } + + // make the MPI C library call (allow the number of elements to larger than INT_MAX) + constexpr long max_int = std::numeric_limits::max(); + for (long offset = 0; size > 0; offset += max_int, size -= max_int) { + auto in_data = static_cast(std::ranges::data(in_rg) + offset); + auto out_data = std::ranges::data(out_rg) + offset; + if (receives and in_place) in_data = MPI_IN_PLACE; + auto const count = static_cast(std::min(size, max_int)); + if (all) { + check_mpi_call(MPI_Allreduce(in_data, out_data, count, mpi_type>::get(), op, c.get()), "MPI_Allreduce"); + } else { + check_mpi_call(MPI_Reduce(in_data, out_data, count, mpi_type>::get(), op, root, c.get()), "MPI_Reduce"); + } + } } else { - // otherwise call the specialized mpi_reduce for each element - // the size of the output range is arbitrary on non-recieving ranks, so we cannot use transform on them - if (c.rank() == root || all) - std::ranges::transform(std::forward(in_rg), std::ranges::data(out_rg), 
[&](auto const &val) { return reduce(val, c, root, all, op); }); - else - // the assignment is needed in case a lazy object is returned - std::ranges::for_each(std::forward(in_rg), [&](auto const &val) { [[maybe_unused]] out_value_t ignore = reduce(val, c, root, all, op); }); + // fallback to element-wise reduction if the range is not contiguous with an MPI compatible value type + if (size <= std::ranges::size(out_rg)) { + // on ranks where the output range size is large enough, reduce into the output elements + for (auto &&[x_in, x_out] : itertools::zip(in_rg, out_rg)) reduce_into(x_in, x_out, c, root, all, op); + } else { + // on all other ranks, reduce into a dummy output object (needs to be default constructible) + using out_value_t = std::ranges::range_value_t; + if constexpr (std::is_default_constructible_v) { + out_value_t out_dummy{}; + for (auto &&x_in : in_rg) reduce_into(x_in, out_dummy, c, root, all, op); + } else { + // if it is not default constructible, is there something we can do? + throw std::runtime_error("Cannot default construct dummy object in mpi::reduce_range"); + } + } } } /** - * @brief Implementation of an MPI scatter for an mpi::contiguous_sized_range. - * - * @details If mpi::has_mpi_type is true for the value type of the range, then the range is scattered as evenly as - * possible across the processes in the communicator using a simple `MPI_Scatterv`. Otherwise an exception is thrown. - * - * The user can specify a chunk size which is used to divide the input range into chunks of the specified size. The - * number of chunks are then distributed evenly across the processes in the communicator. The size of the input range - * is required to be a multiple of the given chunk size, otherwise an exception is thrown. - * - * It throws an exception in case a call to the MPI C library fails and it expects that the output ranges have the - * correct size and that they add up to the size of the input range on the root process. 
- * - * If the input range is empty on root, it does nothing. If mpi::has_env is false or if the communicator size is < 2, - * it simply copies the input range to the output range. - * - * @note It is recommended to use the generic mpi::scatter for supported types, e.g. `std::vector`. It is the user's - * responsibility to ensure that the ranges have the correct sizes (mpi::chunk_length can be useful to do that). - * - * @code{.cpp} - * // create input and output vectors on all ranks - * auto in_vec = std::vector{}; - * if (comm.rank() == 0) in_vec = {0, 1, 2, 3, 4, 5, 6, 7}; - * auto out_vec = std::vector(mpi::chunk_length(5, comm.size(), comm.rank()), 0); - * - * // scatter the middle elements of the input vector from rank 0 to all ranks - * mpi::scatter_range(std::span{in_vec.data() + 1, 5}, out_vec, 5, comm); - * - * // output result - * for (auto x : out_vec) std::cout << x << " "; - * std::cout << std::endl; - * @endcode - * - * Output (with 2 processes): - * - * ``` - * 4 5 - * 1 2 3 - * ``` - * - * @tparam R1 mpi::contiguous_sized_range type. - * @tparam R2 mpi::contiguous_sized_range type. - * @param in_rg Range to scatter. - * @param out_rg Range to scatter into. - * @param in_size Size of the input range on root (must also be given on non-root ranks). + * @brief Implementation of an MPI scatter for mpi::MPICompatibleRange objects. + * + * @details The behaviour of this function is as follows: + * - If the number of elements to be scattered is zero, it does nothing. + * - Otherwise, it calls `MPI_Scatterv` to scatter the input range from the root process to the output ranges on all + * other processes. + * + * By default, the input range is scattered as evenly as possible from the root process to all other processes in the + * communicator. To change that, the user can specify a chunk size which is used to divide the number of elements to + * be scattered into chunks of the specified size. 
Then, instead of single elements, the chunks are distributed evenly + * across the processes in the communicator. + * + * It throws an exception if a call to the MPI C library fails and it expects + * - that the number of elements to be scattered is equal on all processes, + * - that the size of the input range on the root process is equal to the number of elements to be scattered and + * - that the output range size is equal to the number of elements to be received on all processes. + * + * @note In place scattering is not supported. + * + * @tparam R1 mpi::MPICompatibleRange type. + * @tparam R2 mpi::MPICompatibleRange type. + * @param in_rg Range to be scattered. + * @param out_rg Range to be scattered into. + * @param scatter_size Number of elements to be scattered. + * @param c mpi::communicator. + * @param root Rank of the root process. + * @param chunk_size Size of the chunks to scatter. */ - template - requires(std::same_as, std::ranges::range_value_t>) - void scatter_range(R1 &&in_rg, R2 &&out_rg, long in_size, communicator c = {}, int root = 0, // NOLINT (ranges need not be forwarded) + template + requires(std::same_as>, std::remove_cvref_t>>) + void scatter_range(R1 &&in_rg, R2 &&out_rg, long scatter_size, communicator c = {}, int root = 0, // NOLINT (ranges need not be forwarded) long chunk_size = 1) { - // check the sizes of the input and output ranges + // check the number of elements to be scattered + EXPECTS_WITH_MESSAGE(all_equal(scatter_size, c), "Number of elements to be scattered is not equal on all processes in mpi::scatter_range"); + + // do nothing if no elements are scattered + if (scatter_size == 0) return; + + // check the size of the input range on root if (c.rank() == root) { - EXPECTS_WITH_MESSAGE(in_size == std::ranges::size(in_rg), "Input range size not equal to provided size in mpi::scatter_range"); + EXPECTS_WITH_MESSAGE(scatter_size == std::ranges::size(in_rg), + "Input range size on root is not equal the number of elements to be scattered in 
mpi::scatter_range"); } - EXPECTS_WITH_MESSAGE(in_size == all_reduce(std::ranges::size(out_rg), c), - "Output range sizes don't add up to input range size in mpi::scatter_range"); - // do nothing if the input range is empty - if (in_size == 0) return; + // check the size of the output range + auto const recvcount = static_cast(chunk_length(scatter_size, c.size(), c.rank(), chunk_size)); + EXPECTS_WITH_MESSAGE(recvcount == std::ranges::size(out_rg), + "Output range size is not equal the number of elements to be received in mpi::scatter_range"); - // simply copy if there is no active MPI environment or if the communicator size is < 2 + // in case there is no active MPI environment or if the communicator size is < 2, copy to output range if (!has_env || c.size() < 2) { std::ranges::copy(std::forward(in_rg), std::ranges::data(out_rg)); return; } - // check the size of the output range - int recvcount = static_cast(chunk_length(in_size, c.size(), c.rank(), chunk_size)); - EXPECTS_WITH_MESSAGE(recvcount == std::ranges::size(out_rg), "Output range size is incorrect in mpi::scatter_range"); - // prepare arguments for the MPI call auto sendcounts = std::vector(c.size()); auto displs = std::vector(c.size() + 1, 0); for (int i = 0; i < c.size(); ++i) { - sendcounts[i] = static_cast(chunk_length(in_size, c.size(), i, chunk_size)); + sendcounts[i] = static_cast(chunk_length(scatter_size, c.size(), i, chunk_size)); displs[i + 1] = sendcounts[i] + displs[i]; } - // scatter the range - using in_value_t = std::ranges::range_value_t; - using out_value_t = std::ranges::range_value_t; - if constexpr (has_mpi_type && has_mpi_type) { - // make an MPI C library call for MPI compatible value types - auto const in_data = std::ranges::data(in_rg); - auto out_data = std::ranges::data(out_rg); - check_mpi_call(MPI_Scatterv(in_data, sendcounts.data(), displs.data(), mpi_type::get(), out_data, recvcount, - mpi_type::get(), root, c.get()), - "MPI_Scatterv"); - } else { - // otherwise throw an 
exception - throw std::runtime_error{"Error in mpi::scatter_range: Types with no corresponding datatype can only be all-gathered"}; - } + // make the MPI C library call + check_mpi_call(MPI_Scatterv(std::ranges::data(in_rg), sendcounts.data(), displs.data(), mpi_type>::get(), + std::ranges::data(out_rg), recvcount, mpi_type>::get(), root, c.get()), + "MPI_Scatterv"); } /** - * @brief Implementation of an MPI gather for an mpi::contiguous_sized_range. - * - * @details If mpi::has_mpi_type is true for the value type of the input ranges, then the ranges are gathered using a - * simple `MPI_Gatherv` or `MPI_Allgatherv`. Otherwise, each process broadcasts its elements to all other processes - * which implies that `all == true` is required in this case. - * - * It throws an exception in case a call to the MPI C library fails and it expects that the sizes of the input ranges - * add up to the given size of the output range and that the output ranges have the correct size on receiving - * processes. - * - * If the input ranges are all empty, it does nothing. If mpi::has_env is false or if the communicator size is < 2, it - * simply copies the input range to the output range. - * - * @note It is recommended to use the generic mpi::gather for supported types, e.g. `std::vector` and `std::string`. - * It is the user's responsibility to ensure that the ranges have the correct sizes. - * - * @code{.cpp} - * // create input and output vectors on all ranks - * auto in_vec = std::vector{0, 1, 2, 3, 4}; - * auto out_vec = std::vector(3 * comm.size(), 0); + * @brief Implementation of an MPI gather for mpi::MPICompatibleRange objects. * - * // gather the middle elements of the input vectors from all ranks on rank 0 - * mpi::gather_range(std::span{in_vec.data() + 1, 3}, out_vec, 3 * comm.size(), comm); + * @details The behaviour of this function is as follows: + * - If the number of elements to be gathered is zero, it does nothing. 
+ * - Otherwise, it calls `MPI_Gatherv` or `MPI_Allgatherv` to gather the elements from the input ranges on all + * processes into the output ranges on receiving processes. * - * // output result - * for (auto x : out_vec) std::cout << x << " "; - * std::cout << std::endl; - * @endcode + * This is the inverse operation of mpi::scatter_range. The numbers of elements to be gathered do not have to be equal + * on all processes. * - * Output (with 2 processes): + * It throws an exception in case a call to the MPI C library fails and it expects that the output range sizes on + * receiving processes is the number of elements to be gathered. * - * ``` - * 0 0 0 0 0 0 0 0 0 0 0 0 - * 0 0 0 0 0 0 0 0 0 0 0 0 - * 0 0 0 0 0 0 0 0 0 0 0 0 - * 1 2 3 1 2 3 1 2 3 1 2 3 - * ``` + * @note In place gathering is not supported. * - * @tparam R1 mpi::contiguous_sized_range type. - * @tparam R2 mpi::contiguous_sized_range type. - * @param in_rg Range to gather. - * @param out_rg Range to gather into. - * @param out_size Size of the output range on receiving processes (must also be given on non-receiving ranks). + * @tparam R1 mpi::MPICompatibleRange type. + * @tparam R2 mpi::MPICompatibleRange type. + * @param in_rg Range to be gathered. + * @param out_rg Range to be gathered into. * @param c mpi::communicator. * @param root Rank of the root process. - * @param all Should all processes receive the result of the reduction. + * @param all Should all processes receive the result of the gather operation. 
*/ - template - void gather_range(R1 &&in_rg, R2 &&out_rg, long out_size, communicator c = {}, int root = 0, // NOLINT (ranges need not be forwarded) - bool all = false) { - // check the sizes of the input and output ranges - auto const in_size = std::ranges::size(in_rg); - EXPECTS_WITH_MESSAGE(out_size = all_reduce(in_size, c), "Input range sizes don't add up to output range size in mpi::gather_range"); + template + requires(std::same_as>, std::remove_cvref_t>>) + void gather_range(R1 &&in_rg, R2 &&out_rg, communicator c = {}, int root = 0, bool all = false) { // NOLINT (ranges need not be forwarded) + // get the receive counts (sendcount from each process) and the displacements + auto sendcount = static_cast(std::ranges::size(in_rg)); + auto recvcounts = all_gather(sendcount, c); + auto displs = std::vector(c.size() + 1, 0); + std::partial_sum(recvcounts.begin(), recvcounts.end(), displs.begin() + 1); + + // do nothing if there are no elements to gather + if (displs.back() == 0) return; + + // check the size of the output range on receiving ranks if (c.rank() == root || all) { - EXPECTS_WITH_MESSAGE(out_size == std::ranges::size(out_rg), "Output range size is incorrect in mpi::gather_range"); + EXPECTS_WITH_MESSAGE(displs.back() == std::ranges::size(out_rg), + "Output range size is not equal the number of elements to be received in mpi::gather_range"); } - // do nothing if the output range is empty - if (out_size == 0) return; - - // simply copy if there is no active MPI environment or if the communicator size is < 2 + // in case there is no active MPI environment or if the communicator size is < 2, copy to the output range if (!has_env || c.size() < 2) { std::ranges::copy(std::forward(in_rg), std::ranges::data(out_rg)); return; } - // prepare arguments for the MPI call - auto recvcounts = std::vector(c.size()); - auto displs = std::vector(c.size() + 1, 0); - int sendcount = in_size; - if (!all) - check_mpi_call(MPI_Gather(&sendcount, 1, mpi_type::get(), 
recvcounts.data(), 1, mpi_type::get(), root, c.get()), "MPI_Gather"); - else - check_mpi_call(MPI_Allgather(&sendcount, 1, mpi_type::get(), recvcounts.data(), 1, mpi_type::get(), c.get()), "MPI_Allgather"); - for (int i = 0; i < c.size(); ++i) displs[i + 1] = recvcounts[i] + displs[i]; - - // gather the ranges - using in_value_t = std::ranges::range_value_t; - using out_value_t = std::ranges::range_value_t; - if constexpr (has_mpi_type && has_mpi_type) { - // make an MPI C library call for MPI compatible value types - auto const in_data = std::ranges::data(in_rg); - auto out_data = std::ranges::data(out_rg); - if (!all) - check_mpi_call(MPI_Gatherv(in_data, sendcount, mpi_type::get(), out_data, recvcounts.data(), displs.data(), - mpi_type::get(), root, c.get()), - "MPI_Gatherv"); - else - check_mpi_call(MPI_Allgatherv(in_data, sendcount, mpi_type::get(), out_data, recvcounts.data(), displs.data(), - mpi_type::get(), c.get()), - "MPI_Allgatherv"); + // make the MPI C library call + if (all) { + check_mpi_call(MPI_Allgatherv(std::ranges::data(in_rg), sendcount, mpi_type>::get(), std::ranges::data(out_rg), + recvcounts.data(), displs.data(), mpi_type>::get(), c.get()), + "MPI_Allgatherv"); } else { - if (all) { - // if all == true, each process broadcasts it elements to all other ranks - for (int i = 0; i < c.size(); ++i) { - auto view = std::views::drop(out_rg, displs[i]) | std::views::take(displs[i + 1] - displs[i]); - if (c.rank() == i) std::ranges::copy(in_rg, std::ranges::begin(view)); - broadcast_range(view, c, i); - } - } else { - // otherwise throw an exception - throw std::runtime_error{"Error in mpi::gather_range: Types with no corresponding datatype can only be all-gathered"}; - } + check_mpi_call(MPI_Gatherv(std::ranges::data(in_rg), sendcount, mpi_type>::get(), std::ranges::data(out_rg), + recvcounts.data(), displs.data(), mpi_type>::get(), root, c.get()), + "MPI_Gatherv"); } } diff --git a/c++/mpi/string.hpp b/c++/mpi/string.hpp index f20c1474..d5ce188f 
100644 --- a/c++/mpi/string.hpp +++ b/c++/mpi/string.hpp @@ -16,7 +16,7 @@ /** * @file - * @brief Provides an MPI broadcast for std::string. + * @brief Provides an MPI broadcast and gather for `std::string`. */ #pragma once @@ -35,39 +35,39 @@ namespace mpi { */ /** - * @brief Implementation of an MPI broadcast for a std::string. + * @brief Implementation of an MPI broadcast for a `std::string`. * * @details It first broadcasts the size of the string from the root process to all other processes, then resizes the * string on all non-root processes and calls mpi::broadcast_range with the (resized) input string. * - * @param s std::string to broadcast. + * @param s `std::string` to broadcast (into). * @param c mpi::communicator. * @param root Rank of the root process. */ inline void mpi_broadcast(std::string &s, communicator c, int root) { - size_t len = s.size(); - broadcast(len, c, root); - if (c.rank() != root) s.resize(len); + auto count = s.size(); + broadcast(count, c, root); + if (c.rank() != root) s.resize(count); broadcast_range(s, c, root); } /** - * @brief Implementation of an MPI gather for a std::string. + * @brief Implementation of an MPI gather for a `std::string` that gathers directly into an existing output string. * - * @details It first all-reduces the sizes of the input string from all processes and then calls mpi::gather_range. + * @details It first all-reduces the sizes of the input strings from all processes. On receiving ranks, the output + * string is resized to the reduced size in case it has not the correct size. On non-receiving ranks, the output + * string is always unmodified. Then mpi::gather_range with the input and (resized) output strings is called. * - * @param s std::string to gather. + * @param s_in `std::string` to gather. + * @param s_out `std::string` to gather into. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result. 
- * @return std::string containing the result of the gather operation. */ - inline std::string mpi_gather(std::string const &s, communicator c = {}, int root = 0, bool all = false) { - long len = static_cast(all_reduce(s.size(), c)); - std::string res{}; - if (c.rank() == root || all) res.resize(len); - gather_range(s, res, len, c, root, all); - return res; + inline void mpi_gather_into(std::string const &s_in, std::string &s_out, communicator c = {}, int root = 0, bool all = false) { + auto const gather_size = mpi::all_reduce(s_in.size(), c); + if ((c.rank() == root || all) && s_out.size() != s_in.size()) s_out.resize(gather_size); + gather_range(s_in, s_out, c, root, all); } /** @} */ diff --git a/c++/mpi/utils.hpp b/c++/mpi/utils.hpp index 9be3cbbf..597514e5 100644 --- a/c++/mpi/utils.hpp +++ b/c++/mpi/utils.hpp @@ -25,7 +25,6 @@ #include #include -#include namespace mpi { @@ -34,27 +33,6 @@ namespace mpi { * @{ */ - namespace detail { - - // Helper struct to get the regular type of a type. - template struct _regular { - using type = T; - }; - - // Spezialization of _regular for types with a `regular_type` type alias. - template struct _regular> { - using type = typename T::regular_type; - }; - - } // namespace detail - - /** - * @ingroup utilities - * @brief Type trait to get the regular type of a type. - * @tparam T Type to check. - */ - template using regular_t = typename detail::_regular>::type; - /** * @brief Check the success of an MPI call. * @details It checks if the given error code returned by an MPI routine is equal to `MPI_SUCCESS`. If it isn't, it @@ -74,13 +52,6 @@ namespace mpi { if (errcode != MPI_SUCCESS) throw std::runtime_error("MPI error " + std::to_string(errcode) + " in MPI routine " + mpi_routine); } - /** - * @brief A concept that checks if a range type is contiguous and sized. - * @tparam R Range type. 
- */ - template - concept contiguous_sized_range = std::ranges::contiguous_range && std::ranges::sized_range; - /** @} */ } // namespace mpi diff --git a/c++/mpi/vector.hpp b/c++/mpi/vector.hpp index bfd58535..91de1a29 100644 --- a/c++/mpi/vector.hpp +++ b/c++/mpi/vector.hpp @@ -16,7 +16,7 @@ /** * @file - * @brief Provides an MPI broadcast, reduce, scatter and gather for std::vector. + * @brief Provides an MPI broadcast, reduce, scatter and gather for `std::vector`. */ #pragma once @@ -28,6 +28,8 @@ #include +#include +#include #include namespace mpi { @@ -38,96 +40,108 @@ namespace mpi { */ /** - * @brief Implementation of an MPI broadcast for a std::vector. + * @brief Implementation of an MPI broadcast for a `std::vector`. * * @details It first broadcasts the size of the vector from the root process to all other processes, then resizes the * vector on all non-root processes and calls mpi::broadcast_range with the (resized) input vector. * * @tparam T Value type of the vector. - * @param v std::vector to broadcast. + * @param v `std::vector` to broadcast. * @param c mpi::communicator. * @param root Rank of the root process. */ template void mpi_broadcast(std::vector &v, communicator c = {}, int root = 0) { - auto bsize = v.size(); - broadcast(bsize, c, root); - if (c.rank() != root) v.resize(bsize); + auto count = v.size(); + broadcast(count, c, root); + if (c.rank() != root) v.resize(count); broadcast_range(v, c, root); } /** - * @brief Implementation of an in-place MPI reduce for a std::vector. + * @brief Implementation of an MPI reduce for a `std::vector`. * - * @details It simply calls mpi::reduce_in_place_range with the given input vector. + * @details It first constructs the output vector with its value type equal to the return type of + * `reduce(std::declval())`. On receiving ranks, the output vector is then resized to the size of the input vector. + * On non-receiving ranks, the output vector is always empty. 
+ * + * It calls mpi::reduce_range with the input and constructed output vector. * * @tparam T Value type of the vector. - * @param v std::vector to reduce. + * @param v `std::vector` to reduce. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. + * @return `std::vector` containing the result of the reduction. */ - template void mpi_reduce_in_place(std::vector &v, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - reduce_in_place_range(v, c, root, all, op); + template auto mpi_reduce(std::vector const &v, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { + using value_type = std::remove_cvref_t()))>; + std::vector res(c.rank() == root || all ? v.size() : 0); + reduce_range(v, res, c, root, all, op); + return res; } /** - * @brief Implementation of an MPI reduce for a std::vector. + * @brief Implementation of an MPI reduce for a `std::vector` that reduces directly into a given output vector. * - * @details It simply calls mpi::reduce_range with the given input vector and an empty vector of the same size. + * @details It first resizes the output vector to the size of the input vector on receiving ranks and then calls + * mpi::reduce_range with the input and (resized) output vector. * - * @tparam T Value type of the vector. - * @param v std::vector to reduce. + * @tparam T1 Value type of the vector to be reduced. + * @tparam T2 Value type of the vector to be reduced into. + * @param v_in `std::vector` to reduce. + * @param v_out `std::vector` to reduce into. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. - * @return std::vector containing the result of each individual reduction. 
*/ - template - auto mpi_reduce(std::vector const &v, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - std::vector> res(c.rank() == root || all ? v.size() : 0); - reduce_range(v, res, c, root, all, op); - return res; + template + void mpi_reduce_into(std::vector const &v_in, std::vector &v_out, communicator c = {}, int root = 0, bool all = false, + MPI_Op op = MPI_SUM) { + if ((c.rank() == root || all) && v_out.size() != v_in.size()) v_out.resize(v_in.size()); + reduce_range(v_in, v_out, c, root, all, op); } /** - * @brief Implementation of an MPI scatter for a std::vector. + * @brief Implementation of an MPI scatter for a `std::vector` that scatters directly into an existing output vector. * - * @details It first broadcasts the size of the vector from the root process to all other processes and then calls - * mpi::scatter_range. + * @details It first broadcasts the size of the input vector from the root process to all other processes and + * resizes the output vector if it has not the correct size. The size of the output vector is determined with + * mpi::chunk_length. Then mpi::scatter_range is called with the input and (resized) output vector. * * @tparam T Value type of the vector. - * @param v std::vector to scatter. + * @param v_in `std::vector` to scatter. + * @param v_out `std::vector` to scatter into. * @param c mpi::communicator. * @param root Rank of the root process. - * @return std::vector containing the result of the scatter operation. 
*/ - template auto mpi_scatter(std::vector const &v, communicator c = {}, int root = 0) { - auto bsize = v.size(); - broadcast(bsize, c, root); - std::vector res(chunk_length(bsize, c.size(), c.rank())); - scatter_range(v, res, bsize, c, root); - return res; + template void mpi_scatter_into(std::vector const &v_in, std::vector &v_out, communicator c = {}, int root = 0) { + auto scatter_size = static_cast(v_in.size()); + broadcast(scatter_size, c, root); + auto const recvcount = chunk_length(scatter_size, c.size(), c.rank()); + if (v_out.size() != recvcount) v_out.resize(recvcount); + scatter_range(v_in, v_out, scatter_size, c, root); } /** - * @brief Implementation of an MPI gather for a std::vector. + * @brief Implementation of an MPI gather for a `std::vector` that gathers directly into an existing output vector. * - * @details It first all-reduces the sizes of the input vectors from all processes and then calls mpi::gather_range. + * @details It first all-reduces the sizes of the input vectors from all processes. On receiving ranks, the output + * vector is resized to the reduced size in case it has not the correct size. On non-receiving ranks, the output + * vector is always unmodified. Then mpi::gather_range with the input and (resized) output vector is called. * * @tparam T Value type of the vector. - * @param v std::vector to gather. + * @param v_in `std::vector` to gather. + * @param v_out `std::vector` to gather into. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result. - * @return std::vector containing the result of the gather operation. */ - template auto mpi_gather(std::vector const &v, communicator c = {}, int root = 0, bool all = false) { - long bsize = mpi::all_reduce(v.size(), c); - std::vector res(c.rank() == root || all ? 
bsize : 0); - gather_range(v, res, bsize, c, root, all); - return res; + template void mpi_gather_into(std::vector const &v_in, std::vector &v_out, communicator c = {}, int root = 0, bool all = false) { + auto const gather_size = mpi::all_reduce(v_in.size(), c); + if ((c.rank() == root || all) && v_out.size() != gather_size) v_out.resize(gather_size); + gather_range(v_in, v_out, c, root, all); } /** @} */ diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml index 48ea50bb..031fd722 100644 --- a/doc/DoxygenLayout.xml +++ b/doc/DoxygenLayout.xml @@ -22,6 +22,7 @@ + @@ -50,17 +51,11 @@ - - - - - - - + diff --git a/doc/documentation.md b/doc/documentation.md index a73e2696..515ab404 100644 --- a/doc/documentation.md +++ b/doc/documentation.md @@ -35,17 +35,7 @@ Furthermore, it offers tools to simplify the creation of custom MPI operations u ## Collective MPI communication -The following generic collective communications are defined in @ref coll_comm "Collective MPI communication": - -* @ref mpi::all_gather "all_gather" -* @ref mpi::all_reduce "all_reduce" -* @ref mpi::all_reduce_in_place "all_reduce_in_place" -* @ref mpi::broadcast "broadcast" -* @ref mpi::gather "gather" -* @ref mpi::reduce "reduce" -* @ref mpi::reduce_in_place "reduce_in_place" -* @ref mpi::scatter "scatter" - +**mpi** provides several generic @ref coll_comm "Collective MPI communication". They offer a much simpler interface than their MPI C library analogs. For example, the following broadcasts a `std::vector` from the process with rank 0 to all others: @@ -61,18 +51,11 @@ MPI_Bcast(vec.data(), static_cast(vec.size()), MPI_DOUBLE, 0, MPI_COMM_WORL Under the hood, the generic mpi::broadcast implementation calls the specialized @ref "mpi::mpi_broadcast(std::vector< T >&, mpi::communicator, int)". -The other generic functions are implemented in the same way. -See the "Functions" section in @ref coll_comm to check which datatypes are supported out of the box. 
+Other generic functions in **mpi** work similarly. +See the "Functions" section in @ref coll_comm to check which datatypes and MPI operations are supported out of the box. In case your datatype is not supported, you are free to provide your own specialization. -Furthermore, there are several functions to simplify communicating generic, contiguous ranges: -- mpi::broadcast_range, -- mpi::gather_range, -- mpi::reduce_in_place_range, -- mpi::reduce_range and -- mpi::scatter_range. - ## Lazy MPI communication @ref mpi_lazy can be used to provied collective MPI communication for lazy expression types. diff --git a/doc/ex4.md b/doc/ex4.md new file mode 100644 index 00000000..7d619d5b --- /dev/null +++ b/doc/ex4.md @@ -0,0 +1,63 @@ +@page ex4 Example 4: Provide custom spezializations + +[TOC] + +In this example, we show how to write a specialized `mpi_reduce_into` for a custom type. + +```cpp +#include +#include +#include + +// Custom type. +class foo { + public: + // Constructor. + foo(int x = 5) : x_(x) {} + + // Get the value stored in the class. + int x() const { return x_; } + + // Specialization of mpi_reduce_into for the custom type. 
+ friend void mpi_reduce_into(foo const &f_in, foo &f_out, mpi::communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { + mpi::reduce_into(f_in.x_, f_out.x_, c, root, all, op); + } + + private: + int x_; +}; + +int main(int argc, char *argv[]) { + // initialize MPI environment + mpi::environment env(argc, argv); + mpi::communicator world; + + // create a vector of foo objects + std::vector vec {foo{1}, foo{2}, foo{3}, foo{4}, foo{5}}; + + // reduce the vector of foo objects + auto result = mpi::reduce(vec, world); + + // print the result on rank 0 + if (world.rank() == 0) { + std::cout << "Reduced vector: "; + for (auto const &f : result) std::cout << f.x() << " "; + std::cout << "\n"; + } +} +``` + +Output (running with `-n 4`): + +``` +Reduced vector: 4 8 12 16 20 +``` + +Note that by providing a simple `mpi_reduce_into` for our custom `foo` type, we are able to reduce a `std::vector` of +`foo` objects without any additional work. + +Under the hood, each `foo` object is reduced spearately using the above specialization. +For large amounts of data or in performance critical code sections, this might not be desired. +In such a case, it is usally better to make the type MPI compatible such that the reduction can be done with a single +call to MPI C library. +See @ref ex3 for more details. diff --git a/doc/examples.md b/doc/examples.md index 94c15688..2b85e2a2 100644 --- a/doc/examples.md +++ b/doc/examples.md @@ -5,13 +5,16 @@ - @ref ex1 "Example 1: Hello world!" - @ref ex2 "Example 2: Use monitor to communicate errors" - @ref ex3 "Example 3: Custom type and operator" +- @ref ex4 "Example 4: Provide custom spezializations" @section compiling Compiling the examples -All examples have been compiled on a MacBook Pro with an Apple M2 Max chip and [open-mpi](https://www.open-mpi.org/) 4.1.5. -We further used clang 16.0.6 together with cmake 3.27.2. 
+All examples have been compiled on a MacBook Pro with an Apple M2 Max chip and [open-mpi](https://www.open-mpi.org/) +5.0.1. +We further used clang 19.1.7 together with cmake 3.31.5. -Assuming that the actual example code is in a file `main.cpp`, the following generic `CMakeLists.txt` should work for all examples: +Assuming that the actual example code is in a file `main.cpp`, the following generic `CMakeLists.txt` should work for +all examples: ```cmake cmake_minimum_required(VERSION 3.20) @@ -28,7 +31,7 @@ include (FetchContent) FetchContent_Declare( mpi GIT_REPOSITORY https://github.com/TRIQS/mpi.git - GIT_TAG 1.2.x + GIT_TAG 1.3.x ) FetchContent_MakeAvailable(mpi) diff --git a/doc/groups.dox b/doc/groups.dox index 8675dcfb..0e77a743 100644 --- a/doc/groups.dox +++ b/doc/groups.dox @@ -51,17 +51,46 @@ * @brief Generic and specialized implementations for a subset of collective MPI communications (broadcast, reduce, * gather, scatter). * - * @details The generic functions (mpi::broadcast, mpi::reduce, mpi::scatter, ...) call their more specialized - * counterparts (e.g. mpi::mpi_broadcast, mpi::mpi_reduce, mpi::mpi_scatter, ...). - * - * **mpi** provides (some) implementations for - * - scalar types that have a corresponding mpi::mpi_type, - * - `std::vector` and `std::array` types with MPI compatible value types, - * - `std::string` and - * - `std::pair`. - * - * Furthermore, there are several functions to simplify communicating generic, contiguous ranges: mpi::broadcast_range, - * mpi::gather_range, mpi::reduce_in_place_range, mpi::reduce_range and mpi::scatter_range. + * @details **mpi** provides several generic collective communications routines as well as specializations for certain + * common types. 
The generic functions usually simply forward the call to one of the specializations (`mpi_broadcast`, + * `mpi_gather`, `mpi_gather_into`, `mpi_reduce`, `mpi_reduce_into`, `mpi_scatter` or `mpi_scatter_into`) using ADL but + * can also perform some additional checks. It is therefore recommended to always use the generic versions when + * possible. + * + * Here is a short overview of the available generic functions: + * - mpi::broadcast: Calls the specialization `mpi_broadcast`. + * - mpi::gather: Calls the specialization `mpi_gather` if it is implemented. Otherwise, it calls mpi::gather_into with + * a default constructed output object. + * - mpi::gather_into: Calls the specialization `mpi_gather_into`. + * - mpi::reduce: Calls the specialization `mpi_reduce` if it is implemented. Otherwise, it calls mpi::reduce_into with + * a default constructed output object. + * - mpi::reduce_in_place: Calls the specialization `mpi_reduce_into` with the same input and output object. + * - mpi::reduce_into: Calls the specialization `mpi_reduce_into`. + * - mpi::scatter: Calls the specialization `mpi_scatter` if it is implemented. Otherwise, it calls mpi::scatter_into + * with a default constructed output object. + * - mpi::scatter_into: Calls the specialization `mpi_scatter_into`. + * + * In case, all processes should receive the result of the MPI operation, one can use the convenience functions + * mpi::all_gather, mpi::all_gather_into, mpi::all_reduce, mpi::all_reduce_in_place or mpi::all_reduce_into. They + * forward the given arguments to their "non-all" counterparts with the `all` argument set to true. + * + * **mpi** provides various specializations for several types. For example, + * - for MPI compatible types, i.e. 
for types that have a corresponding mpi::mpi_type, it provides an + * @ref "mpi::mpi_broadcast(T &x, mpi::communicator, int)" "mpi_broadcast", + * @ref "mpi::mpi_reduce(T const &, mpi::communicator, int, bool, MPI_Op)" "mpi_reduce", + * @ref "mpi::mpi_reduce_into(T const &, T &, mpi::communicator, int, bool, MPI_Op)" "mpi_reduce_into", + * @ref "mpi::mpi_gather(T const &, mpi::communicator, int, bool)" "mpi_gather" and an + * @ref "mpi::mpi_gather_into(T const &, R &&, mpi::communicator, int, bool)" "mpi_gather_into". + * - for strings, it provides an @ref "mpi::mpi_broadcast(std::string &, mpi::communicator, int)" "mpi_broadcast" + * and an @ref "mpi::mpi_gather_into(std::string const &, std::string &, mpi::communicator, int, bool)" + * "mpi_gather_into". + * + * Users are encouraged to implement their own specializations for their custom types or in case a specialization is + * missing (see e.g. @ref ex4). + * + * Furthermore, there are several functions to simplify communicating (contiguous) ranges: mpi::broadcast_range, + * mpi::gather_range, mpi::reduce_range and mpi::scatter_range. Some of these range functions are more generic than + * others. Please check the documentation of the specific function for more details. */ /** diff --git a/test/c++/custom_types.hpp b/test/c++/custom_types.hpp new file mode 100644 index 00000000..f803ee7d --- /dev/null +++ b/test/c++/custom_types.hpp @@ -0,0 +1,69 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#pragma once + +#include +#include + +#include +#include +#include + +// Custom type which is MPI compatible. +struct mpi_t { + long a{0}; + bool operator==(const mpi_t &) const = default; + mpi_t operator+(mpi_t x) const { + x.a += a; + return x; + } +}; + +// Tie the data (to make it MPI compatible). +inline auto tie_data(mpi_t const &x) { return std::tie(x.a); } + +// Custom type which is not MPI compatible but has specialized mpi_xxx implementations. +struct non_mpi_t { + int a{1}; + bool operator==(const non_mpi_t &) const = default; +}; + +// Specialize mpi_broadcast for non_mpi_t. +void mpi_broadcast(non_mpi_t &x, mpi::communicator c = {}, int root = 0) { broadcast(x.a, c, root); } + +// Specialize mpi_reduce_into for non_mpi_t. +void mpi_reduce_into(non_mpi_t const &in, non_mpi_t &out, mpi::communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { + mpi::reduce_into(in.a, out.a, c, root, all, op); +} + +// Specialize mpi_gather for non_mpi_t. +std::vector mpi_gather(non_mpi_t const &x, mpi::communicator c = {}, int root = 0, bool all = false) { + std::vector a_vec = gather(x.a, c, root, all); + std::vector res{}; + if (c.rank() == root || all) { + res.resize(c.size()); + std::ranges::transform(a_vec, res.begin(), [](int a) { return non_mpi_t{a}; }); + } + return res; +} + +// Specialize mpi_gather_into for non_mpi_t. 
+void mpi_gather_into(non_mpi_t const &x, auto &&rg, mpi::communicator c = {}, int root = 0, bool all = false) { + auto vec = mpi_gather(x, c, root, all); + if (c.rank() == root || all) std::ranges::copy(vec, std::ranges::begin(rg)); +} diff --git a/test/c++/mpi_array.cpp b/test/c++/mpi_array.cpp deleted file mode 100644 index 884d0c80..00000000 --- a/test/c++/mpi_array.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2020-2024 Simons Foundation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0.txt -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -// Authors: Thomas Hahn, Nils Wentzell - -#include "./non_mpi_t.hpp" - -#include -#include -#include - -#include -#include -#include - -TEST(MPI, ArrayBroadcastMPIType) { - // broadcast an array with an MPI type - mpi::communicator world; - std::array arr{}; - if (world.rank() == 0) std::iota(arr.begin(), arr.end(), 0); - mpi::broadcast(arr, world); - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i], i); -} - -TEST(MPI, ArrayBroadcastTypeWithSpezializedMPIBroadcast) { - // broadcast an array with a type that has a specialized mpi_broadcast - mpi::communicator world; - std::array arr{}; - if (world.rank() == 0) { - for (int i = 0; i < 5; ++i) arr[i].a = i; - } - mpi::broadcast(arr, world); - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i].a, i); -} - -TEST(MPI, ArrayReduceInPlaceMPIType) { - // in-place reduce an array with an MPI type - mpi::communicator world; - std::array arr{0, 1, 2, 3, 4}; - mpi::reduce_in_place(arr, world); - if (world.rank() == 0) - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i], i * world.size()); - else - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i], i); - - // in-place allreduce an array with an MPI type - std::iota(arr.begin(), arr.end(), 0); - mpi::all_reduce_in_place(arr, world); - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i], i * world.size()); -} - -TEST(MPI, ArrayReduceInPlaceTypeWithSpezializedMPIReduceInPlace) { - // in-place reduce an array with a type that has a specialized mpi_reduce_in_place - mpi::communicator world; - std::array arr{}; - for (int i = 0; i < 5; ++i) arr[i].a = i; - mpi::reduce_in_place(arr, world); - if (world.rank() == 0) - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i].a, i * world.size()); - else - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i].a, i); - - // in-place allreduce an array with a type that has a specialized mpi_reduce_in_place - for (int i = 0; i < 5; ++i) arr[i].a = i; - mpi::all_reduce_in_place(arr, world); - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i].a, i * world.size()); -} - -TEST(MPI, 
ArrayReduceMPIType) { - // reduce an array with complex numbers - mpi::communicator world; - using arr_type = std::array, 7>; - const int size = 7; - arr_type arr{}; - for (int i = 0; i < size; ++i) arr[i] = std::complex(i, -i); - auto arr_reduced = mpi::reduce(arr, world); - if (world.rank() == 0) - for (int i = 0; i < size; ++i) EXPECT_EQ(arr_reduced[i], std::complex(i * world.size(), -i * world.size())); - else - EXPECT_EQ(arr_reduced, arr_type{}); - - // allreduce an array with complex numbers - auto arr_reduced_all = mpi::all_reduce(arr, world); - for (int i = 0; i < size; ++i) EXPECT_EQ(arr_reduced_all[i], std::complex(i * world.size(), -i * world.size())); -} - -TEST(MPI, EmptyArrayReduce) { - // reduce an empty array - mpi::communicator world; - std::array arr{}; - std::ignore = mpi::reduce(arr, world); -} - -MPI_TEST_MAIN; diff --git a/test/c++/mpi_broadcast.cpp b/test/c++/mpi_broadcast.cpp new file mode 100644 index 00000000..c7b91243 --- /dev/null +++ b/test/c++/mpi_broadcast.cpp @@ -0,0 +1,52 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include + +// Test broadcasting a single value/object. 
+template void test_broadcast(T root_value) { + mpi::communicator world; + for (int root = 0; root < world.size(); ++root) { + T bcast_value{}; + if (world.rank() == root) bcast_value = root_value; + mpi::broadcast(bcast_value, world, root); + EXPECT_EQ(bcast_value, root_value); + } +} + +TEST(MPI, BroadcastInteger) { test_broadcast(42); } + +TEST(MPI, BroadcastComplex) { test_broadcast(std::complex{1.0, 2.0}); } + +TEST(MPI, BroadcastCustomMPIType) { test_broadcast(mpi_t{42}); } + +TEST(MPI, BroadcastCustomNonMPIType) { test_broadcast(non_mpi_t{42}); } + +TEST(MPI, BroadcastString) { test_broadcast(std::string{"Hello World"}); } + +TEST(MPI, BroadcastPairOfStringAndComplex) { test_broadcast(std::make_pair(std::string{"Hello"}, std::complex{1.0, 2.0})); } + +TEST(MPI, BroadcastPairOfCustomMPITypeAndCustomNonMPIType) { test_broadcast(std::make_pair(mpi_t{42}, non_mpi_t{-5})); } + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_broadcast_array.cpp b/test/c++/mpi_broadcast_array.cpp new file mode 100644 index 00000000..e5a50dd6 --- /dev/null +++ b/test/c++/mpi_broadcast_array.cpp @@ -0,0 +1,82 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +// Check if two ranges are equal. 
+void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test broadcasting arrays. +template void test_broadcast_array(std::array const &root_values) { + mpi::communicator world; + auto arr = root_values; + + // broadcast an array from different roots + for (int root = 0; root < world.size(); ++root) { + arr = {}; + if (world.rank() == root) arr = root_values; + mpi::broadcast(arr, world, root); + expect_range_eq(arr, root_values); + } + + // broadcast an empty array + std::array empty_arr{}; + mpi::broadcast(empty_arr, world); + expect_range_eq(arr, root_values); +} + +TEST(MPI, BroadcastIntegerArray) { test_broadcast_array(std::array{1, 2, 3, 4, 5}); } + +TEST(MPI, BroadcastComplexArray) { + using namespace std::complex_literals; + test_broadcast_array(std::array, 5>{1.0 - 1.0i, 2.0 - 2.0i, 3.0 - 3.0i, 4.0 - 4.0i, 5.0 - 5.0i}); +} + +TEST(MPI, BroadcastCustomMPITypeArray) { test_broadcast_array(std::array{mpi_t{1}, mpi_t{2}, mpi_t{3}, mpi_t{4}, mpi_t{5}}); } + +TEST(MPI, BroadcastCustomNonMPITypeArray) { + test_broadcast_array(std::array{non_mpi_t{1}, non_mpi_t{2}, non_mpi_t{3}, non_mpi_t{4}, non_mpi_t{5}}); +} + +TEST(MPI, BroadcastStringArray) { test_broadcast_array(std::array{"Hello", "World", "MPI", "Broadcast", "Array"}); } + +TEST(MPI, BroadcastPairArray) { + test_broadcast_array(std::array, 5>{{{1, "Hello"}, {2, "World"}, {3, "MPI"}, {4, "Broadcast"}, {5, "Array"}}}); +} + +TEST(MPI, BroadcastArrayOfDoubleArrays) { + std::array, 5> root_values{}; + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 2; ++j) root_values[i][j] = i * 2 + j; + } + test_broadcast_array(root_values); +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_broadcast_range.cpp b/test/c++/mpi_broadcast_range.cpp new file mode 100644 index 00000000..bde14643 --- /dev/null +++ b/test/c++/mpi_broadcast_range.cpp @@ -0,0 +1,89 @@ +// Copyright (c) 
2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include + +// Check if two ranges are equal. +void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test broadcasting a range of objects. 
+template void test_broadcast_range(std::array root_values) { + mpi::communicator world; + std::array def_arr{}; + def_arr.fill(root_values[0]); + + // broadcast a contiguous range from different roots + auto arr = root_values; + for (int root = 0; root < world.size(); ++root) { + if (world.rank() == root) { + arr = root_values; + mpi::broadcast_range(std::span{arr.begin() + 2, 3}, world, root); + expect_range_eq(arr, root_values); + } else { + arr = def_arr; + mpi::broadcast_range(std::span{arr.begin(), 3}, world, root); + expect_range_eq(std::span{arr.begin(), 3}, std::span{root_values.begin() + 2, 3}); + expect_range_eq(std::span{arr.begin() + 3, 2}, std::span{def_arr.begin() + 3, 2}); + } + } + + // broadcast a view on a non-contiguous list + std::list list(def_arr.begin(), def_arr.end()); + if (world.rank() == 0) list.assign(root_values.begin(), root_values.end()); + mpi::broadcast_range(std::ranges::drop_view(list, 2), world); + if (world.rank() == 0) { + expect_range_eq(list, root_values); + } else { + expect_range_eq(std::ranges::drop_view(list, 2), std::ranges::drop_view(root_values, 2)); + expect_range_eq(std::ranges::take_view(list, 2), std::ranges::take_view(def_arr, 2)); + } +} + +TEST(MPI, BroadcastIntegerRange) { test_broadcast_range(std::array{1, 2, 3, 4, 5}); } + +TEST(MPI, BroadcastComplexRange) { + using namespace std::complex_literals; + test_broadcast_range(std::array, 5>{1.0 - 1.0i, 2.0 - 2.0i, 3.0 - 3.0i, 4.0 - 4.0i, 5.0 - 5.0i}); +} + +TEST(MPI, BroadcastCustomMPITypeRange) { test_broadcast_range(std::array{mpi_t{1}, mpi_t{2}, mpi_t{3}, mpi_t{4}, mpi_t{5}}); } + +TEST(MPI, BroadcastCustomNonMPITypeRange) { + test_broadcast_range(std::array{non_mpi_t{1}, non_mpi_t{2}, non_mpi_t{3}, non_mpi_t{4}, non_mpi_t{5}}); +} + +TEST(MPI, BroadcastStringRange) { test_broadcast_range(std::array{"Hello", "World", "MPI", "Broadcast", "Array"}); } + +TEST(MPI, BroadcastPairRange) { + test_broadcast_range(std::array, 5>{{{1, "Hello"}, {2, "World"}, {3, 
"MPI"}, {4, "Broadcast"}, {5, "Array"}}}); +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_broadcast_vector.cpp b/test/c++/mpi_broadcast_vector.cpp new file mode 100644 index 00000000..92ba8eef --- /dev/null +++ b/test/c++/mpi_broadcast_vector.cpp @@ -0,0 +1,88 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +// Check if two ranges are equal. +void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test broadcasting vectors. 
+template void test_broadcast_vector(std::vector const &root_values) { + mpi::communicator world; + auto vec = root_values; + + // broadcast a vector from different roots + for (int root = 0; root < world.size(); ++root) { + vec.clear(); + if (world.rank() == root) vec = root_values; + mpi::broadcast(vec, world, root); + expect_range_eq(vec, root_values); + } + + // broadcast an empty vector + if (world.rank() == 0) { + vec.clear(); + mpi::broadcast(vec, world); + EXPECT_TRUE(vec.empty()); + } else { + vec = root_values; + mpi::broadcast(vec, world); + EXPECT_TRUE(vec.empty()); + } +} + +TEST(MPI, BroadcastIntegerVector) { test_broadcast_vector(std::vector{1, 2, 3, 4, 5}); } + +TEST(MPI, BroadcastComplexVector) { + using namespace std::complex_literals; + test_broadcast_vector(std::vector>{1.0 - 1.0i, 2.0 - 2.0i, 3.0 - 3.0i, 4.0 - 4.0i, 5.0 - 5.0i}); +} + +TEST(MPI, BroadcastCustomMPITypeVector) { test_broadcast_vector(std::vector{mpi_t{1}, mpi_t{2}, mpi_t{3}, mpi_t{4}, mpi_t{5}}); } + +TEST(MPI, BroadcastCustomNonMPITypeVector) { + test_broadcast_vector(std::vector{non_mpi_t{1}, non_mpi_t{2}, non_mpi_t{3}, non_mpi_t{4}, non_mpi_t{5}}); +} + +TEST(MPI, BroadcastStringVector) { test_broadcast_vector(std::vector{"Hello", "World", "MPI", "Broadcast", "Array"}); } + +TEST(MPI, BroadcastPairVector) { + test_broadcast_vector(std::vector>{{{1, "Hello"}, {2, "World"}, {3, "MPI"}, {4, "Broadcast"}, {5, "Array"}}}); +} + +TEST(MPI, BroadcastVectorOfDoubleVectors) { + std::vector> root_values(5, std::vector(2)); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 2; ++j) root_values[i][j] = i * 2 + j; + } + test_broadcast_vector(root_values); +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_cplx.cpp b/test/c++/mpi_cplx.cpp deleted file mode 100644 index 13c14451..00000000 --- a/test/c++/mpi_cplx.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2022-2024 Simons Foundation -// Copyright (c) 2022 Hugo U.R. 
Strand -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0.txt -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Authors: Thomas Hahn, Hugo U.R. Strand - -#include -#include - -#include - -TEST(MPI, ComplexBroadcast) { - // broadcast a complex number - mpi::communicator world; - - std::complex cplx; - if (world.rank() == 0) cplx = std::complex(1., 2.); - - mpi::broadcast(cplx); - - EXPECT_EQ(cplx, std::complex(1., 2.)); -} - -MPI_TEST_MAIN; diff --git a/test/c++/mpi_gather.cpp b/test/c++/mpi_gather.cpp new file mode 100644 index 00000000..4d599285 --- /dev/null +++ b/test/c++/mpi_gather.cpp @@ -0,0 +1,127 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include +#include + +// Check if two ranges are equal. 
+void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test gathering single values/objects. +template void test_gather(std::vector result) { + mpi::communicator world; + + // gather from different roots + for (int root = 0; root < world.size(); ++root) { + // gather single objects into a vector + auto vec = mpi::gather(result[world.rank()], world, root); + if (world.rank() == root) + expect_range_eq(vec, result); + else + EXPECT_TRUE(vec.empty()); + + // gather single objects into an existing vector + if (world.rank() == root) { + vec.assign(world.size(), T{0}); + mpi::gather_into(result[world.rank()], vec, world, root); + expect_range_eq(vec, result); + } else { + vec.clear(); + mpi::gather_into(result[world.rank()], vec, world, root); + EXPECT_TRUE(vec.empty()); + } + } + + // allgather single objects into a vector + auto vec = mpi::all_gather(result[world.rank()], world); + expect_range_eq(vec, result); + + // allgather single objects into an existing vector + vec.assign(world.size(), T{0}); + mpi::all_gather_into(result[world.rank()], vec, world); + expect_range_eq(vec, result); +} + +TEST(MPI, GatherInteger) { + mpi::communicator world; + std::vector result(world.size()); + for (int i = 0; i < world.size(); ++i) result[i] = i + 1; + test_gather(result); +} + +TEST(MPI, GatherComplex) { + mpi::communicator world; + std::vector> result(world.size()); + for (int i = 0; i < world.size(); ++i) result[i] = std::complex{i + 1.0, -(i + 1.0)}; + test_gather(result); +} + +TEST(MPI, GatherCustomMPIType) { + mpi::communicator world; + std::vector result(world.size()); + for (int i = 0; i < world.size(); ++i) result[i] = mpi_t{i + 1}; + test_gather(result); +} + +TEST(MPI, GatherCustomNonMPIType) { + mpi::communicator world; + std::vector result(world.size()); + for (int i = 0; i < world.size(); ++i) result[i] = non_mpi_t{i 
+ 1}; + test_gather(result); +} + +// Test gathering a string. +TEST(MPI, GatherString) { + mpi::communicator world; + std::string str{}, result{}; + for (int i = 0; i < world.size(); ++i) { + for (int j = 0; j < i + 1; ++j) result += "a"; + result += std::to_string(i); + } + for (int i = 0; i < world.rank() + 1; ++i) str += "a"; + str += std::to_string(world.rank()); + + // gather strings + for (int root = 0; root < world.size(); ++root) { + auto str_gathered = mpi::gather(str, world, root); + if (world.rank() == root) + EXPECT_EQ(str_gathered, result); + else + EXPECT_TRUE(str_gathered.empty()); + } + + // allgather strings + auto str_gathered = mpi::all_gather(str); + EXPECT_EQ(str_gathered, result); + + // allgather empty strings + auto empty_str = mpi::all_gather(std::string{}); + EXPECT_TRUE(empty_str.empty()); +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_gather_range.cpp b/test/c++/mpi_gather_range.cpp new file mode 100644 index 00000000..2768db7b --- /dev/null +++ b/test/c++/mpi_gather_range.cpp @@ -0,0 +1,88 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include + +// Check if two ranges are equal. 
+void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test gathering a range of objects. +template void test_gather_range(std::vector const &values, std::vector const &result) { + mpi::communicator world; + + // gather on different roots + for (int root = 0; root < world.size(); ++root) { + // gather spans into a view of a vector + std::vector vec(result.size() * 2, T{0}); + mpi::gather_range(std::span{values}, std::ranges::drop_view(vec, result.size()), world, root); + if (world.rank() == root) { + expect_range_eq(std::ranges::drop_view(vec, result.size()), result); + expect_range_eq(std::ranges::take_view(vec, result.size()), std::vector(result.size(), T{0})); + } else { + expect_range_eq(vec, std::vector(result.size() * 2, T{0})); + } + } + + // allgather vectors into an oversized vector + std::vector vec(result.size() * 2, T{0}); + mpi::gather_range(values, std::span{vec.begin(), result.size()}, world, 0, true); + expect_range_eq(std::ranges::take_view(vec, result.size()), result); + expect_range_eq(std::ranges::drop_view(vec, result.size()), std::vector(result.size(), T{0})); +} + +TEST(MPI, GatherIntegerRange) { + mpi::communicator world; + std::vector values, result; + for (int i = 0; i < world.size(); ++i) { + for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i); + }; + for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank()); + test_gather_range(values, result); +} + +TEST(MPI, GatherComplexRange) { + mpi::communicator world; + std::vector> values, result; + for (int i = 0; i < world.size(); ++i) { + for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i, -i); + } + for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank(), -world.rank()); + test_gather_range(values, result); +} + +TEST(MPI, GatherCustomMPITypeRange) { + mpi::communicator world; 
+ std::vector values, result; + for (int i = 0; i < world.size(); ++i) { + for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i); + } + for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank()); + test_gather_range(values, result); +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_gather_vector.cpp b/test/c++/mpi_gather_vector.cpp new file mode 100644 index 00000000..f5a8cadd --- /dev/null +++ b/test/c++/mpi_gather_vector.cpp @@ -0,0 +1,102 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include + +// Check if two ranges are equal. +void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test gathering vectors. 
+template void test_gather_vector(std::vector const &values, std::vector const &result) { + mpi::communicator world; + + // gather on different roots + for (int root = 0; root < world.size(); ++root) { + if constexpr (mpi::has_mpi_type) { + // gather vectors into a new vector + auto vec = mpi::gather(values, world, root); + if (world.rank() == root) + expect_range_eq(vec, result); + else + EXPECT_TRUE(vec.empty()); + + // gather vectors into an existing vector + vec.clear(); + mpi::gather_into(values, vec, world, root); + if (world.rank() == root) + expect_range_eq(vec, result); + else + EXPECT_TRUE(vec.empty()); + } + + // gather empty vectors + auto vec = mpi::gather(std::vector{}, world, root); + EXPECT_TRUE(vec.empty()); + } + + // allgather vectors into a new vector + auto vec = mpi::all_gather(values, world); + expect_range_eq(vec, result); + + // allgather vectors into an existing vector + vec.clear(); + mpi::all_gather_into(values, vec, world); + expect_range_eq(vec, result); +} + +TEST(MPI, GatherIntegerVector) { + mpi::communicator world; + std::vector values, result; + for (int i = 0; i < world.size(); ++i) { + for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i); + } + for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank()); + test_gather_vector(values, result); +} + +TEST(MPI, GatherComplexVector) { + mpi::communicator world; + std::vector> values, result; + for (int i = 0; i < world.size(); ++i) { + for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i, -i); + } + for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank(), -world.rank()); + test_gather_vector(values, result); +} + +TEST(MPI, GatherCustomMPITypeVector) { + mpi::communicator world; + std::vector values, result; + for (int i = 0; i < world.size(); ++i) { + for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i); + } + for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank()); + 
test_gather_vector(values, result); +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_pair.cpp b/test/c++/mpi_pair.cpp deleted file mode 100644 index 46b9616d..00000000 --- a/test/c++/mpi_pair.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2021-2024 Simons Foundation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0.txt -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Authors: Thomas Hahn, Nils Wentzell - -#include -#include - -#include -#include -#include - -TEST(MPI, PairBroadcast) { - // broadcast a pair consisting of a string and a complex number - std::pair> p; - - auto str = std::string{"Hello"}; - auto cplx = std::complex(1.0, 2.0); - - mpi::communicator world; - if (world.rank() == 0) p = {str, cplx}; - - mpi::broadcast(p); - auto [str_bc, cplx_bc] = p; - EXPECT_EQ(str, str_bc); - EXPECT_EQ(cplx, cplx_bc); -} - -TEST(MPI, PairReduce) { - // reduce a pair of integers - mpi::communicator world; - auto r = world.rank(); - auto p = std::pair{1, r}; - - auto [r1, r2] = mpi::all_reduce(p); - auto nr = world.size(); - EXPECT_EQ(r1, nr); - EXPECT_EQ(r2, nr * (nr - 1) / 2); -} - -MPI_TEST_MAIN; diff --git a/test/c++/mpi_ranges.cpp b/test/c++/mpi_ranges.cpp deleted file mode 100644 index ad0ad63e..00000000 --- a/test/c++/mpi_ranges.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright (c) 2022-2024 Simons Foundation -// Copyright (c) 2022 Hugo U.R. Strand -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0.txt -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Authors: Thomas Hahn, Hugo U.R. Strand - -#include "./non_mpi_t.hpp" - -#include -#include - -#include -#include -#include - -TEST(MPI, RangesBroadcastMPIType) { - // broadcast a range with an MPI type - mpi::communicator world; - std::array arr{}; - if (world.rank() == 0) { - for (int i = 0; i < 5; ++i) arr[i] = i; - } - mpi::broadcast_range(arr, world); - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i], i); -} - -TEST(MPI, RangesBroadcastTypeWithSpezializedMPIBroadcast) { - // broadcast a range with a type that has a specialized mpi_broadcast - mpi::communicator world; - std::vector vec(5); - if (world.rank() == 0) { - for (int i = 0; i < 5; ++i) vec[i].a = i; - } - mpi::broadcast_range(vec, world); - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec[i].a, i); -} - -TEST(MPI, RangesReduceInPlaceMPIType) { - // in-place reduce a range with an MPI type - mpi::communicator world; - std::array arr{0, 1, 2, 3, 4}; - mpi::reduce_in_place_range(arr, world); - if (world.rank() == 0) - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i], i * world.size()); - else - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i], i); - - // in-place allreduce a range with an MPI type - arr = {0, 1, 2, 3, 4}; - mpi::reduce_in_place_range(arr, world, 0, true); - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr[i], i * world.size()); -} - -TEST(MPI, RangesReduceInPlaceTypeWithSpezializedMPIReduceInPlace) { - // in-place reduce a range with a type that has a specialized mpi_reduce_in_place - mpi::communicator world; - std::vector vec(5); - for (int i = 0; i < 5; ++i) vec[i].a = i; - 
mpi::reduce_in_place_range(vec, world); - if (world.rank() == 0) - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec[i].a, i * world.size()); - else - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec[i].a, i); - - // in-place allreduce a range with a type that has a specialized mpi_reduce_in_place - for (int i = 0; i < 5; ++i) vec[i].a = i; - mpi::reduce_in_place_range(vec, world, 0, true); - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec[i].a, i * world.size()); -} - -TEST(MPI, RangesReduceMPIType) { - // reduce a range with an MPI type - mpi::communicator world; - std::array arr{0, 1, 2, 3, 4}, arr_red{}; - mpi::reduce_range(arr, arr_red, world); - if (world.rank() == 0) - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr_red[i], i * world.size()); - else - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr_red[i], 0); - - // allreduce a range with an MPI type - arr = {0, 1, 2, 3, 4}; - arr_red = {}; - mpi::reduce_range(arr, arr_red, world, 0, true); - for (int i = 0; i < 5; ++i) EXPECT_EQ(arr_red[i], i * world.size()); -} - -TEST(MPI, RangesReduceTypeWithSpezializedMPIReduceInPlace) { - // reduce a range with a type that has a specialized mpi_reduce_in_place - mpi::communicator world; - std::vector vec(5, non_mpi_t{}), vec_red(5, non_mpi_t{}); - for (int i = 0; i < 5; ++i) vec[i].a = i; - mpi::reduce_range(vec, vec_red, world); - if (world.rank() == 0) - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec_red[i].a, i * world.size()); - else - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec_red[i].a, non_mpi_t{}.a); - - // allreduce a range with a type that has a specialized mpi_reduce_in_place - for (int i = 0; i < 5; ++i) vec[i].a = i; - mpi::reduce_range(vec, vec_red, world, 0, true); - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec_red[i].a, i * world.size()); -} - -TEST(MPI, RangesScatterMPIType) { - // scatter a range with an MPI type - mpi::communicator world; - auto const rank = world.rank(); - auto sizes = std::vector(world.size()); - for (int i = 0; i < world.size(); ++i) sizes[i] = 
static_cast(mpi::chunk_length(10, world.size(), i)); - auto acc_sizes = std::vector(world.size() + 1, 0); - std::partial_sum(sizes.begin(), sizes.end(), std::next(acc_sizes.begin())); - std::vector vec(10, 0), vec_scattered(sizes[rank], 0); - if (rank == 0) { - for (int i = 0; i < 10; ++i) vec[i] = i; - } - mpi::scatter_range(vec, vec_scattered, 10, world, 0); - for (int i = 0; i < sizes[rank]; ++i) EXPECT_EQ(vec_scattered[i], i + acc_sizes[rank]); -} - -TEST(MPI, RangesGatherMPIType) { - // gather a range with an MPI type - mpi::communicator world; - auto const rank = world.rank(); - auto const gathered_size = (world.size() + 1) * world.size() / 2; - std::vector vec(world.rank() + 1, 0), vec_gathered(gathered_size, 0); - std::iota(vec.begin(), vec.end(), rank * (rank + 1) / 2); - mpi::gather_range(vec, vec_gathered, gathered_size, world, 0, false); - if (rank == 0) { - for (int i = 0; i < gathered_size; ++i) EXPECT_EQ(vec_gathered[i], i); - } -} - -TEST(MPI, RangesGatherTypeWithSpecializedMPIBroadcast) { - // gather a range with a type that has a specialized mpi_broadcast - mpi::communicator world; - auto const rank = world.rank(); - auto const gathered_size = (world.size() + 1) * world.size() / 2; - std::vector vec(world.rank() + 1, non_mpi_t{}), vec_gathered(gathered_size, non_mpi_t{}); - for (int i = 0; i < vec.size(); ++i) vec[i].a = i + rank * (rank + 1) / 2; - - // providing the size of the output range - mpi::gather_range(vec, vec_gathered, gathered_size, world, 0, true); - for (int i = 0; i < gathered_size; ++i) EXPECT_EQ(vec_gathered[i].a, i); -} - -MPI_TEST_MAIN; diff --git a/test/c++/mpi_reduce.cpp b/test/c++/mpi_reduce.cpp new file mode 100644 index 00000000..67a06fc4 --- /dev/null +++ b/test/c++/mpi_reduce.cpp @@ -0,0 +1,121 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include + +// Test reducing a single value/object. +template void test_reduce(T value, T result, T def_value, MPI_Op op = MPI_SUM) { + mpi::communicator world; + + // reduce from different roots + for (int root = 0; root < world.size(); ++root) { + // reduce an object into new object + auto red_value = mpi::reduce(value, world, root, false, op); + if (world.rank() == root) { EXPECT_EQ(red_value, result); } + + // reduce an object in place + red_value = value; + mpi::reduce_in_place(red_value, world, root, false, op); + if (world.rank() == root) + EXPECT_EQ(red_value, result); + else + EXPECT_EQ(red_value, value); + + // reduce an object into an existing object + red_value = def_value; + mpi::reduce_into(value, red_value, world, root, false, op); + if (world.rank() == root) + EXPECT_EQ(red_value, result); + else + EXPECT_EQ(red_value, def_value); + } + + // allreduce an object into a new object + auto red_value = mpi::all_reduce(value, world, op); + EXPECT_EQ(red_value, result); + + // allreduce an object in place + red_value = value; + mpi::all_reduce_in_place(red_value, world, op); + EXPECT_EQ(red_value, result); + + // allreduce an object using all_reduce_into + red_value = value; + mpi::all_reduce_into(value, red_value, world, op); + EXPECT_EQ(red_value, result); + + // allreduce an object in place using all_reduce_into + red_value = value; + mpi::all_reduce_into(red_value, red_value, world, op); + EXPECT_EQ(red_value, result); 
+} + +TEST(MPI, ReduceInteger) { + mpi::communicator world; + int rank = world.rank() + 1; + int red_rank = world.size() * (world.size() + 1) / 2; + test_reduce(rank, red_rank, 0); +} + +TEST(MPI, ReduceComplex) { + mpi::communicator world; + double rank = world.rank() + 1.0; + double red_rank = world.size() * (world.size() + 1) * 0.5; + test_reduce(std::complex{rank, -rank}, std::complex{red_rank, -red_rank}, std::complex{0, 0}); +} + +TEST(MPI, ReduceCustomMPIType) { + mpi::communicator world; + int rank = world.rank() + 1; + int red_rank = world.size() * (world.size() + 1) / 2; + if (world.size() > 1) test_reduce(mpi_t{rank}, mpi_t{red_rank}, mpi_t{0}, mpi::map_add()); +} + +TEST(MPI, ReduceCustomNonMPIType) { + mpi::communicator world; + int rank = world.rank() + 1; + int red_rank = world.size() * (world.size() + 1) / 2; + test_reduce(non_mpi_t{rank}, non_mpi_t{red_rank}, non_mpi_t{0}); +} + +// Test reducing a pair. +TEST(MPI, ReducePair) { + mpi::communicator world; + + // allreduce a pair of integers + auto p1 = mpi::all_reduce(std::pair{world.rank(), -world.rank()}, world, MPI_MAX); + EXPECT_EQ(p1.first, world.size() - 1); + EXPECT_EQ(p1.second, 0); + + // reduce a pair of non_mpi_t + auto p2 = mpi::reduce(std::pair{non_mpi_t{1}, non_mpi_t{world.rank() + 1}}, world, world.size() - 1); + if (world.rank() == world.size() - 1) { + EXPECT_EQ(p2.first, non_mpi_t(world.size())); + EXPECT_EQ(p2.second, non_mpi_t(world.size() * (world.size() + 1) / 2)); + } else { + EXPECT_EQ(p2.first, non_mpi_t()); + EXPECT_EQ(p2.second, non_mpi_t()); + } +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_reduce_array.cpp b/test/c++/mpi_reduce_array.cpp new file mode 100644 index 00000000..96720a2a --- /dev/null +++ b/test/c++/mpi_reduce_array.cpp @@ -0,0 +1,127 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. 
Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include + +// Check if two ranges are equal. +void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test reducing arrays. +template void test_reduce_array(std::array const &values, std::array const &result, MPI_Op op = MPI_SUM) { + mpi::communicator world; + + // reduce from different roots + for (int root = 0; root < world.size(); ++root) { + // reduce an array into a new array + auto arr = mpi::reduce(values, world, root, false, op); + if (world.rank() == root) expect_range_eq(arr, result); + + // reduce an empty array + std::array empty_arr{}; + auto empty_red = mpi::reduce(empty_arr, world, root, false, op); + static_assert(empty_red.size() == 0); + + // reduce an array in place + arr = values; + mpi::reduce_in_place(arr, world, root, false, op); + if (world.rank() == root) + expect_range_eq(arr, result); + else + expect_range_eq(arr, values); + + // reduce an array into an existing array + arr = {}; + mpi::reduce_into(values, arr, world, root, false, op); + if (world.rank() == root) expect_range_eq(arr, result); + + // reduce an empty array into an existing array + mpi::reduce_into(empty_arr, empty_arr, world, root, false, 
op); + } + + // allreduce an array into new array + auto arr = mpi::all_reduce(values, world, op); + expect_range_eq(arr, result); + + // allreduce an array in place + arr = values; + mpi::all_reduce_in_place(arr, world, op); + expect_range_eq(arr, result); + + // allreduce an array in place using all_reduce_into + arr = values; + mpi::all_reduce_into(arr, arr, world, op); + expect_range_eq(arr, result); +} + +TEST(MPI, ReduceIntegerArray) { + mpi::communicator world; + std::array values{}, result{}; + for (int i = 0; i < 5; ++i) { + values[i] = (i + 1) * (world.rank() + 1); + result[i] = (i + 1) * world.size() * (world.size() + 1) / 2; + } + test_reduce_array(values, result); +} + +TEST(MPI, ReduceComplexArray) { + mpi::communicator world; + double rank = world.rank() + 1.0; + double red_rank = world.size() * (world.size() + 1) * 0.5; + std::array, 5> values{}, result{}; + for (int i = 0; i < 5; ++i) { + values[i] = std::complex{rank * (i + 1), -rank * (i + 1)}; + result[i] = std::complex{red_rank * (i + 1), -red_rank * (i + 1)}; + } + test_reduce_array(values, result); +} + +TEST(MPI, ReduceCustomMPITypeArray) { + mpi::communicator world; + long rank = world.rank() + 1; + long red_rank = world.size() * (world.size() + 1) / 2; + std::array values{}, result{}; + for (int i = 0; i < 5; ++i) { + values[i] = mpi_t{rank * (i + 1)}; + result[i] = mpi_t{red_rank * (i + 1)}; + } + if (world.size() > 1) { test_reduce_array(values, result, mpi::map_add()); } +} + +TEST(MPI, ReduceCustomNonMPITypeArray) { + mpi::communicator world; + int rank = world.rank() + 1; + int red_rank = world.size() * (world.size() + 1) / 2; + std::array values{}, result{}; + for (int i = 0; i < 5; ++i) { + values[i] = non_mpi_t{rank * (i + 1)}; + result[i] = non_mpi_t{red_rank * (i + 1)}; + } + test_reduce_array(values, result); +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_reduce_range.cpp b/test/c++/mpi_reduce_range.cpp new file mode 100644 index 00000000..5f8cd193 --- /dev/null +++ 
b/test/c++/mpi_reduce_range.cpp @@ -0,0 +1,154 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +// Check if two ranges are equal. +void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test reducing a range of objects. 
+template void test_reduce_range(std::array const &values, std::array const &result, MPI_Op op = MPI_SUM) { + mpi::communicator world; + + // reduce from different roots + for (int root = 0; root < world.size(); ++root) { + // reduce a span into an array + auto arr = values; + mpi::reduce_range(std::span{values.data() + 2, 3}, std::span{arr.begin(), 3}, world, root, false, op); + if (world.rank() == root) { + expect_range_eq(std::span{arr.data(), 3}, std::span{result.data() + 2, 3}); + expect_range_eq(std::span{arr.data() + 3, 2}, std::span{values.data() + 3, 2}); + } else { + expect_range_eq(arr, values); + } + + // reduce a list into a list + std::list list(values.begin(), values.end()), list_red(values.begin(), values.end()); + if (world.rank() == root) { + mpi::reduce_range(list, list_red, world, root, false, op); + expect_range_eq(list_red, result); + } else { + list_red.clear(); + mpi::reduce_range(list, list_red, world, root, false, op); + EXPECT_TRUE(list_red.empty()); + } + + // reduce a view on a list in place + list.assign(values.begin(), values.end()); + mpi::reduce_range(std::ranges::take_view(list, 2), std::ranges::take_view(list, 2), world, root, false, op); + if (world.rank() == root) { + expect_range_eq(std::ranges::take_view(list, 2), std::ranges::take_view(result, 2)); + expect_range_eq(std::ranges::drop_view(list, 2), std::ranges::drop_view(values, 2)); + } else { + expect_range_eq(list, values); + } + + // reduce a span in place + arr = values; + mpi::reduce_range(std::span{arr.data() + 2, 3}, std::span{arr.data() + 2, 3}, world, root, false, op); + if (world.rank() == root) { + expect_range_eq(std::span{arr.data() + 2, 3}, std::span{result.data() + 2, 3}); + expect_range_eq(std::span{arr.data(), 2}, std::span{values.data(), 2}); + } else { + expect_range_eq(arr, values); + } + + // reduce an array into a list + if (world.rank() == root) { + list = std::list(5); + mpi::reduce_range(values, list, world, root, false, op); + expect_range_eq(list, 
result); + } else { + list.clear(); + mpi::reduce_range(values, list, world, root, false, op); + EXPECT_TRUE(list.empty()); + } + } + + // allreduce a list in place using reduce_range + std::list list(values.begin(), values.end()); + mpi::reduce_range(list, list, world, 0, true, op); + expect_range_eq(list, result); + + // allreduce a span in place + auto arr = values; + mpi::reduce_range(std::span{arr.data() + 1, 3}, std::span{arr.data() + 1, 3}, world, 0, true, op); + expect_range_eq(std::span{arr.data() + 1, 3}, std::span{result.data() + 1, 3}); + EXPECT_EQ(arr[0], values[0]); + EXPECT_EQ(arr[4], values[4]); +} + +TEST(MPI, ReduceIntegerRange) { + mpi::communicator world; + std::array values{}, result{}; + for (int i = 0; i < 5; ++i) { + values[i] = (i + 1) * (world.rank() + 1); + result[i] = (i + 1) * world.size() * (world.size() + 1) / 2; + } + test_reduce_range(values, result); +} + +TEST(MPI, ReduceComplexRange) { + mpi::communicator world; + double rank = world.rank() + 1.0; + double red_rank = world.size() * (world.size() + 1) * 0.5; + std::array, 5> values{}, result{}; + for (int i = 0; i < 5; ++i) { + values[i] = std::complex{rank * (i + 1), -rank * (i + 1)}; + result[i] = std::complex{red_rank * (i + 1), -red_rank * (i + 1)}; + } + test_reduce_range(values, result); +} + +TEST(MPI, ReduceCustomMPITypeRange) { + mpi::communicator world; + long rank = world.rank() + 1; + long red_rank = world.size() * (world.size() + 1) / 2; + std::array values{}, result{}; + for (int i = 0; i < 5; ++i) { + values[i] = mpi_t{rank * (i + 1)}; + result[i] = mpi_t{red_rank * (i + 1)}; + } + if (world.size() > 1) { test_reduce_range(values, result, mpi::map_add()); } +} + +TEST(MPI, ReduceCustomNonMPITypeRange) { + mpi::communicator world; + int rank = world.rank() + 1; + int red_rank = world.size() * (world.size() + 1) / 2; + std::array values{}, result{}; + for (int i = 0; i < 5; ++i) { + values[i] = non_mpi_t{rank * (i + 1)}; + result[i] = non_mpi_t{red_rank * (i + 1)}; + 
} + test_reduce_range(values, result); +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_reduce_vector.cpp b/test/c++/mpi_reduce_vector.cpp new file mode 100644 index 00000000..29142bcb --- /dev/null +++ b/test/c++/mpi_reduce_vector.cpp @@ -0,0 +1,138 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include + +// Check if two ranges are equal. +void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test reducing a vector. 
+template void test_reduce_vector(std::vector const &values, std::vector const &result, MPI_Op op = MPI_SUM) { + mpi::communicator world; + + // reduce from different roots + for (int root = 0; root < world.size(); ++root) { + // reduce a vector into a new vector + auto vec = mpi::reduce(values, world, root, false, op); + if (world.rank() == root) expect_range_eq(vec, result); + + // reduce an empty vector + auto empty_vec = mpi::reduce(std::vector{}, world, root, false, op); + EXPECT_EQ(empty_vec.size(), 0); + + // reduce a vector in place + vec = values; + mpi::reduce_in_place(vec, world, root, false, op); + if (world.rank() == root) + expect_range_eq(vec, result); + else + expect_range_eq(vec, values); + + // reduce an empty vector in place + mpi::reduce_in_place(empty_vec, world, root, false, op); + EXPECT_EQ(empty_vec.size(), 0); + + // reduce a vector into an existing empty vector + vec.clear(); + mpi::reduce_into(values, vec, world, root, false, op); + if (world.rank() == root) + expect_range_eq(vec, result); + else + EXPECT_TRUE(vec.empty()); + + // reduce an empty vector into an existing vector + vec = values; + mpi::reduce_into(empty_vec, vec, world, root, false, op); + if (world.rank() == root) + EXPECT_EQ(vec.size(), 0); + else + expect_range_eq(vec, values); + } + + // allreduce a vector into a new vector + auto vec = mpi::all_reduce(values, world, op); + expect_range_eq(vec, result); + + // allreduce a vector in place + vec = values; + mpi::all_reduce_in_place(vec, world, op); + expect_range_eq(vec, result); + + // allreduce a vector in place using all_reduce_into + vec = values; + mpi::all_reduce_into(vec, vec, world, op); + expect_range_eq(vec, result); +} + +TEST(MPI, ReduceIntegerVector) { + mpi::communicator world; + std::vector values(5), result(5); + for (int i = 0; i < 5; ++i) { + values[i] = (i + 1) * (world.rank() + 1); + result[i] = (i + 1) * world.size() * (world.size() + 1) / 2; + } + test_reduce_vector(values, result); +} + +TEST(MPI, 
ReduceComplexVector) { + mpi::communicator world; + double rank = world.rank() + 1.0; + double red_rank = world.size() * (world.size() + 1) * 0.5; + std::vector> values(5), result(5); + for (int i = 0; i < 5; ++i) { + values[i] = std::complex{rank * (i + 1), -rank * (i + 1)}; + result[i] = std::complex{red_rank * (i + 1), -red_rank * (i + 1)}; + } + test_reduce_vector(values, result); +} + +TEST(MPI, ReduceCustomMPITypeVector) { + mpi::communicator world; + long rank = world.rank() + 1; + long red_rank = world.size() * (world.size() + 1) / 2; + std::vector values(5), result(5); + for (int i = 0; i < 5; ++i) { + values[i] = mpi_t{rank * (i + 1)}; + result[i] = mpi_t{red_rank * (i + 1)}; + } + if (world.size() > 1) { test_reduce_vector(values, result, mpi::map_add()); } +} + +TEST(MPI, ReduceCustomNonMPITypeVector) { + mpi::communicator world; + int rank = world.rank() + 1; + int red_rank = world.size() * (world.size() + 1) / 2; + std::vector values(5), result(5); + for (int i = 0; i < 5; ++i) { + values[i] = non_mpi_t{rank * (i + 1)}; + result[i] = non_mpi_t{red_rank * (i + 1)}; + } + test_reduce_vector(values, result); +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_scatter_range.cpp b/test/c++/mpi_scatter_range.cpp new file mode 100644 index 00000000..a81e3a35 --- /dev/null +++ b/test/c++/mpi_scatter_range.cpp @@ -0,0 +1,114 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +// Check if two ranges are equal. +void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test scattering a vector. +template void test_scatter_range(std::vector const &values, long chunk_size) { + mpi::communicator world; + const int rank = world.rank(); + auto sizes = std::vector(world.size()); + for (int i = 0; i < world.size(); ++i) sizes[i] = static_cast(mpi::chunk_length(values.size(), world.size(), i, chunk_size)); + auto acc_sizes = std::vector(world.size() + 1, 0); + std::partial_sum(sizes.begin(), sizes.end(), std::next(acc_sizes.begin())); + EXPECT_EQ(acc_sizes.back(), values.size()); + + // scatter from different roots + for (int root = 0; root < world.size(); ++root) { + // scatter a vector into a span + auto vec = std::vector(sizes[rank], T{0}); + mpi::scatter_range(values, std::span(vec.begin(), sizes[rank]), values.size(), world, root, chunk_size); + expect_range_eq(vec, std::span(values.begin() + acc_sizes[rank], sizes[rank])); + + // scatter with chunk size = number of elements to be scattered + vec = std::vector((rank == 0 ? 
values.size() : 0), T{0}); + mpi::scatter_range(values, vec, values.size(), world, root, values.size()); + if (world.rank() == 0) + expect_range_eq(vec, values); + else + EXPECT_TRUE(vec.empty()); + } +} + +TEST(MPI, ScatterIntegerRange) { + mpi::communicator world; + const long min_nchunks = 3; + const long chunk_size = 4; + for (int i = 0; i < world.size(); ++i) { + // chunk size = 1 + std::vector values(min_nchunks * world.size() + i); + std::iota(values.begin(), values.end(), 0); + test_scatter_range(values, 1); + + // chunk size = 4 + values.resize((min_nchunks * world.size() + i) * chunk_size); + std::iota(values.begin(), values.end(), 0); + test_scatter_range(values, chunk_size); + } +} + +TEST(MPI, ScatterComplexRange) { + mpi::communicator world; + const long min_nchunks = 3; + const long chunk_size = 4; + for (int i = 0; i < world.size(); ++i) { + // chunk size = 1 + std::vector> values(min_nchunks * world.size() + i); + for (int j = 0; j < values.size(); ++j) values[j] = std::complex(j, -j); + test_scatter_range(values, 1); + + // chunk size = 4 + values.resize((min_nchunks * world.size() + i) * chunk_size); + for (int j = 0; j < values.size(); ++j) values[j] = std::complex(j, -j); + test_scatter_range(values, chunk_size); + } +} + +TEST(MPI, ScatterCustomMPITypeRange) { + mpi::communicator world; + const long min_nchunks = 3; + const long chunk_size = 4; + for (int i = 0; i < world.size(); ++i) { + // chunk size = 1 + std::vector values(min_nchunks * world.size() + i); + for (int j = 0; j < values.size(); ++j) values[j].a = j; + test_scatter_range(values, 1); + + // chunk size = 4 + values.resize((min_nchunks * world.size() + i) * chunk_size); + for (int j = 0; j < values.size(); ++j) values[j].a = j; + test_scatter_range(values, chunk_size); + } +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_scatter_vector.cpp b/test/c++/mpi_scatter_vector.cpp new file mode 100644 index 00000000..e019080c --- /dev/null +++ b/test/c++/mpi_scatter_vector.cpp @@ -0,0 
+1,90 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +// Check if two ranges are equal. +void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test scattering a vector. +template void test_scatter_vector(std::vector const &values) { + mpi::communicator world; + auto recvcounts = std::vector(world.size()); + for (int i = 0; i < world.size(); ++i) recvcounts[i] = static_cast(mpi::chunk_length(values.size(), world.size(), i)); + auto displs = std::vector(world.size() + 1, 0); + std::partial_sum(recvcounts.begin(), recvcounts.end(), std::next(displs.begin())); + auto const recvcount = recvcounts[world.rank()]; + auto const displ = displs[world.rank()]; + + // scatter from different roots + for (int root = 0; root < world.size(); ++root) { + // scatter a vector into a new vector + auto vec = mpi::scatter(world.rank() == root ? 
values : std::vector{}, world, root); + expect_range_eq(vec, std::span(values.begin() + displ, recvcount)); + + // scatter a vector into an existing vector + vec.clear(); + mpi::scatter_into(values, vec, world, root); + expect_range_eq(vec, std::span(values.begin() + displ, recvcount)); + } + + // scatter an empty vector + auto vec = mpi::scatter(std::vector{}, world); + EXPECT_TRUE(vec.empty()); +} + +TEST(MPI, ScatterIntegerVector) { + mpi::communicator world; + for (int total_size = 3 * world.size(); total_size < 4 * world.size(); ++total_size) { + std::vector values(total_size); + std::iota(values.begin(), values.end(), 0); + test_scatter_vector(values); + } +} + +TEST(MPI, ScatterComplexVector) { + mpi::communicator world; + for (int total_size = 3 * world.size(); total_size < 4 * world.size(); ++total_size) { + std::vector> values(total_size); + for (int i = 0; i < total_size; ++i) values[i] = std::complex(i, -i); + test_scatter_vector(values); + } +} + +TEST(MPI, ScatterCustomMPITypeVector) { + mpi::communicator world; + for (int total_size = 3 * world.size(); total_size < 4 * world.size(); ++total_size) { + std::vector values(total_size); + for (int i = 0; i < total_size; ++i) values[i].a = i; + test_scatter_vector(values); + } +} + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_string.cpp b/test/c++/mpi_string.cpp deleted file mode 100644 index 9c69dda1..00000000 --- a/test/c++/mpi_string.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2020-2024 Simons Foundation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0.txt -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// -// Authors: Thomas Hahn, Nils Wentzell - -#include -#include - -#include - -TEST(MPI, StringBroadcast) { - // broadcast a string - mpi::communicator world; - - std::string s; - if (world.rank() == 0) s = "Hello World"; - - mpi::broadcast(s); - - EXPECT_EQ(s, std::string{"Hello World"}); -} - -TEST(MPI, StringGather) { - // gather a string - mpi::communicator world; - std::string s{}, exp_s{}; - for (int i = 0; i < world.size(); ++i) { - for (int j = 0; j < i + 1; ++j) exp_s += "a"; - exp_s += std::to_string(i); - } - for (int i = 0; i < world.rank() + 1; ++i) s += "a"; - s += std::to_string(world.rank()); - - // gather only on root - auto s_gathered = mpi::gather(s); - if (world.rank() == 0) EXPECT_EQ(s_gathered, exp_s); - else EXPECT_TRUE(s_gathered.empty()); - - // gather on all processes - auto s_gathered_all = mpi::all_gather(s); - EXPECT_EQ(s_gathered_all, exp_s); -} - -MPI_TEST_MAIN; diff --git a/test/c++/mpi_vector.cpp b/test/c++/mpi_vector.cpp deleted file mode 100644 index 3630c68e..00000000 --- a/test/c++/mpi_vector.cpp +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2020-2024 Simons Foundation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0.txt -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -// Authors: Thomas Hahn, Nils Wentzell - -#include "./non_mpi_t.hpp" - -#include -#include -#include - -#include -#include -#include -#include -#include - -TEST(MPI, VectorBroadcastMPIType) { - // broadcast a vector with an MPI type - mpi::communicator world; - std::vector vec(5, 0); - if (world.rank() == 0) { - std::iota(vec.begin(), vec.end(), 0); - } else { - vec.clear(); - } - mpi::broadcast(vec, world); - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec[i], i); -} - -TEST(MPI, VectorBroadcastTypeWithSpezializedMPIBroadcast) { - // broadcast a vector with a type that has a specialized mpi_broadcast - mpi::communicator world; - std::vector vec(5); - if (world.rank() == 0) { - for (int i = 0; i < 5; ++i) vec[i].a = i; - } - mpi::broadcast(vec, world); - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec[i].a, i); -} - -TEST(MPI, VectorReduceInPlaceMPIType) { - // in-place reduce a vector with an MPI type - mpi::communicator world; - std::vector vec{0, 1, 2, 3, 4}; - mpi::reduce_in_place(vec, world); - if (world.rank() == 0) - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec[i], i * world.size()); - else - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec[i], i); - - // in-place allreduce a vector with an MPI type - std::iota(vec.begin(), vec.end(), 0); - mpi::all_reduce_in_place(vec, world); - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec[i], i * world.size()); -} - -TEST(MPI, VectorReduceInPlaceTypeWithSpezializedMPIReduceInPlace) { - // in-place reduce a vector with a type that has a specialized mpi_reduce_in_place - mpi::communicator world; - std::vector vec(5); - for (int i = 0; i < 5; ++i) vec[i].a = i; - mpi::reduce_in_place(vec, world); - if (world.rank() == 0) - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec[i].a, i * world.size()); - else - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec[i].a, i); - - // in-place allreduce a vector with a type that has a specialized mpi_reduce_in_place - for (int i = 0; i < 5; ++i) vec[i].a = i; - mpi::all_reduce_in_place(vec, world); - for (int i = 0; i < 5; 
++i) EXPECT_EQ(vec[i].a, i * world.size()); -} - -TEST(MPI, VectorReduceMPIType) { - // reduce a vector with complex numbers - mpi::communicator world; - using vec_type = std::vector>; - const int size = 7; - vec_type vec(size); - for (int i = 0; i < size; ++i) vec[i] = std::complex(i, -i); - auto vec_reduced = mpi::reduce(vec, world); - if (world.rank() == 0) - for (int i = 0; i < size; ++i) EXPECT_EQ(vec_reduced[i], std::complex(i * world.size(), -i * world.size())); - else - EXPECT_TRUE(vec_reduced.empty()); - - // allreduce a vector with complex numbers - vec_reduced = mpi::all_reduce(vec, world); - for (int i = 0; i < size; ++i) EXPECT_EQ(vec_reduced[i], std::complex(i * world.size(), -i * world.size())); -} - -TEST(MPI, VectorReduceTypeWithSpezializedMPIReduce) { - // reduce a vector with a type that has a specialized mpi_reduce - mpi::communicator world; - std::vector vec(5); - for (int i = 0; i < 5; ++i) vec[i].a = i; - auto vec_reduced = mpi::reduce(vec, world); - if (world.rank() == 0) - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec_reduced[i].a, i * world.size()); - else - EXPECT_TRUE(vec_reduced.empty()); - - // allreduce a vector with a type that has a specialized mpi_reduce - for (int i = 0; i < 5; ++i) vec[i].a = i; - auto vec_reduced_all = mpi::all_reduce(vec, world); - for (int i = 0; i < 5; ++i) EXPECT_EQ(vec_reduced_all[i].a, i * world.size()); -} - -TEST(MPI, EmptyVectorReduce) { - // reduce an empty vector - mpi::communicator world; - std::vector v1{}; - std::vector v2 = mpi::reduce(v1, world); -} - -TEST(MPI, VectorGatherScatter) { - // scatter and gather a vector of complex numbers - mpi::communicator world; - std::vector> vec(7), scattered_vec(7), gathered_vec(7, {0.0, 0.0}); - for (auto [i, v_i] : itertools::enumerate(vec)) v_i = static_cast(i) + 1.0; - - scattered_vec = mpi::scatter(vec, world); - auto tmp = mpi::scatter(vec, world); - - for (auto &x : scattered_vec) x *= -1; - for (auto &x : vec) x *= -1; - - gathered_vec = 
mpi::all_gather(scattered_vec, world); - - EXPECT_EQ(vec, gathered_vec); -} - -TEST(MPI, VectorGatherPair) { - // gather a vector of pairs - mpi::communicator world; - auto const rank = world.rank(); - auto const gathered_size = (world.size() + 1) * world.size() / 2; - std::vector> vec(world.rank() + 1); - for (int i = 0; i < vec.size(); ++i) { - vec[i].first = i + rank * (rank + 1) / 2; - vec[i].second = std::to_string(vec[i].first); - } - auto vec_gathered = mpi::all_gather(vec, world); - for (int i = 0; i < gathered_size; ++i) EXPECT_EQ(vec_gathered[i], std::make_pair(i, std::to_string(i))); -} - -TEST(MPI, VectorGatherOnlyOnRoot) { - // gather a vector only on root - mpi::communicator world; - std::vector v = {1, 2, 3}; - auto res = mpi::gather(v, world); - if (world.rank() == 0) { - auto exp_res = v; - for (int i = 1; i < world.size(); ++i) exp_res.insert(exp_res.end(), v.begin(), v.end()); - EXPECT_EQ(res, exp_res); - } else { - EXPECT_TRUE(res.empty()); - } -} - -TEST(MPI, VectorScatterSizeZero) { - // pass a vector of size 0 to scatter - mpi::communicator world; - std::vector v = {1, 2, 3}; - if (world.rank() == 0) v.clear(); - auto res = mpi::scatter(v, world); - EXPECT_TRUE(res.empty()); -} - -MPI_TEST_MAIN; diff --git a/test/c++/non_mpi_t.hpp b/test/c++/non_mpi_t.hpp deleted file mode 100644 index 79821cfe..00000000 --- a/test/c++/non_mpi_t.hpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2022-2024 Simons Foundation -// Copyright (c) 2022 Hugo U.R. Strand -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0.txt -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// -// Authors: Thomas Hahn, Hugo U.R. Strand - -#pragma once - -#include -#include - -struct non_mpi_t { - int a{1}; -}; - -// needs to be in the mpi namespace for ADL to work -namespace mpi { - - // specialize mpi_broadcast for foo - void mpi_broadcast(non_mpi_t &f, mpi::communicator c = {}, int root = 0) { broadcast(f.a, c, root); } - - // specialize mpi_reduce_in_place for foo - void mpi_reduce_in_place(non_mpi_t &f, mpi::communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - if (all) { - all_reduce_in_place(f.a, c, op); - } else { - reduce_in_place(f.a, c, root, false, op); - } - } - - // specialize mpi_reduce for foo - non_mpi_t mpi_reduce(non_mpi_t const &f, mpi::communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - non_mpi_t res{}; - if (all) { - res.a = all_reduce(f.a, c, op); - } else { - res.a = reduce(f.a, c, root, false, op); - } - return (c.rank() == root || all ? res : non_mpi_t{}); - } - -} // namespace mpi