flatironinstitute
diff --git a/‎CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/common/common.h‎
Lines changed: 5 additions & 0 deletions b/‎include/common/common.h‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎include/common/constants.h‎
Lines changed: 22 additions & 0 deletions b/‎include/common/constants.h‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎include/common/defines.h‎
Lines changed: 58 additions & 0 deletions b/‎include/common/defines.h‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎include/common/utils.h‎
Lines changed: 149 additions & 0 deletions b/‎include/common/utils.h‎
Lines changed: 149 additions & 0 deletions
diff --git a/‎include/cufinufft/defs.h‎
Lines changed: 1 addition & 11 deletions b/‎include/cufinufft/defs.h‎
Lines changed: 1 addition & 11 deletions
diff --git a/‎include/cufinufft/impl.h‎
Lines changed: 1 addition & 0 deletions b/‎include/cufinufft/impl.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/cufinufft/utils.h‎
Lines changed: 18 additions & 36 deletions b/‎include/cufinufft/utils.h‎
Lines changed: 18 additions & 36 deletions
@@ -262,6 +262,7 @@ if(FINUFFT_USE_CPU)
         src/finufft_core.cpp
         src/c_interface.cpp
         src/finufft_utils.cpp
+        src/utils.cpp
     )
 
     if(FINUFFT_BUILD_FORTRAN)
 
@@ -0,0 +1,5 @@
+#pragma once
+
+#include <common/constants.h>
+#include <common/defines.h>
+#include <common/utils.h>
@@ -0,0 +1,22 @@
+#pragma once
+
+namespace finufft {
+namespace common {
+
+// constants needed within common
+// bounds on w, ie nspread; the upper bound holds even when padded (see
+// evaluate_kernel_vector); also for common
+inline constexpr int MIN_NSPREAD = 2;
+inline constexpr int MAX_NSPREAD = 16;
+// max number of positive quadr nodes
+inline constexpr int MAX_NQUAD = 100;
+// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3
+inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1;
+// How many ways there are to evaluate the kernel; it should match the available
+// options in finufft_opts
+inline constexpr int KEREVAL_METHODS = 2;
+inline constexpr double PI           = 3.141592653589793238462643383279502884;
+// 1 / (2 * PI)
+inline constexpr double INV_2PI = 0.159154943091895335768883763372514362;
+} // namespace common
+} // namespace finufft
@@ -0,0 +1,58 @@
+#pragma once
+
+/* IMPORTANT: for Windows compilers, you should add a line
+        #define FINUFFT_DLL
+   here if you are compiling/using FINUFFT as a DLL,
+   in order to do the proper importing/exporting, or
+   alternatively compile with -DFINUFFT_DLL or the equivalent
+   command-line flag.  This is not necessary under MinGW/Cygwin, where
+   libtool does the imports/exports automatically.
+   Alternatively use include(GenerateExportHeader) and
+   generate_export_header(finufft) to auto generate a header containing
+   these defines. The main reason is that if msvc changes the way it deals
+   with it in the future we just need to update cmake for it to work
+   instead of having a check on the msvc version. */
+#if defined(FINUFFT_DLL) && (defined(_WIN32) || defined(__WIN32__))
+#if defined(dll_EXPORTS)
+#define FINUFFT_EXPORT __declspec(dllexport)
+#else
+#define FINUFFT_EXPORT __declspec(dllimport)
+#endif
+#else
+#define FINUFFT_EXPORT
+#endif
+
+/* specify calling convention (Windows only)
+   The cdecl calling convention is actually the default in all but a very
+   few C/C++ compilers.
+   If the user code changes the default compiler calling convention, may need
+   this when generating DLL. */
+#if defined(_WIN32) || defined(__WIN32__)
+#define FINUFFT_CDECL __cdecl
+#else
+#define FINUFFT_CDECL
+#endif
+
+// common function attributes
+#if defined(_MSC_VER)
+#define FINUFFT_ALWAYS_INLINE __forceinline
+#define FINUFFT_NEVER_INLINE  __declspec(noinline)
+#define FINUFFT_RESTRICT      __restrict
+#define FINUFFT_UNREACHABLE   __assume(0)
+#define FINUFFT_UNLIKELY(x)   (x)
+#define FINUFFT_LIKELY(x)     (x)
+#elif defined(__GNUC__) || defined(__clang__)
+#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline
+#define FINUFFT_NEVER_INLINE  __attribute__((noinline))
+#define FINUFFT_RESTRICT      __restrict__
+#define FINUFFT_UNREACHABLE   __builtin_unreachable()
+#define FINUFFT_UNLIKELY(x)   __builtin_expect(!!(x), 0)
+#define FINUFFT_LIKELY(x)     __builtin_expect(!!(x), 1)
+#else
+#define FINUFFT_ALWAYS_INLINE inline
+#define FINUFFT_NEVER_INLINE
+#define FINUFFT_RESTRICT
+#define FINUFFT_UNREACHABLE
+#define FINUFFT_UNLIKELY(x) (x)
+#define FINUFFT_LIKELY(x)   (x)
+#endif
@@ -0,0 +1,149 @@
+#pragma once
+
+#include <array>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+#include "defines.h"
+
+namespace finufft {
+namespace common {
+
+FINUFFT_EXPORT void FINUFFT_CDECL gaussquad(int n, double *xgl, double *wgl);
+std::tuple<double, double> leg_eval(int n, double x);
+
+// helpers to build std::integer_sequence<int, Start, ..., End> (both ends inclusive)
+template<int Offset, typename Seq> struct offset_seq; // primary; specialized below
+
+template<int Offset, int... I>
+struct offset_seq<Offset, std::integer_sequence<int, I...>> {
+  using type = std::integer_sequence<int, (Offset + I)...>; // adds Offset to each I
+};
+
+template<int Start, int End>
+using make_range = // shifts the sequence 0..(End - Start) by Start
+    typename offset_seq<Start, std::make_integer_sequence<int, End - Start + 1>>::type;
+
+template<typename Seq> struct DispatchParam { // one runtime-dispatched parameter
+  int runtime_val;      // runtime value compared against the candidates in Seq
+  using seq_type = Seq; // compile-time candidate values
+};
+
+// Cartesian product over integer sequences.
+// Invokes f.template operator()<...>() for each combination of values.
+// The functor F must provide a templated call operator.
+// Adapted upon suggestion from Nils Wentzell: godbolt.org/z/GM94xb1j4
+//
+namespace detail {
+
+template<typename F, typename... Seq> struct Product; // primary; specialized below
+
+// Recursive case: at least two sequences remaining; peels off the first one
+template<typename F, int... I1, typename Seq2, typename... Rest>
+struct Product<F, std::integer_sequence<int, I1...>, Seq2, Rest...> {
+  template<int... Prefix> static void apply(F &f) {
+    // fold over I1: each value is appended to Prefix before recursing on the rest
+    (Product<F, Seq2, Rest...>::template apply<Prefix..., I1>(f), ...);
+  }
+};
+
+// Base case: single sequence left; f is invoked once per value of that sequence
+template<typename F, int... I1> struct Product<F, std::integer_sequence<int, I1...>> {
+  template<int... Prefix> static void apply(F &f) {
+    (f.template operator()<Prefix..., I1>(), ...); // full combination: Prefix + I1
+  }
+};
+
+template<typename F, typename... Seq> void product(F &f, Seq...) { // only types used
+  Product<F, Seq...>::template apply<>(f); // start from an empty prefix
+}
+
+// Helper functor invoked for each combination to check runtime values
+template<typename Func, std::size_t N, typename ArgTuple, typename ResultType>
+struct DispatcherCaller {
+  Func &func;                     // functor exposing a template<int...> call operator
+  const std::array<int, N> &vals; // runtime values to be matched
+  ArgTuple &args;                 // arguments handed to func when a match occurs
+  // char stands in for void; value-initialized and left untouched if nothing matches
+  std::conditional_t<std::is_void_v<ResultType>, char, ResultType> result{};
+  template<int... Params> void operator()() { // called for every candidate combination
+    static constexpr std::array<int, sizeof...(Params)> p{Params...};
+    if (p == vals) { // only the combination equal to the runtime values calls func
+      if constexpr (std::is_void_v<ResultType>) { // void result: nothing to store
+        std::apply(
+            [&](auto &&...a) {
+              func.template operator()<Params...>(std::forward<decltype(a)>(a)...);
+            },
+            args);
+      } else {
+        result = std::apply( // keep the functor's return value for dispatch()
+            [&](auto &&...a) {
+              return func.template operator()<Params...>(std::forward<decltype(a)>(a)...);
+            },
+            args);
+      }
+    }
+  }
+};
+
+template<typename Seq> struct seq_first; // ::value is the first element of Seq
+template<int I0, int... I>
+struct seq_first<std::integer_sequence<int, I0, I...>> : std::integral_constant<int, I0> {
+}; // only non-empty sequences are covered by this specialization
+
+template<typename Tuple, std::size_t... I>
+auto extract_vals_impl(const Tuple &t, std::index_sequence<I...>) {
+  // collect the runtime_val member of every tuple element, in tuple order
+  return std::array<int, sizeof...(I)>{std::get<I>(t).runtime_val...};
+}
+template<typename Tuple> auto extract_vals(const Tuple &t) { // std::array of runtime values
+  using T = std::remove_reference_t<Tuple>;
+  return extract_vals_impl(t, std::make_index_sequence<std::tuple_size_v<T>>{});
+}
+
+template<typename Tuple, std::size_t... I>
+auto extract_seqs_impl(const Tuple &t, std::index_sequence<I...>) {
+  using T = std::remove_reference_t<Tuple>;
+  // only the element types matter here: builds one seq_type object per element
+  return std::make_tuple(typename std::tuple_element_t<I, T>::seq_type{}...);
+}
+template<typename Tuple> auto extract_seqs(const Tuple &t) { // std::tuple of the seq_types
+  using T = std::remove_reference_t<Tuple>;
+  return extract_seqs_impl(t, std::make_index_sequence<std::tuple_size_v<T>>{});
+}
+
+template<typename Func, typename ArgTuple, typename... Seq>
+struct dispatch_result_helper { // return type of func for the first value of each Seq
+  template<std::size_t... I>
+  static auto test(std::index_sequence<I...>) // declaration only, used inside decltype
+      -> decltype(std::declval<Func>().template operator()<seq_first<Seq>::value...>(
+          std::get<I>(std::declval<ArgTuple>())...));
+  using type = decltype(test(std::make_index_sequence<std::tuple_size_v<ArgTuple>>{}));
+};
+template<typename Func, typename ArgTuple, typename SeqTuple> struct dispatch_result;
+template<typename Func, typename ArgTuple, typename... Seq>
+struct dispatch_result<Func, ArgTuple, std::tuple<Seq...>> { // unpacks the tuple of Seqs
+  using type = typename dispatch_result_helper<Func, ArgTuple, Seq...>::type;
+};
+template<typename Func, typename ArgTuple, typename SeqTuple>
+using dispatch_result_t = typename dispatch_result<Func, ArgTuple, SeqTuple>::type;
+
+} // namespace detail
+
+// Generic dispatcher mapping runtime ints to template parameters.
+// params is a tuple of DispatchParam holding runtime values and sequences.
+// When a match is found, the functor is invoked with those template parameters
+// and its result returned. Otherwise, the value-initialized result is returned.
+template<typename Func, typename ParamTuple, typename... Args>
+decltype(auto) dispatch(Func &&func, ParamTuple &&params, Args &&...args) {
+  using tuple_t           = std::remove_reference_t<ParamTuple>;
+  constexpr std::size_t N = std::tuple_size_v<tuple_t>; // number of dispatched params
+  auto vals               = detail::extract_vals(params); // runtime values
+  auto seqs               = detail::extract_seqs(params); // candidate sequences
+  auto arg_tuple          = std::forward_as_tuple(std::forward<Args>(args)...);
+  using result_t = detail::dispatch_result_t<Func, decltype(arg_tuple), decltype(seqs)>;
+  detail::DispatcherCaller<Func, N, decltype(arg_tuple), result_t> caller{func, vals,
+                                                                          arg_tuple};
+  // every combination is visited; the caller itself checks for the matching one
+  std::apply([&](auto &&...s) { detail::product(caller, s...); }, seqs);
+  if constexpr (!std::is_void_v<result_t>) return caller.result; // void: no return value
+}
+
+} // namespace common
+} // namespace finufft
@@ -1,18 +1,8 @@
 #ifndef CUFINUFFT_DEFS_H
 #define CUFINUFFT_DEFS_H
 
+#include <common/common.h>
 #include <limits>
-// constants needed within common
-// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for
-// common
-#define MAX_NSPREAD          16
-#define MIN_NSPREAD          2
-
-// max number of positive quadr nodes
-#define MAX_NQUAD            100
-
-// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3
-#define ARRAYWIDCEN_GROWFRAC 0.1
 
 // FIXME: If cufft ever takes N > INT_MAX...
 constexpr int32_t MAX_NF = std::numeric_limits<int32_t>::max();
 
@@ -72,6 +72,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
       Marco Barbone 07/26/24. Using SM when shared memory available is enough.
   */
   using namespace cufinufft::common;
+  using namespace finufft::common;
   int ier;
   if (type < 1 || type > 3) {
     fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__, type);
 
@@ -4,25 +4,21 @@
 // octave (mkoctfile) needs this otherwise it doesn't know what int64_t is!
 #include <complex>
 
-#include <cuComplex.h>
 #include <cufinufft/types.h>
 
 #include <cuda_runtime.h>
 #include <thrust/extrema.h>
+#include <tuple>
 #include <type_traits>
 #include <utility> // for std::forward
 
-#include <finufft_errors.h>
+#include <common/common.h>
 
 #ifndef _USE_MATH_DEFINES
 #define _USE_MATH_DEFINES
 #endif
 #include <cmath>
 
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
 #else
 __inline__ __device__ double atomicAdd(double *address, double val) {
@@ -72,6 +68,8 @@ template<typename T> __forceinline__ __device__ auto interval(const int ns, cons
 namespace cufinufft {
 namespace utils {
 
+using namespace finufft::common;
+
 class WithCudaDevice {
 public:
   explicit WithCudaDevice(const int device) : orig_device_{get_orig_device()} {
@@ -90,10 +88,8 @@ class WithCudaDevice {
   }
 };
 
-// math helpers whose source is in src/cuda/utils.cpp
-CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b);
-void gaussquad(int n, double *xgl, double *wgl);
-std::tuple<double, double> leg_eval(int n, double x);
+// math helpers whose source is in src/utils.cpp
+long next235beven(long n, long b);
 
 template<typename T> T infnorm(int n, std::complex<T> *a) {
   T nrm = 0.0;
@@ -124,8 +120,8 @@ static __forceinline__ __device__ void atomicAddComplexShared(
  * on shared memory are supported so we leverage them
  */
 template<typename T>
-static __forceinline__ __device__ void atomicAddComplexGlobal(
-    cuda_complex<T> *address, cuda_complex<T> res) {
+static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex<T> *address,
+                                                              cuda_complex<T> res) {
   if constexpr (
       std::is_same_v<cuda_complex<T>, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) {
     atomicAdd(address, res);
@@ -150,7 +146,7 @@ template<typename T> auto arrayrange(int n, T *a, cudaStream_t stream) {
 
 // Writes out w = half-width and c = center of an interval enclosing all a[n]'s
 // Only chooses a nonzero center if this increases w by less than fraction
-// ARRAYWIDCEN_GROWFRAC defined in defs.h.
+// ARRAYWIDCEN_GROWFRAC defined in common/constants.h.
 // This prevents rephasings which don't grow nf by much. 6/8/17
 // If n==0, w and c are not finite.
 template<typename T> auto arraywidcen(int n, T *a, cudaStream_t stream) {
@@ -180,41 +176,27 @@ auto set_nhg_type3(T S, T X, const cufinufft_opts &opts,
   else
     Ssafe = std::max(Ssafe, T(1) / X);
   // use the safe X and S...
-  T nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / M_PI + nss;
+  T nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss;
   if (!std::isfinite(nfd)) nfd = 0.0; // use FLT to catch inf
   auto nf = (int)nfd;
   // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread);
   //  catch too small nf, and nan or +-inf, otherwise spread fails...
   if (nf < 2 * spopts.nspread) nf = 2 * spopts.nspread;
-  if (nf < MAX_NF)                   // otherwise will fail anyway
-    nf = utils::next235beven(nf, 1); // expensive at huge nf
+  if (nf < MAX_NF)            // otherwise will fail anyway
+    nf = next235beven(nf, 1); // expensive at huge nf
   // Note: b is 1 because type 3 uses a type 2 plan, so it should not need the extra
   // condition that seems to be used by Block Gather as type 2 are only GM-sort
-  auto h   = 2 * T(M_PI) / nf;                       // upsampled grid spacing
+  auto h   = 2 * T(PI) / nf;                         // upsampled grid spacing
   auto gam = T(nf) / (2.0 * opts.upsampfac * Ssafe); // x scale fac to x'
   return std::make_tuple(nf, h, gam);
 }
 
-// Generalized dispatcher for any function requiring ns-based dispatch
-template<typename Func, typename T, int ns, typename... Args>
-int dispatch_ns(Func &&func, int target_ns, Args &&...args) {
-  if constexpr (ns > MAX_NSPREAD) {
-    return FINUFFT_ERR_METHOD_NOTVALID; // Stop recursion
-  } else {
-    if (target_ns == ns) {
-      return std::forward<Func>(func).template operator()<ns>(
-          std::forward<Args>(args)...);
-    }
-    return dispatch_ns<Func, T, ns + 1>(std::forward<Func>(func), target_ns,
-                                        std::forward<Args>(args)...);
-  }
-}
-
-// Wrapper function that starts the dispatch recursion
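+// Wrapper around the generic dispatcher for nspread-based dispatch.
+// NOTE: if target_ns is outside [MIN_NSPREAD, MAX_NSPREAD] the functor is never
+// invoked and a value-initialized result is returned (0 for int), whereas the removed
+// dispatch_ns returned FINUFFT_ERR_METHOD_NOTVALID; callers should validate target_ns.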
 template<typename Func, typename T, typename... Args>
-int launch_dispatch_ns(Func &&func, int target_ns, Args &&...args) {
-  return dispatch_ns<Func, T, MIN_NSPREAD>(std::forward<Func>(func), target_ns,
-                                           std::forward<Args>(args)...);
+auto launch_dispatch_ns(Func &&func, int target_ns, Args &&...args) {
+  using NsSeq = make_range<MIN_NSPREAD, MAX_NSPREAD>;
+  auto params = std::make_tuple(DispatchParam<NsSeq>{target_ns});
+  return dispatch(std::forward<Func>(func), params, std::forward<Args>(args)...);
 }
 
 /**
Original file line number	Diff line number	Diff line change
`@@ -262,6 +262,7 @@ if(FINUFFT_USE_CPU)`
`262`	`262`	`src/finufft_core.cpp`
`263`	`263`	`src/c_interface.cpp`
`264`	`264`	`src/finufft_utils.cpp`
	`265`	`+ src/utils.cpp`
`265`	`266`	`)`
`266`	`267`
`267`	`268`	`if(FINUFFT_BUILD_FORTRAN)`