Speed up CUDA-enabled TSD builds by splitting computeScalarRange into separate translation units (TUs)

jeffamstutz · jeffamstutz · commit d073178b0a82 · 2025-03-04T21:13:09.000-06:00
diff --git a/tsd/src/tsd/CMakeLists.txt b/tsd/src/tsd/CMakeLists.txt
@@ -6,6 +6,12 @@ project_add_library(STATIC)
 project_sources(
 PRIVATE
   algorithms/computeScalarRange.cpp
+  algorithms/detail/computeScalarRange_fixed8.cpp
+  algorithms/detail/computeScalarRange_fixed16.cpp
+  algorithms/detail/computeScalarRange_ufixed8.cpp
+  algorithms/detail/computeScalarRange_ufixed16.cpp
+  algorithms/detail/computeScalarRange_float32.cpp
+  algorithms/detail/computeScalarRange_float64.cpp
   authoring/importers/detail/HDRImage.cpp
   authoring/importers/detail/importer_common.cpp
   authoring/importers/import_ASSIMP.cpp
@@ -80,7 +86,12 @@ if (TSD_USE_CUDA)
   project_compile_definitions(PUBLIC -DTSD_USE_CUDA=1)
   project_link_libraries(PUBLIC CUDA::cudart)
   set_source_files_properties(
-    algorithms/computeScalarRange.cpp
+    algorithms/detail/computeScalarRange_fixed8.cpp
+    algorithms/detail/computeScalarRange_fixed16.cpp
+    algorithms/detail/computeScalarRange_ufixed8.cpp
+    algorithms/detail/computeScalarRange_ufixed16.cpp
+    algorithms/detail/computeScalarRange_float32.cpp
+    algorithms/detail/computeScalarRange_float64.cpp
     objects/SpatialField.cpp
     PROPERTIES
       COMPILE_FLAGS "--extended-lambda --expt-relaxed-constexpr"
diff --git a/tsd/src/tsd/algorithms/computeScalarRange.cpp b/tsd/src/tsd/algorithms/computeScalarRange.cpp
@@ -1,86 +1,13 @@
 // Copyright 2024-2025 NVIDIA Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#ifndef TSD_USE_CUDA
-#define TSD_USE_CUDA 1
-#endif
-
 #include "tsd/algorithms/computeScalarRange.hpp"
 #include "tsd/core/Context.hpp"
-// std
-#include <algorithm>
-#include <limits>
-#if TSD_USE_CUDA
-// thrust
-#include <cuda_runtime.h>
-#include <thrust/device_ptr.h>
-#include <thrust/extrema.h>
-#endif
-
-namespace tsd::algorithm {
-
-namespace detail {
-
-// NOTE(jda): This is a reduced version of anari::anariTypeInvoke() to lower
-//            Thrust/CUDA compile times
-template <typename R, template <int> class F, typename... Args>
-inline R scalarTypeInvoke(ANARIDataType type, Args &&...args)
-{
-  // clang-format off
-  switch (type) {
-  case ANARI_UFIXED8: return F<ANARI_UFIXED8>()(std::forward<Args>(args)...);
-  case ANARI_UFIXED16: return F<ANARI_UFIXED16>()(std::forward<Args>(args)...);
-  case ANARI_FIXED8: return F<ANARI_FIXED8>()(std::forward<Args>(args)...);
-  case ANARI_FIXED16: return F<ANARI_FIXED16>()(std::forward<Args>(args)...);
-  case ANARI_FLOAT32: return F<ANARI_FLOAT32>()(std::forward<Args>(args)...);
-  case ANARI_FLOAT64: return F<ANARI_FLOAT64>()(std::forward<Args>(args)...);
-  default:
-    return F<ANARI_UNKNOWN>()(std::forward<Args>(args)...);
-  }
-  // clang-format off
-}
-
-template <int ANARI_ENUM_T>
-struct ComputeScalarRange
-{
-  using properties_t = anari::ANARITypeProperties<ANARI_ENUM_T>;
-  using base_t = typename properties_t::base_type;
+#include "tsd/core/Logging.hpp"
 
-  tsd::float2 operator()(const Array &a)
-  {
-    tsd::float4 min_out{0.f, 0.f, 0.f, 0.f};
-    tsd::float4 max_out{0.f, 0.f, 0.f, 0.f};
+#include "tsd/algorithms/detail/computeScalarRangeImpl.hpp"
 
-    const auto *begin = a.dataAs<base_t>();
-    const auto *end = begin + a.size();
-#if TSD_USE_CUDA
-    if (a.kind() == Array::MemoryKind::CUDA) {
-      const auto minmax = thrust::minmax_element(
-          thrust::device_pointer_cast(begin), thrust::device_pointer_cast(end));
-      const base_t min_v = *minmax.first;
-      const base_t max_v = *minmax.second;
-      properties_t::toFloat4(&min_out.x, &min_v);
-      properties_t::toFloat4(&max_out.x, &max_v);
-    } else {
-#endif
-      const auto minmax = std::minmax_element(begin, end);
-      const auto min_v = *minmax.first;
-      const auto max_v = *minmax.second;
-      properties_t::toFloat4(&min_out.x, &min_v);
-      properties_t::toFloat4(&max_out.x, &max_v);
-#if TSD_USE_CUDA
-    }
-#endif
-
-    return {min_out.x, max_out.x};
-  }
-};
-
-} // namespace detail
-
-///////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
+namespace tsd::algorithm {
 
 tsd::float2 computeScalarRange(const Array &a)
 {
@@ -103,8 +30,32 @@ tsd::float2 computeScalarRange(const Array &a)
       retval.y = std::max(retval.y, subRange.y);
     });
   } else if (elementsAreScalars) {
-    retval = detail::scalarTypeInvoke<tsd::float2, detail::ComputeScalarRange>(
-        type, a);
+    switch (type) {
+    case ANARI_UFIXED8:
+      retval = detail::computeScalarRange_ufixed8(a);
+      break;
+    case ANARI_UFIXED16:
+      retval = detail::computeScalarRange_ufixed16(a);
+      break;
+    case ANARI_FIXED8:
+      retval = detail::computeScalarRange_fixed8(a);
+      break;
+    case ANARI_FIXED16:
+      retval = detail::computeScalarRange_fixed16(a);
+      break;
+    case ANARI_FLOAT32:
+      retval = detail::computeScalarRange_float32(a);
+      break;
+    case ANARI_FLOAT64:
+      retval = detail::computeScalarRange_float64(a);
+      break;
+    default:
+      logWarning(
+          "computeScalarRange() called on an "
+          "array with incompatible element type '%s'",
+          anari::toString(type));
+      break;
+    }
   }
 
   return retval;
diff --git a/tsd/src/tsd/algorithms/detail/computeScalarRangeImpl.hpp b/tsd/src/tsd/algorithms/detail/computeScalarRangeImpl.hpp
@@ -0,0 +1,82 @@
+// Copyright 2025 NVIDIA Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// NOTE(review): CUDA support defaults to enabled when the build system does
+// not define TSD_USE_CUDA -- confirm non-CUDA builds define it to 0.
+#ifndef TSD_USE_CUDA
+#define TSD_USE_CUDA 1
+#endif
+
+#include "tsd/core/TSDMath.hpp"
+#include "tsd/objects/Array.hpp"
+// std
+#include <algorithm>
+#include <limits>
+#if TSD_USE_CUDA
+// thrust
+#include <cuda_runtime.h>
+#include <thrust/device_ptr.h>
+#include <thrust/extrema.h>
+#endif
+
+namespace tsd::algorithm::detail {
+
+// Compute the {min, max} of the elements of 'a', interpreted as scalars of the
+// ANARI type ANARI_ENUM_T. The extrema are converted with
+// ANARITypeProperties<ANARI_ENUM_T>::toFloat4() and the first component of
+// each result is returned.
+//
+// When TSD_USE_CUDA is enabled, arrays of kind MemoryKind::CUDA are reduced
+// with thrust::minmax_element(); every other array uses
+// std::minmax_element(). An empty array yields {0, 0}.
+template <int ANARI_ENUM_T>
+inline tsd::float2 computeScalarRangeImpl(const Array &a)
+{
+  using properties_t = anari::ANARITypeProperties<ANARI_ENUM_T>;
+  using base_t = typename properties_t::base_type;
+  tsd::float4 min_out{0.f, 0.f, 0.f, 0.f};
+  tsd::float4 max_out{0.f, 0.f, 0.f, 0.f};
+
+  const auto *begin = a.dataAs<base_t>();
+  const auto *end = begin + a.size();
+
+  // minmax_element() returns its end iterator for an empty range, which must
+  // not be dereferenced, so return the zero-initialized range instead.
+  if (begin == end)
+    return {min_out.x, max_out.x};
+
+#if TSD_USE_CUDA
+  if (a.kind() == Array::MemoryKind::CUDA) {
+    const auto minmax = thrust::minmax_element(
+        thrust::device_pointer_cast(begin), thrust::device_pointer_cast(end));
+    const base_t min_v = *minmax.first;
+    const base_t max_v = *minmax.second;
+    properties_t::toFloat4(&min_out.x, &min_v);
+    properties_t::toFloat4(&max_out.x, &max_v);
+  } else {
+#endif
+    const auto minmax = std::minmax_element(begin, end);
+    const auto min_v = *minmax.first;
+    const auto max_v = *minmax.second;
+    properties_t::toFloat4(&min_out.x, &min_v);
+    properties_t::toFloat4(&max_out.x, &max_v);
+#if TSD_USE_CUDA
+  }
+#endif
+
+  return {min_out.x, max_out.x};
+}
+
+// NOTE(jda) - Expand the template in separate translation units (one per
+//             element type, see computeScalarRange_*.cpp) due to long Thrust
+//             compile times.
+tsd::float2 computeScalarRange_ufixed8(const Array &a);
+tsd::float2 computeScalarRange_ufixed16(const Array &a);
+tsd::float2 computeScalarRange_fixed8(const Array &a);
+tsd::float2 computeScalarRange_fixed16(const Array &a);
+tsd::float2 computeScalarRange_float32(const Array &a);
+tsd::float2 computeScalarRange_float64(const Array &a);
+
+} // namespace tsd::algorithm::detail
diff --git a/tsd/src/tsd/algorithms/detail/computeScalarRange_fixed16.cpp b/tsd/src/tsd/algorithms/detail/computeScalarRange_fixed16.cpp
@@ -0,0 +1,16 @@
+// Copyright 2025 NVIDIA Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "computeScalarRangeImpl.hpp"
+
+namespace tsd::algorithm::detail {
+
+// Instantiates computeScalarRangeImpl<>() for ANARI_FIXED16 elements in this
+// translation unit and forwards 'a' to it.
+tsd::float2 computeScalarRange_fixed16(const Array &a)
+{
+  const tsd::float2 range = computeScalarRangeImpl<ANARI_FIXED16>(a);
+  return range;
+}
+
+} // namespace tsd::algorithm::detail
diff --git a/tsd/src/tsd/algorithms/detail/computeScalarRange_fixed8.cpp b/tsd/src/tsd/algorithms/detail/computeScalarRange_fixed8.cpp
@@ -0,0 +1,16 @@
+// Copyright 2025 NVIDIA Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "computeScalarRangeImpl.hpp"
+
+namespace tsd::algorithm::detail {
+
+// Instantiates computeScalarRangeImpl<>() for ANARI_FIXED8 elements in this
+// translation unit and forwards 'a' to it.
+tsd::float2 computeScalarRange_fixed8(const Array &a)
+{
+  const tsd::float2 range = computeScalarRangeImpl<ANARI_FIXED8>(a);
+  return range;
+}
+
+} // namespace tsd::algorithm::detail
diff --git a/tsd/src/tsd/algorithms/detail/computeScalarRange_float32.cpp b/tsd/src/tsd/algorithms/detail/computeScalarRange_float32.cpp
@@ -0,0 +1,16 @@
+// Copyright 2025 NVIDIA Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "computeScalarRangeImpl.hpp"
+
+namespace tsd::algorithm::detail {
+
+// Instantiates computeScalarRangeImpl<>() for ANARI_FLOAT32 elements in this
+// translation unit and forwards 'a' to it.
+tsd::float2 computeScalarRange_float32(const Array &a)
+{
+  const tsd::float2 range = computeScalarRangeImpl<ANARI_FLOAT32>(a);
+  return range;
+}
+
+} // namespace tsd::algorithm::detail
diff --git a/tsd/src/tsd/algorithms/detail/computeScalarRange_float64.cpp b/tsd/src/tsd/algorithms/detail/computeScalarRange_float64.cpp
@@ -0,0 +1,16 @@
+// Copyright 2025 NVIDIA Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "computeScalarRangeImpl.hpp"
+
+namespace tsd::algorithm::detail {
+
+// Instantiates computeScalarRangeImpl<>() for ANARI_FLOAT64 elements in this
+// translation unit and forwards 'a' to it.
+tsd::float2 computeScalarRange_float64(const Array &a)
+{
+  const tsd::float2 range = computeScalarRangeImpl<ANARI_FLOAT64>(a);
+  return range;
+}
+
+} // namespace tsd::algorithm::detail
diff --git a/tsd/src/tsd/algorithms/detail/computeScalarRange_ufixed16.cpp b/tsd/src/tsd/algorithms/detail/computeScalarRange_ufixed16.cpp
@@ -0,0 +1,16 @@
+// Copyright 2025 NVIDIA Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "computeScalarRangeImpl.hpp"
+
+namespace tsd::algorithm::detail {
+
+// Instantiates computeScalarRangeImpl<>() for ANARI_UFIXED16 elements in this
+// translation unit and forwards 'a' to it.
+tsd::float2 computeScalarRange_ufixed16(const Array &a)
+{
+  const tsd::float2 range = computeScalarRangeImpl<ANARI_UFIXED16>(a);
+  return range;
+}
+
+} // namespace tsd::algorithm::detail
diff --git a/tsd/src/tsd/algorithms/detail/computeScalarRange_ufixed8.cpp b/tsd/src/tsd/algorithms/detail/computeScalarRange_ufixed8.cpp
@@ -0,0 +1,16 @@
+// Copyright 2025 NVIDIA Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "computeScalarRangeImpl.hpp"
+
+namespace tsd::algorithm::detail {
+
+// Instantiates computeScalarRangeImpl<>() for ANARI_UFIXED8 elements in this
+// translation unit and forwards 'a' to it.
+tsd::float2 computeScalarRange_ufixed8(const Array &a)
+{
+  const tsd::float2 range = computeScalarRangeImpl<ANARI_UFIXED8>(a);
+  return range;
+}
+
+} // namespace tsd::algorithm::detail