From 6c21df3da87ca4d87683c69c28a9b31102b1c921 Mon Sep 17 00:00:00 2001
From: netwarm007 <chenwenli@chenwenli.com>
Date: Wed, 4 Jan 2023 15:20:39 +0800
Subject: [PATCH] Fix curand state type mismatch: use curandStateMRG32k3a and inline RNG helpers

---
 Asset/Shaders/CUDA/OptixTest.shader.cu |  2 +-
 Framework/GeomMath/random.hpp          | 20 ++++++++++----------
 Test/CudaTest.cu                       |  8 ++++----
 Test/OptixTest.cu                      |  6 +++---
 Test/OptixTest.hpp                     |  2 +-
 5 files changed, 19 insertions(+), 19 deletions(-)
diff --git a/Asset/Shaders/CUDA/OptixTest.shader.cu b/Asset/Shaders/CUDA/OptixTest.shader.cu
index 15371330..fd9de4da 100644
--- a/Asset/Shaders/CUDA/OptixTest.shader.cu
+++ b/Asset/Shaders/CUDA/OptixTest.shader.cu
@@ -75,7 +75,7 @@ __global__ void __raygen__rg() {
     unsigned int j = launch_index.y;
     unsigned int pixel_index = j * params.image->Width + i;
 
-    curandState* local_rand_state = &params.rand_state[pixel_index];
+    curandStateMRG32k3a* local_rand_state = &params.rand_state[pixel_index];
 
     int num_of_samples = rtData->num_of_samples;
     vec3 col = {0.f, 0.f, 0.f};
diff --git a/Framework/GeomMath/random.hpp b/Framework/GeomMath/random.hpp
index b72082d2..b7ae1ee3 100644
--- a/Framework/GeomMath/random.hpp
+++ b/Framework/GeomMath/random.hpp
@@ -12,23 +12,23 @@ namespace My {
 #ifdef __CUDACC__
 
 template <class T>
-__device__ T random_f(curandStateMRG32k3a_t *local_rand_state) {
+__device__ __inline__ T random_f(curandStateMRG32k3a_t *local_rand_state) {
     return curand_uniform(local_rand_state);
 }
 
 template <class T>
-__device__ T random_f(T min, T max, curandStateMRG32k3a_t *local_rand_state) {
+__device__ __inline__ T random_f(T min, T max, curandStateMRG32k3a_t *local_rand_state) {
     T scale = max - min;
     return min + scale * curand_uniform(local_rand_state);
 }
 
 template <class T> requires std::integral<T>
-__device__ T random_int(T min, T max, curandStateMRG32k3a_t *local_rand_state) {
+__device__ __inline__ T random_int(T min, T max, curandStateMRG32k3a_t *local_rand_state) {
     return static_cast<T>(random_f<T>(static_cast<T>(min), static_cast<T>(max), local_rand_state));
 }
 
 template <class T, Dimension auto N>
-__device__ Vector<T, N> random_v(curandStateMRG32k3a_t *local_rand_state) {
+__device__ __inline__ Vector<T, N> random_v(curandStateMRG32k3a_t *local_rand_state) {
     auto vec = Vector<T, N>();
     for (int i = 0; i < N; i++) {
         vec[i] = random_f<T>(local_rand_state);
@@ -38,7 +38,7 @@ __device__ Vector<T, N> random_v(curandStateMRG32k3a_t *local_rand_state) {
 }
 
 template <class T, Dimension auto N>
-__device__ Vector<T, N> random_v(T min, T max, curandStateMRG32k3a_t *local_rand_state) {
+__device__ __inline__ Vector<T, N> random_v(T min, T max, curandStateMRG32k3a_t *local_rand_state) {
     auto vec = Vector<T, N>();
     for (int i = 0; i < N; i++) {
         vec[i] = random_f<T>(min, max, local_rand_state);
@@ -48,7 +48,7 @@ __device__ Vector<T, N> random_v(T min, T max, curandStateMRG32k3a_t *local_rand
 }
 
 template <class T, Dimension auto N>
-__device__ Vector<T, N> random_in_unit_sphere(curandStateMRG32k3a_t *local_rand_state) {
+__device__ __inline__ Vector<T, N> random_in_unit_sphere(curandStateMRG32k3a_t *local_rand_state) {
     while (true) {
         auto p = random_v<T, N>(T(-1), T(1), local_rand_state);
         if (LengthSquared(p) >= 1) continue;
@@ -57,14 +57,14 @@ __device__ Vector<T, N> random_in_unit_sphere(curandStateMRG32k3a_t *local_rand_
 }
 
 template <class T, Dimension auto N>
-__device__ Vector<T, N> random_unit_vector(curandStateMRG32k3a_t *local_rand_state) {
+__device__ __inline__ Vector<T, N> random_unit_vector(curandStateMRG32k3a_t *local_rand_state) {
     auto p = random_in_unit_sphere<T, N>(local_rand_state);
     Normalize(p);
     return p;
 }
 
 template <class T, Dimension auto N>
-__device__ Vector<T, N> random_in_hemisphere(const Vector<T, N>& normal, curandStateMRG32k3a_t *local_rand_state) {
+__device__ __inline__ Vector<T, N> random_in_hemisphere(const Vector<T, N>& normal, curandStateMRG32k3a_t *local_rand_state) {
     auto p = random_in_unit_sphere<T, N>(local_rand_state);
     T result;
     DotProduct<T, N>(result, p, normal);
@@ -76,7 +76,7 @@ __device__ Vector<T, N> random_in_hemisphere(const Vector<T, N>& normal, curandS
 }
 
 template <class T>
-__device__ Vector3<T> random_in_hemisphere_cosine_weighted(const Vector3<T>& normal, curandStateMRG32k3a_t *local_rand_state) {
+__device__ __inline__ Vector3<T> random_in_hemisphere_cosine_weighted(const Vector3<T>& normal, curandStateMRG32k3a_t *local_rand_state) {
     auto uv = random_v<T, 2>(local_rand_state);
     T phi = 2.0 * PI * uv[0];
 
@@ -90,7 +90,7 @@ __device__ Vector3<T> random_in_hemisphere_cosine_weighted(const Vector3<T>& nor
 }
 
 template <class T>
-__device__ Vector3<T> random_in_unit_disk(curandStateMRG32k3a_t *local_rand_state) {
+__device__ __inline__ Vector3<T> random_in_unit_disk(curandStateMRG32k3a_t *local_rand_state) {
     while (true) {
         auto p = Vector3<T>({random_f(T(-1.0), T(1.0), local_rand_state), random_f(T(-1.0), T(1.0), local_rand_state), 0});
         if (LengthSquared(p) >= (T)1.0) continue;
diff --git a/Test/CudaTest.cu b/Test/CudaTest.cu
index 4514185c..8bb5db2c 100644
--- a/Test/CudaTest.cu
+++ b/Test/CudaTest.cu
@@ -22,13 +22,13 @@ void check_cuda(cudaError_t result, char const *const func,
     }
 }
 
-__global__ void rand_init(curandState *rand_state) {
+__global__ void rand_init(curandStateMRG32k3a *rand_state) {
     if (threadIdx.x == 0 && blockIdx.x == 0) {
         curand_init(2023, 0, 0, rand_state);
     }
 }
 
-__global__ void test(curandState *local_rand_state) {
+__global__ void test(curandStateMRG32k3a *local_rand_state) {
     const int scene_obj_num = 1;
     My::Hitable<float>** pList = new My::Hitable<float>*[scene_obj_num];
     for (int i = 0; i < scene_obj_num; i++) {
@@ -42,9 +42,9 @@ __global__ void test(curandState *local_rand_state) {
 }
 
 int main() {
-    curandState *d_rand_state_1;
+    curandStateMRG32k3a *d_rand_state_1;
 
-    checkCudaErrors(cudaMalloc((void **)&d_rand_state_1, sizeof(curandState)));
+    checkCudaErrors(cudaMalloc((void **)&d_rand_state_1, sizeof(curandStateMRG32k3a)));
 
     rand_init<<<1, 1>>>(d_rand_state_1);
 
diff --git a/Test/OptixTest.cu b/Test/OptixTest.cu
index 07106015..57543321 100644
--- a/Test/OptixTest.cu
+++ b/Test/OptixTest.cu
@@ -73,7 +73,7 @@ static void context_log_cb( unsigned int level, const char* tag, const char* mes
     << message << "\n";
 }
 
-__global__ void rand_init(curandState *rand_state, const unsigned int max_x, const unsigned int max_y) {
+__global__ void rand_init(curandStateMRG32k3a *rand_state, const unsigned int max_x, const unsigned int max_y) {
     // Each thread in a block gets unique seed
     int i = threadIdx.x + blockIdx.x * blockDim.x;
     int j = threadIdx.y + blockIdx.y * blockDim.y;
@@ -360,7 +360,7 @@ int main() {
     My::Image img;
     My::Image* d_img;
     My::RayTracingCamera<float>* d_camera;
-    curandState* d_rand_state;
+    curandStateMRG32k3a* d_rand_state;
     {
         const float aspect_ratio = 16.0 / 9.0;
         const int image_width = 1920;
@@ -401,7 +401,7 @@ int main() {
         dim3 blocks((image_width + tile_width - 1) / tile_width, (image_height + tile_height - 1) / tile_height);
         dim3 threads(tile_width, tile_height);
 
-        checkCudaErrors(cudaMalloc((void **)&d_rand_state, num_pixels * sizeof(curandState)));
+        checkCudaErrors(cudaMalloc((void **)&d_rand_state, num_pixels * sizeof(curandStateMRG32k3a)));
 
         rand_init<<<blocks, threads>>>(d_rand_state, image_width, image_height);
         checkCudaErrors(cudaGetLastError());
diff --git a/Test/OptixTest.hpp b/Test/OptixTest.hpp
index 1260a2d0..e5a0f206 100644
--- a/Test/OptixTest.hpp
+++ b/Test/OptixTest.hpp
@@ -7,7 +7,7 @@
 struct Params{
     My::Image*                      image;
     My::RayTracingCamera<float>*    cam;
-    curandState*                    rand_state;
+    curandStateMRG32k3a*                    rand_state;
     OptixTraversableHandle          handle;
 };