diff --git a/CMakeLists.txt b/CMakeLists.txt index 608957ca..7747d13c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,7 +64,14 @@ set(LIBRARIES set(OPENCL_SOURCES src/kernels/cl/aplusb.cl src/kernels/cl/ray_tracing_render_brute_force.cl - src/kernels/cl/ray_tracing_render_using_lbvh.cl) + src/kernels/cl/ray_tracing_render_using_lbvh.cl + src/kernels/cl/merge_sort.cl + src/kernels/cl/morton_code_generation.cl + src/kernels/cl/lbvh_construction.cl + src/kernels/cl/lbvh_aabb_generation.cl + src/kernels/cl/zeros.cl + src/kernels/cl/bigbox_calc.cl + ) set(OPENCL_INCLUDES src/kernels/defines.h src/kernels/cl/helpers/rassert.cl src/kernels/cl/camera_helpers.cl src/kernels/cl/geometry_helpers.cl src/kernels/cl/random_helpers.cl src/kernels/shared_structs/aabb_gpu_shared.h src/kernels/shared_structs/bvh_node_gpu_shared.h src/kernels/shared_structs/camera_gpu_shared.h src/kernels/shared_structs/morton_code_gpu_shared.h src/kernels/shared_structs/struct_helpers.h) set(OPENCL_DEFINES) diff --git a/src/debug/debug_bvh.h b/src/debug/debug_bvh.h index 9d2fd067..83e08863 100644 --- a/src/debug/debug_bvh.h +++ b/src/debug/debug_bvh.h @@ -8,7 +8,7 @@ #include #include -#include "scene_reader.h" +// #include "scene_reader.h" #include "../kernels/shared_structs/bvh_node_gpu_shared.h" namespace debug { diff --git a/src/kernels/cl/bigbox_calc.cl b/src/kernels/cl/bigbox_calc.cl new file mode 100644 index 00000000..f3caeb2c --- /dev/null +++ b/src/kernels/cl/bigbox_calc.cl @@ -0,0 +1,56 @@ +#ifdef __CLION_IDE__ +#include +#endif + +#include "helpers/rassert.cl" +#include "../defines.h" + +#include "../shared_structs/camera_gpu_shared.h" +#include "../shared_structs/bvh_node_gpu_shared.h" +#include "../shared_structs/aabb_gpu_shared.h" +#include "../shared_structs/morton_code_gpu_shared.h" + +#include "camera_helpers.cl" +#include "geometry_helpers.cl" +#include "random_helpers.cl" + +// BVH traversal: closest hit along ray +__kernel void bigbox_calc( + uint nfaces, + __global const float* vertices, + __global const uint* faces, + __global AABBGPU* bigbox) +{ + uint i = get_global_id(0); + + if (i == 0) { + bigbox[0].min_x = +INFINITY; + bigbox[0].min_y = +INFINITY; + bigbox[0].min_z = +INFINITY; + bigbox[0].max_x = -INFINITY; + bigbox[0].max_y = -INFINITY; + bigbox[0].max_z = -INFINITY; + + for (int j = 0; j < nfaces; j++) { + uint3 f = loadFace(faces, j); + float3 v0 = loadVertex(vertices, f.x); + float3 v1 = loadVertex(vertices, f.y); + float3 v2 = loadVertex(vertices, f.z); + + bigbox[0].min_x = min(bigbox[0].min_x, min(v0.x, min(v1.x, v2.x))); + bigbox[0].min_y = min(bigbox[0].min_y, min(v0.y, min(v1.y, v2.y))); + bigbox[0].min_z = min(bigbox[0].min_z, min(v0.z, min(v1.z, v2.z))); + bigbox[0].max_x = max(bigbox[0].max_x, min(v0.x, min(v1.x, v2.x))); + bigbox[0].max_y = max(bigbox[0].max_y, min(v0.y, min(v1.y, v2.y))); + bigbox[0].max_z = max(bigbox[0].max_z, min(v0.z, min(v1.z, v2.z))); + } + + + printf("Biggest box : (%f %f %f) -- (%f %f %f)\n", bigbox[0].min_x, + bigbox[0].min_y, + bigbox[0].min_z, + bigbox[0].max_x, + bigbox[0].max_y, + bigbox[0].max_z); + } +} diff --git a/src/kernels/cl/helpers/rassert.cl b/src/kernels/cl/helpers/rassert.cl index 93d050f9..4e7bb992 100644 --- a/src/kernels/cl/helpers/rassert.cl +++ b/src/kernels/cl/helpers/rassert.cl @@ -1,6 +1,6 @@ #include "../../defines.h" -#if RASSERT_ENABLED +#if 1 #define rassert(condition, error_code) \ do { \ if (!(condition)) { \ diff --git a/src/kernels/cl/lbvh_aabb_generation.cl b/src/kernels/cl/lbvh_aabb_generation.cl new file mode 100644 index 00000000..678b0d21 --- /dev/null +++ b/src/kernels/cl/lbvh_aabb_generation.cl @@ -0,0 +1,84 @@ +#ifdef __CLION_IDE__ +#include +#endif + +#include "helpers/rassert.cl" +#include "../defines.h" + +#include "../shared_structs/camera_gpu_shared.h" +#include "../shared_structs/bvh_node_gpu_shared.h" +#include "../shared_structs/aabb_gpu_shared.h" + +#include "camera_helpers.cl" +#include "geometry_helpers.cl" +#include "random_helpers.cl" + +#define INVALID 0xffffffff + +void process_tri( + __global const uint* faces, + __global const float* vertices, + uint tri_id, + __global BVHNodeGPU* node +) { + uint3 f = loadFace(faces, tri_id); + float3 v0 = loadVertex(vertices, f.x); + float3 v1 = loadVertex(vertices, f.y); + float3 v2 = loadVertex(vertices, f.z); + + node->aabb.min_x = min(v0.x, min(v1.x, v2.x)); + node->aabb.min_y = min(v0.y, min(v1.y, v2.y)); + node->aabb.min_z = min(v0.z, min(v1.z, v2.z)); + + node->aabb.max_x = max(v0.x, max(v1.x, v2.x)); + node->aabb.max_y = max(v0.y, max(v1.y, v2.y)); + node->aabb.max_z = max(v0.z, max(v1.z, v2.z)); + + node->leftChildIndex = INVALID; + node->rightChildIndex = INVALID; +} + +__kernel void lbvh_aabb_generation( + __global const uint* morton_codes, + __global uint* face_indexes, + __global const uint* faces, + __global const float* vertices, + __global BVHNodeGPU* nodes, + __global int* parent, + __global int* counter, + uint nfaces, + int calc_leafs + ) +{ + int i = get_global_id(0); + const int leafStart = (int)nfaces - 1; + + if (calc_leafs) { + if (i >= leafStart && i < nfaces + leafStart) { + int face_id = face_indexes[i - leafStart]; + process_tri(faces, vertices, face_id - leafStart, &nodes[i]); + // printf("%d <-- %d\n", i, parent[i]); + atomic_add(&counter[parent[i]], 1); + face_indexes[i - leafStart] -= leafStart; + } + } + else { + if (i < leafStart && counter[i] >= 2) { + BVHNodeGPU l = nodes[nodes[i].leftChildIndex]; + BVHNodeGPU r = nodes[nodes[i].rightChildIndex]; + + nodes[i].aabb.min_x = min(l.aabb.min_x, r.aabb.min_x); + nodes[i].aabb.min_y = min(l.aabb.min_y, r.aabb.min_y); + nodes[i].aabb.min_z = min(l.aabb.min_z, r.aabb.min_z); + + nodes[i].aabb.max_x = max(l.aabb.max_x, r.aabb.max_x); + nodes[i].aabb.max_y = max(l.aabb.max_y, r.aabb.max_y); + nodes[i].aabb.max_z = max(l.aabb.max_z, r.aabb.max_z); + + if (parent[i] != -1) + atomic_add(&counter[parent[i]], 1); + } + } +} + + diff --git a/src/kernels/cl/lbvh_construction.cl b/src/kernels/cl/lbvh_construction.cl new file mode 100644 index 00000000..8a14bdd6 --- /dev/null +++ b/src/kernels/cl/lbvh_construction.cl @@ -0,0 +1,128 @@ +#ifdef __CLION_IDE__ +#include +#endif + +#include "helpers/rassert.cl" +#include "../defines.h" + +#include "../shared_structs/camera_gpu_shared.h" +#include "../shared_structs/bvh_node_gpu_shared.h" +#include "../shared_structs/aabb_gpu_shared.h" + +#include "camera_helpers.cl" +#include "geometry_helpers.cl" +#include "random_helpers.cl" + +inline int common_pref_len( + __global const uint* morton_codes, + int l, + int r, + int nfaces +) +{ + if (l > r) { + int tmp = r; + r = l; + l = tmp; + } + + rassert(r == l + 1, 4358904); + + if (l < 0) + return -1; + + if (r >= nfaces) + return -1; + + int bit = 31; + while (bit >= 0 && (((morton_codes[l] + l) >> bit) & 1) == (((morton_codes[r] + r) >> bit) & 1)) + bit--; + + return 32 - bit - 1; +} + +__kernel void lbvh_construction( + __global const uint* morton_codes, + __global uint* face_indexes, + __global const uint* faces, + __global const float* vertices, + __global BVHNodeGPU* nodes, + __global int* parent, + uint nfaces + ) +{ + int i = get_global_id(0); + const int leafStart = (int)nfaces - 1; + + int d, pref_len, j, min_pref, min_pref_ind; + if (i < leafStart) { + d = 0, pref_len = -1; + + if (i == 0) + d = +1; + else + d = (common_pref_len(morton_codes, i, i + 1, nfaces) > common_pref_len(morton_codes, i - 1, i, nfaces) ? +1 : -1); + + pref_len = common_pref_len(morton_codes, i, i - d, nfaces); + + j = i; + min_pref = 100, min_pref_ind = -1; + while (j + d < nfaces && + j + d >= 0 && + pref_len < common_pref_len(morton_codes, j, j + d, nfaces)) { + if (min_pref > common_pref_len(morton_codes, j, j + d, nfaces)) { + min_pref = common_pref_len(morton_codes, j, j + d, nfaces); + min_pref_ind = j; + } + j += d; + } + // min_pref_ind, min_pref_ind + d - max on segment + + if (i == 0) { + parent[0] = -1; + } + if (j == i + d) { + nodes[i].leftChildIndex = face_indexes[min(i, j)]; + nodes[i].rightChildIndex = face_indexes[max(i, j)]; + } + else if (i == min_pref_ind) { + if (d == +1) { + nodes[i].leftChildIndex = face_indexes[i]; + nodes[i].rightChildIndex = i + d; + } + if (d == -1) { + nodes[i].leftChildIndex = i + d; + nodes[i].rightChildIndex = face_indexes[i]; + } + } + else if (min_pref_ind + d == j) { + if (d == +1) { + nodes[i].leftChildIndex = j - d; + nodes[i].rightChildIndex = face_indexes[j]; + } + if (d == -1) { + nodes[i].leftChildIndex = face_indexes[j]; + nodes[i].rightChildIndex = j - d; + } + } + else { + nodes[i].leftChildIndex = min(min_pref_ind, min_pref_ind + d); + nodes[i].rightChildIndex = max(min_pref_ind, min_pref_ind + d); + } + } + + barrier(CLK_GLOBAL_MEM_FENCE); + + if (i < leafStart) { + parent[nodes[i].leftChildIndex] = i; + parent[nodes[i].rightChildIndex] = i; + + // atomic_add(&counter[nodes[i].leftChildIndex], 1); + // atomic_add(&counter[nodes[i].rightChildIndex], 1); + } + + barrier(CLK_GLOBAL_MEM_FENCE); + + if (i == 0) + printf("tree build done\n"); +} diff --git a/src/kernels/cl/merge_sort.cl b/src/kernels/cl/merge_sort.cl new file mode 100644 index 00000000..36b002c9 --- /dev/null +++ b/src/kernels/cl/merge_sort.cl @@ -0,0 +1,281 @@ +#define MAX_LOCAL_SIZE 256 //set via compile options + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// needed helper methods +inline void swap(uint *a, uint *b) { + uint tmp; + tmp = *b; + *b = *a; + *a = tmp; +} + +// dir == 1 means ascending +inline void sort(uint *a, uint *b, char dir) { + if ((*a > *b) == dir) swap(a, b); +} + +inline void swapLocal(__local uint *a, __local uint *b) { + uint tmp; + tmp = *b; + *b = *a; + *a = tmp; +} + +// dir == 1 means ascending +inline void sortLocal(__local uint *a, __local uint *b, char dir) { + if ((*a > *b) == dir) swapLocal(a, b); +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// basic kernel for mergesort start +__kernel void merge_sort(const __global uint* inArray, __global uint* outArray) +{ + __local uint local_buffer[2][MAX_LOCAL_SIZE * 2]; + const uint lid = get_local_id(0); + const uint index = get_group_id(0) * (MAX_LOCAL_SIZE * 2) + lid; + char pong = 0; + char ping = 1; + + // load into local buffer + local_buffer[0][lid] = inArray[index]; + local_buffer[0][lid + MAX_LOCAL_SIZE] = inArray[index + MAX_LOCAL_SIZE]; + + // merge sort + for (unsigned int stride = 2; stride <= MAX_LOCAL_SIZE * 2; stride <<= 1) { + ping = pong; + pong = 1 - ping; + uint leftBoundary = lid * stride; + uint rightBoundary = leftBoundary + stride; + + uint middle = leftBoundary + (stride >> 1); + uint left = leftBoundary, right = middle; + barrier(CLK_LOCAL_MEM_FENCE); + if (rightBoundary > MAX_LOCAL_SIZE * 2) continue; +#pragma unroll + for (uint i = 0; i < stride; i++) { + uint leftVal = local_buffer[ping][left]; + uint rightVal = local_buffer[ping][right]; + bool selectLeft = left < middle && (right >= rightBoundary || leftVal <= rightVal); + + local_buffer[pong][leftBoundary + i] = (selectLeft) ? leftVal : rightVal; + + left += selectLeft; + right += 1 - selectLeft; + } + } + + //write back + barrier(CLK_LOCAL_MEM_FENCE); + outArray[index] = local_buffer[pong][lid]; + outArray[index + MAX_LOCAL_SIZE] = local_buffer[pong][lid + MAX_LOCAL_SIZE]; +} + +// For smaller strides so we can use local_buffer without getting into memory problems +__kernel void Sort_MergesortGlobalSmall(const __global uint* inArray, __global uint* outArray, const uint stride, const uint size) +{ + __local uint local_buffer[MAX_LOCAL_SIZE * 2]; + + // within one stride merge the different parts + const uint baseIndex = get_global_id(0) * stride; + const uint baseLocalIndex = get_local_id(0) * 2; + + uint middle = baseIndex + (stride >> 1); + uint left = baseIndex; + uint right = middle; + bool selectLeft = false; + + if ((baseIndex + stride) > size) return; + + local_buffer[baseLocalIndex + 1] = inArray[left]; + +#pragma unroll + for (uint i = baseIndex; i < (baseIndex + stride); i++) { + // check which value should be written out + local_buffer[baseLocalIndex + (int)selectLeft] = (selectLeft) ? inArray[left] : inArray[right]; + selectLeft = left < middle && (right == (baseIndex + stride) || local_buffer[baseLocalIndex + 1] <= local_buffer[baseLocalIndex]); + + // write out + outArray[i] = (selectLeft) ? local_buffer[baseLocalIndex + 1] : local_buffer[baseLocalIndex]; //PROBLEMATIC PART! WE RUN OUT OF MEMORY + + //increase counter accordingly + left += selectLeft; + right += 1 - selectLeft; + } +} + +__kernel void Sort_MergesortGlobalBig(const __global uint* inArray, __global uint* outArray, const uint stride, const uint size) +{ + //Problems: Breaks at large arrays. this version was stripped down (so little less performance but supports little bigger arrays) + + // within one stride merge the different parts + const uint baseIndex = get_global_id(0) * stride; + const char dir = 1; + + uint middle = baseIndex + (stride >> 1); + uint left = baseIndex; + uint right = middle; + bool selectLeft; + + if ((baseIndex + stride) > size) return; + +#pragma unroll + for (uint i = baseIndex; i < (baseIndex + stride); i++) { + // check which value should be written out + selectLeft = (left < middle && (right == (baseIndex + stride) || inArray[left] <= inArray[right])) == dir; + + // write out + outArray[i] = (selectLeft) ? inArray[left] : inArray[right]; + + //increase counter accordingly + left += selectLeft; + right += 1 - selectLeft; + } +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// OLD and BASIC global Kernel +__kernel void Sort_SimpleSortingNetwork(const __global uint* inArray, __global uint* outArray, const uint offset, const uint size) +{ + // TO DO: pimp! Problem: Kernel gets called pretty often -> overhead! + const uint index = (get_global_id(0) << 1) + offset; + if (index + 1 >= size) return; + + uint left = inArray[index]; + uint right = inArray[index + 1]; + sort(&left, &right, 1); + outArray[index] = left; + outArray[index + 1] = right; + + // not needed if we use input=output array + //bool additionalWork = (get_global_id(0) == 0) && offset; + //if (additionalWork) { //alternative: if (offset) {..} only + // outArray[0] = inArray[0]; + // outArray[size - 1] = inArray[size - 1]; + //} +} + +/* + * data - the data input and output array + * size - size of the array + * offset - whether or not there will be an offset + */ +__kernel void Sort_SimpleSortingNetworkLocal(__global uint* data, const uint size, const uint offset) +{ + __local uint local_buffer[MAX_LOCAL_SIZE * 2]; + const uint limit = MAX_LOCAL_SIZE * 2; + const uint lid = get_local_id(0); + const uint locIdx = lid << 1; + const uint index = get_group_id(0) * (MAX_LOCAL_SIZE * 2) + (offset * MAX_LOCAL_SIZE) + lid; + if (index + MAX_LOCAL_SIZE >= size) return; + + //load into local buffer + local_buffer[lid] = data[index]; + local_buffer[lid + MAX_LOCAL_SIZE] = data[index + MAX_LOCAL_SIZE]; + + //sort +#pragma unroll + for (int i = 0; i < limit; i++) { + barrier(CLK_LOCAL_MEM_FENCE); + uint index = locIdx + (i & 1); + if (index + 1 >= limit) continue; + sortLocal(&local_buffer[index], &local_buffer[index + 1], 1); + } + + // sync and write back + barrier(CLK_LOCAL_MEM_FENCE); + data[index] = local_buffer[lid]; + data[index + MAX_LOCAL_SIZE] = local_buffer[lid + MAX_LOCAL_SIZE]; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void Sort_BitonicMergesortStart(const __global uint* inArray, __global uint* outArray) +{ + __local uint local_buffer[MAX_LOCAL_SIZE * 2]; + const uint gid = get_global_id(0); + const uint lid = get_local_id(0); + + uint index = get_group_id(0) * (MAX_LOCAL_SIZE * 2) + lid; + //load into local mem + local_buffer[lid] = inArray[index]; + local_buffer[lid + MAX_LOCAL_SIZE] = inArray[index + MAX_LOCAL_SIZE]; + + uint clampedGID = gid & (MAX_LOCAL_SIZE - 1); + + // bitonic merge + for (uint blocksize = 2; blocksize < MAX_LOCAL_SIZE * 2; blocksize <<= 1) { + char dir = (clampedGID & (blocksize / 2)) == 0; // sort every other block in the other direction (faster % calc) +#pragma unroll + for (uint stride = blocksize >> 1; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint idx = 2 * lid - (lid & (stride - 1)); //take every other input BUT starting neighbouring within one block + sortLocal(&local_buffer[idx], &local_buffer[idx + stride], dir); + } + } + + // bitonic merge for biggest group is special (unrolling this so we dont need ifs in the part above) + char dir = (clampedGID & 0); //even or odd? sort accordingly +#pragma unroll + for (uint stride = MAX_LOCAL_SIZE; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint idx = 2 * lid - (lid & (stride - 1)); + sortLocal(&local_buffer[idx], &local_buffer[idx + stride], dir); + } + + // sync and write back + barrier(CLK_LOCAL_MEM_FENCE); + outArray[index] = local_buffer[lid]; + outArray[index + MAX_LOCAL_SIZE] = local_buffer[lid + MAX_LOCAL_SIZE]; +} + +__kernel void Sort_BitonicMergesortLocal(__global uint* data, const uint size, const uint blocksize, uint stride) +{ + // This Kernel is basically the same as Sort_BitonicMergesortStart except of the "unrolled" part and the provided parameters + __local uint local_buffer[2 * MAX_LOCAL_SIZE]; + uint gid = get_global_id(0); + uint groupId = get_group_id(0); + uint lid = get_local_id(0); + uint clampedGID = gid & (size / 2 - 1); + + uint index = groupId * (MAX_LOCAL_SIZE * 2) + lid; + //load into local mem + local_buffer[lid] = data[index]; + local_buffer[lid + MAX_LOCAL_SIZE] = data[index + MAX_LOCAL_SIZE]; + + // bitonic merge + char dir = (clampedGID & (blocksize / 2)) == 0; //same as above, % calc +#pragma unroll + for (; stride > 0; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + uint idx = 2 * lid - (lid & (stride - 1)); + sortLocal(&local_buffer[idx], &local_buffer[idx + stride], dir); + } + + // sync and write back + barrier(CLK_LOCAL_MEM_FENCE); + data[index] = local_buffer[lid]; + data[index + MAX_LOCAL_SIZE] = local_buffer[lid + MAX_LOCAL_SIZE]; +} + +__kernel void Sort_BitonicMergesortGlobal(__global uint* data, const uint size, const uint blocksize, const uint stride) +{ + // TO DO: Kernel implementation + uint gid = get_global_id(0); + uint clampedGID = gid & (size / 2 - 1); + + //calculate index and dir like above + uint index = 2 * clampedGID - (clampedGID & (stride - 1)); + char dir = (clampedGID & (blocksize / 2)) == 0; //same as above, % calc + + //bitonic merge + uint left = data[index]; + uint right = data[index + stride]; + + sort(&left, &right, dir); + + // writeback + data[index] = left; + data[index + stride] = right; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file diff --git a/src/kernels/cl/morton_code_generation.cl b/src/kernels/cl/morton_code_generation.cl new file mode 100644 index 00000000..d8e2df8e --- /dev/null +++ b/src/kernels/cl/morton_code_generation.cl @@ -0,0 +1,87 @@ +#ifdef __CLION_IDE__ +#include +#endif + +#include "helpers/rassert.cl" +#include "../defines.h" + +#include "../shared_structs/camera_gpu_shared.h" +#include "../shared_structs/bvh_node_gpu_shared.h" +#include "../shared_structs/aabb_gpu_shared.h" +#include "../shared_structs/morton_code_gpu_shared.h" + +#include "camera_helpers.cl" +#include "geometry_helpers.cl" +#include "random_helpers.cl" + +// Helper: expand 10 bits into 30 bits by inserting 2 zeros between each bit +unsigned int expandBits(unsigned int v) +{ + // Ensure we have only lowest 10 bits + // rassert(v == (v & 0x3FFu), 76389413321, v); + + // Magic bit expansion steps + v = (v * 0x00010001u) & 0xFF0000FFu; + v = (v * 0x00000101u) & 0x0F00F00Fu; + v = (v * 0x00000011u) & 0xC30C30C3u; + v = (v * 0x00000005u) & 0x49249249u; + + return v; +} + +// Convert 3D point in [0,1]^3 to 30-bit Morton code (10 bits per axis) +// Values outside [0,1] are clamped. +MortonCode morton3D(float x, float y, float z) +{ + // Map and clamp to integer grid [0, 1023] + unsigned int ix = min(max((int) (x * 1024.0f), 0), 1023); + unsigned int iy = min(max((int) (y * 1024.0f), 0), 1023); + unsigned int iz = min(max((int) (z * 1024.0f), 0), 1023); + + unsigned int xx = expandBits(ix); + unsigned int yy = expandBits(iy); + unsigned int zz = expandBits(iz); + + // Interleave: x in bits [2,5,8,...], y in [1,4,7,...], z in [0,3,6,...] + return (xx << 2) | (yy << 1) | zz; +} + + +// BVH traversal: closest hit along ray +__kernel void morton_code_generation( + uint nfaces, + __global const float* vertices, + __global const uint* faces, + __global uint* morton_encoding, + __global AABBGPU* bigbox) +{ + uint i = get_global_id(0); + + + + if (i < nfaces) { + uint3 f = loadFace(faces, i); + float3 v0 = loadVertex(vertices, f.x); + float3 v1 = loadVertex(vertices, f.y); + float3 v2 = loadVertex(vertices, f.z); + + float x = (v0.x + v1.x + v2.x) / 3; + float y = (v0.y + v1.y + v2.y) / 3; + float z = (v0.z + v1.z + v2.z) / 3; + + float dx = bigbox[0].max_x - bigbox[0].min_x; + float dy = bigbox[0].max_y - bigbox[0].min_y; + float dz = bigbox[0].max_z - bigbox[0].min_z; + + MortonCode baricenter = morton3D((x - bigbox[0].min_x) / dx, + (y - bigbox[0].min_y) / dy, + (z - bigbox[0].min_z) / dz); + + rassert((x - bigbox[0].min_x) > 0, 95694078589); + rassert((y - bigbox[0].min_y) > 0, 95694078589); + rassert((z - bigbox[0].min_z) > 0, 95694078589); + + + morton_encoding[i] = baricenter; + } +} diff --git a/src/kernels/cl/ray_tracing_render_brute_force.cl b/src/kernels/cl/ray_tracing_render_brute_force.cl index 113ad7fe..5717971d 100644 --- a/src/kernels/cl/ray_tracing_render_brute_force.cl +++ b/src/kernels/cl/ray_tracing_render_brute_force.cl @@ -71,7 +71,7 @@ __kernel void ray_tracing_render_brute_force( const uint i = get_global_id(0); const uint j = get_global_id(1); - rassert(camera.magic_bits_guard == CAMERA_VIEW_GPU_MAGIC_BITS_GUARD, 646435342); + // rassert(camera.magic_bits_guard == CAMERA_VIEW_GPU_MAGIC_BITS_GUARD, 646435342); if (i >= camera->K.width || j >= camera->K.height) return; diff --git a/src/kernels/cl/ray_tracing_render_using_lbvh.cl b/src/kernels/cl/ray_tracing_render_using_lbvh.cl index 35b5ca49..998a7eae 100644 --- a/src/kernels/cl/ray_tracing_render_using_lbvh.cl +++ b/src/kernels/cl/ray_tracing_render_using_lbvh.cl @@ -31,7 +31,64 @@ static inline bool bvh_closest_hit( const int rootIndex = 0; const int leafStart = (int)nfaces - 1; - // TODO implement BVH travering (with stack, don't use recursion) + int stack[100]; + int stackSize = 0; + stack[stackSize++] = rootIndex; + + float t_anw = FLT_MAX; + int face_anw = -1; + float u_anw = 0; + float v_anw = 0; + + float tNear; + float tFar; + + while (stackSize > 0) { + int nodeIndex = stack[--stackSize]; + rassert(nodeIndex < nfaces + nfaces - 1, 3905813); + BVHNodeGPU node = nodes[nodeIndex]; + + if (!intersect_ray_aabb(orig, dir, node.aabb, tMin, t_anw, &tNear, &tFar)) + continue; + + if (nodeIndex >= leafStart) { + uint leafIdx = (uint)(nodeIndex - leafStart); + rassert(leafIdx >= 0, 92048908); + rassert(leafIdx < nfaces, 942048908); + + uint triIndex = leafTriIndices[leafIdx]; + rassert(triIndex >= 0, 92048908); + rassert(triIndex < nfaces, 942048908); + + uint3 f = loadFace(faces, triIndex); + float3 v0 = loadVertex(vertices, f.x); + float3 v1 = loadVertex(vertices, f.y); + float3 v2 = loadVertex(vertices, f.z); + + float t, u, v; + if (intersect_ray_triangle(orig, dir, v0, v1, v2, tMin, t_anw, false, &t, &u, &v)) + { + if (t < t_anw) { + t_anw = t; + face_anw = (int)triIndex; + u_anw = u; + v_anw = v; + } + } + } else { + rassert(stackSize < 100, 1); + stack[stackSize++] = (int)node.leftChildIndex; + stack[stackSize++] = (int)node.rightChildIndex; + } + } + + if (face_anw != -1) { + *outT = t_anw; + *outFaceId = face_anw; + *outU = u_anw; + *outV = v_anw; + return true; + } return false; } @@ -50,7 +107,44 @@ static inline bool any_hit_from( const int rootIndex = 0; const int leafStart = (int)nfaces - 1; - // TODO implement BVH travering (with stack, don't use recursion) + int stack[100]; + int stackSize = 0; + stack[stackSize++] = rootIndex; + + float tNear; + float tFar; + + while (stackSize > 0) { + int nodeIndex = stack[--stackSize]; + BVHNodeGPU node = nodes[nodeIndex]; + + if (!intersect_ray_aabb_any(orig, dir, node.aabb, &tNear, &tFar)) + continue; + + if (nodeIndex >= leafStart) { + uint leafIdx = (uint)(nodeIndex - leafStart); + uint triIndex = leafTriIndices[leafIdx]; + if ((int)triIndex == ignore_face) + continue; + + rassert(triIndex >= 0, 92048908); + rassert(triIndex < nfaces, 942048908); + + uint3 f = loadFace(faces, triIndex); + float3 v0 = loadVertex(vertices, f.x); + float3 v1 = loadVertex(vertices, f.y); + float3 v2 = loadVertex(vertices, f.z); + + float t, u, v; + if (intersect_ray_triangle_any(orig, dir, v0, v1, v2, false, &t, &u, &v)) + return true; + + } else { + rassert(stackSize < 100, 1); + stack[stackSize++] = (int)node.leftChildIndex; + stack[stackSize++] = (int)node.rightChildIndex; + } + } return false; } @@ -82,7 +176,7 @@ __kernel void ray_tracing_render_using_lbvh( const uint i = get_global_id(0); const uint j = get_global_id(1); - rassert(camera.magic_bits_guard == CAMERA_VIEW_GPU_MAGIC_BITS_GUARD, 786435342); + // rassert(camera.magic_bits_guard == CAMERA_VIEW_GPU_MAGIC_BITS_GUARD, 786435342); if (i >= camera->K.width || j >= camera->K.height) return; diff --git a/src/kernels/cl/zeros.cl b/src/kernels/cl/zeros.cl new file mode 100644 index 00000000..7a94d5de --- /dev/null +++ b/src/kernels/cl/zeros.cl @@ -0,0 +1,24 @@ +#ifdef __CLION_IDE__ +#include +#endif + +#include "helpers/rassert.cl" +#include "../defines.h" + +#include "../shared_structs/camera_gpu_shared.h" +#include "../shared_structs/bvh_node_gpu_shared.h" +#include "../shared_structs/aabb_gpu_shared.h" + +#include "camera_helpers.cl" +#include "geometry_helpers.cl" +#include "random_helpers.cl" + +__kernel void zeros( + __global uint* a, + uint n +) +{ + uint i = get_global_id(0); + if (i < n) + a[i] = 0; +} diff --git a/src/kernels/kernels.cpp b/src/kernels/kernels.cpp index 51a1c4b2..e8b885d5 100644 --- a/src/kernels/kernels.cpp +++ b/src/kernels/kernels.cpp @@ -3,6 +3,15 @@ #include "cl/generated_kernels/aplusb.h" #include "cl/generated_kernels/ray_tracing_render_brute_force.h" #include "cl/generated_kernels/ray_tracing_render_using_lbvh.h" +#include "cl/generated_kernels/merge_sort.h" +#include "cl/generated_kernels/morton_code_generation.h" +#include "cl/generated_kernels/lbvh_construction.h" +#include "cl/generated_kernels/lbvh_aabb_generation.h" +#include "cl/generated_kernels/bigbox_calc.h" + + +#include "cl/generated_kernels/zeros.h" + #include "vk/generated_kernels/aplusb_comp.h" #include "vk/generated_kernels/ray_tracing_render_brute_force_comp.h" @@ -55,6 +64,37 @@ const ProgramBinaries& getRTWithLBVH() { return opencl_binaries_ray_tracing_render_using_lbvh; } + +const ProgramBinaries& getMergeSort() +{ + return opencl_binaries_merge_sort; +} + +const ProgramBinaries& getMortonCode() +{ + return opencl_binaries_morton_code_generation; +} + +const ProgramBinaries& getLBVHConstruction() +{ + return opencl_binaries_lbvh_construction; +} + +const ProgramBinaries& getAABBGen() +{ + return opencl_binaries_lbvh_aabb_generation; +} + +const ProgramBinaries& getZeros() +{ + return opencl_binaries_zeros; +} + +const ProgramBinaries& getBigBoxCalc() +{ + return opencl_binaries_bigbox_calc; +} + } // namespace ocl namespace avk2 { diff --git a/src/kernels/kernels.h b/src/kernels/kernels.h index b3c989e6..5ecf7f34 100644 --- a/src/kernels/kernels.h +++ b/src/kernels/kernels.h @@ -28,6 +28,12 @@ const ProgramBinaries& getAplusB(); const ProgramBinaries& getRTBruteForce(); const ProgramBinaries& getRTWithLBVH(); +const ProgramBinaries& getMergeSort(); +const ProgramBinaries& getMortonCode(); +const ProgramBinaries& getLBVHConstruction(); +const ProgramBinaries& getAABBGen(); +const ProgramBinaries& getZeros(); +const ProgramBinaries& getBigBoxCalc(); } namespace avk2 { diff --git a/src/main_linear_bvh.cpp b/src/main_linear_bvh.cpp index 35676eea..0ce604b1 100644 --- a/src/main_linear_bvh.cpp +++ b/src/main_linear_bvh.cpp @@ -14,6 +14,7 @@ #include "io/scene_reader.h" #include "cpu_helpers/build_bvh_cpu.h" +#include "debug/debug_bvh.h" #include #include @@ -52,6 +53,26 @@ size_t countDiffs(const TypedImage &a, const TypedImage &b, T threshold) { return count; } +void dfs_epta(int u, std::vector &nodes, int nfaces) { + if (u >= nfaces - 1) { + return; + } + + BVHNodeGPU l = nodes[nodes[u].leftChildIndex]; + BVHNodeGPU r = nodes[nodes[u].rightChildIndex]; + + dfs_epta(nodes[u].leftChildIndex, nodes, nfaces); + dfs_epta(nodes[u].rightChildIndex, nodes, nfaces); + + nodes[u].aabb.min_x = std::min(l.aabb.min_x, r.aabb.min_x); + nodes[u].aabb.min_y = std::min(l.aabb.min_y, r.aabb.min_y); + nodes[u].aabb.min_z = std::min(l.aabb.min_z, r.aabb.min_z); + + nodes[u].aabb.max_x = std::max(l.aabb.max_x, r.aabb.max_x); + nodes[u].aabb.max_y = std::max(l.aabb.max_y, r.aabb.max_y); + nodes[u].aabb.max_z = std::max(l.aabb.max_z, r.aabb.max_z); +} + void run(int argc, char** argv) { // chooseGPUVkDevices: @@ -77,6 +98,14 @@ void run(int argc, char** argv) ocl::KernelSource ocl_rt_brute_force(ocl::getRTBruteForce()); ocl::KernelSource ocl_rt_with_lbvh(ocl::getRTWithLBVH()); + ocl::KernelSource ocl_merge_sort(ocl::getMergeSort()); + ocl::KernelSource ocl_morton_code(ocl::getMortonCode()); + ocl::KernelSource ocl_lbvh_construction(ocl::getLBVHConstruction()); + ocl::KernelSource ocl_lbvh_aabb_generation(ocl::getAABBGen()); + ocl::KernelSource ocl_bigbox_calc(ocl::getBigBoxCalc()); + + ocl::KernelSource ocl_zeros(ocl::getZeros()); + avk2::KernelSource vk_rt_brute_force(avk2::getRTBruteForce()); avk2::KernelSource vk_rt_with_lbvh(avk2::getRTWithLBVH()); @@ -88,7 +117,7 @@ void run(int argc, char** argv) "data/san-miguel/san-miguel.obj", }; - const int niters = 10; // при отладке удобно запускать одну итерацию + const int niters = 1; // при отладке удобно запускать одну итерацию std::vector gpu_rt_perf_mrays_per_sec; std::vector gpu_lbvh_perfs_mtris_per_sec; @@ -109,7 +138,7 @@ void run(int argc, char** argv) SceneGeometry scene = loadScene(scene_path); // если на каком-то датасете падает - удобно взять подможество треугольников - например просто вызовите scene.faces.resize(10000); const unsigned int nvertices = scene.vertices.size(); - const unsigned int nfaces = scene.faces.size(); + unsigned int nfaces = scene.faces.size(); rassert(nvertices > 0, 546345423523143); rassert(nfaces > 0, 54362452342); std::string scene_name = std::filesystem::path(scene_path).parent_path().filename().string(); @@ -205,6 +234,7 @@ void run(int argc, char** argv) double cpu_lbvh_time = 0.0; double rt_times_with_cpu_lbvh_sum = 0.0; + if (true) { std::vector lbvh_nodes_cpu; std::vector leaf_faces_indices_cpu; @@ -230,7 +260,7 @@ void run(int argc, char** argv) timer t; // TODO оттрасируйте лучи на GPU используя построенный на CPU LBVH - throw std::runtime_error(CODE_IS_NOT_IMPLEMENTED); + // throw std::runtime_error(CODE_IS_NOT_IMPLEMENTED); if (context.type() == gpu::Context::TypeOpenCL) { ocl_rt_with_lbvh.exec( @@ -290,15 +320,61 @@ void run(int argc, char** argv) // TODO постройте LBVH на GPU // TODO оттрасируйте лучи на GPU используя построенный на GPU LBVH - bool gpu_lbvg_gpu_rt_done = false; + bool gpu_lbvg_gpu_rt_done = true; + + gpu::gpu_mem_32u morton_codes(nfaces); + gpu::gpu_mem_32u face_indexes(nfaces); + gpu::gpu_mem_32u buffer1(nfaces + nfaces - 1); + gpu::gpu_mem_32u buffer2(nfaces + nfaces - 1); + gpu::gpu_mem_32u buffer3(nfaces + nfaces - 1); + + gpu::shared_device_buffer_typed lbvh_nodes_gpu(nfaces + nfaces - 1); + + + ocl_zeros.exec(gpu::WorkSize(GROUP_SIZE, nfaces + nfaces - 1), buffer2, nfaces + nfaces - 1); + ocl_zeros.exec(gpu::WorkSize(GROUP_SIZE, nfaces + nfaces - 1), buffer3, nfaces + nfaces - 1); + + gpu::shared_device_buffer_typed bigbox(1); if (gpu_lbvg_gpu_rt_done) { std::vector gpu_lbvh_times; for (int iter = 0; iter < niters; ++iter) { timer t; - // TODO постройте LBVH на GPU + ocl_bigbox_calc.exec(gpu::WorkSize(GROUP_SIZE, 1), nfaces, vertices_gpu, faces_gpu, bigbox.clmem()); + ocl_morton_code.exec(gpu::WorkSize(GROUP_SIZE, nfaces), nfaces, vertices_gpu, faces_gpu, morton_codes, bigbox.clmem()); + + std::vector codes(nfaces, 0); + std::vector indexes(nfaces, 0); + morton_codes.readN(codes.data(), nfaces); + + std::vector> zip(nfaces); + for (int i = 0; i < nfaces; ++i) { + zip[i].first = codes[i]; + zip[i].second = i; + } + + // EMY BOOGWE POХUЙ + std::sort(zip.begin(), zip.end()); + for (int i = 0; i < nfaces; ++i) { + codes[i] = zip[i].first; + indexes[i] = zip[i].second + nfaces - 1; + } + + morton_codes.writeN(codes.data(), nfaces); + face_indexes.writeN(indexes.data(), nfaces); + + ocl_lbvh_construction.exec(gpu::WorkSize(GROUP_SIZE, nfaces), morton_codes, face_indexes, faces_gpu, vertices_gpu, lbvh_nodes_gpu.clmem(), buffer1, nfaces); + + ocl_lbvh_aabb_generation.exec(gpu::WorkSize(GROUP_SIZE, 2*nfaces - 1), morton_codes, face_indexes, faces_gpu, vertices_gpu, lbvh_nodes_gpu.clmem(), buffer1, buffer2, nfaces, 1); + for (int layer = 0; layer < 32; layer++) { + ocl_lbvh_aabb_generation.exec(gpu::WorkSize(GROUP_SIZE, 2*nfaces - 1), morton_codes, face_indexes, faces_gpu, vertices_gpu, lbvh_nodes_gpu.clmem(), buffer1, buffer2, nfaces, 0); + } + // std::vector nodes(nfaces + nfaces - 1); + // lbvh_nodes_gpu.readN(nodes.data(), nfaces + nfaces - 1); + + printf("build bvh ok!\n"); gpu_lbvh_times.push_back(t.elapsed()); } gpu_lbvh_time_sum = stats::sum(gpu_lbvh_times); @@ -317,6 +393,12 @@ void run(int argc, char** argv) timer t; // TODO оттрасируйте лучи на GPU используя построенный на GPU LBVH + ocl_rt_with_lbvh.exec( + gpu::WorkSize(16, 16, width, height), + vertices_gpu, faces_gpu, + lbvh_nodes_gpu.clmem(), face_indexes.clmem(), + framebuffer_face_id_gpu, framebuffer_ambient_occlusion_gpu, + camera_gpu.clmem(), nfaces); gpu_lbvh_rt_times.push_back(t.elapsed()); } diff --git a/test.txr b/test.txr new file mode 100644 index 00000000..e69de29b