From 254cb069a3e5ad73f5cea2c97780c8935dc1ec90 Mon Sep 17 00:00:00 2001 From: Anton Ljungdahl Date: Fri, 2 May 2025 12:38:43 +0200 Subject: [PATCH] working bvh on CPU --- .gitignore | 2 + build.bat | 4 +- src/base_core.cu | 33 +++ src/base_core.cuh | 60 +++++ src/base_core.h | 35 --- src/{base_math.c => base_math.cu} | 96 +++++++- src/{base_math.h => base_math.cuh} | 20 +- src/main.cu | 342 ++++++++++++++++++++++++----- src/rayt_bvh.cu | 136 ++++++++++++ src/rayt_bvh.cuh | 27 +++ src/{rayt_core.c => rayt_core.cu} | 105 ++++++++- src/{rayt_core.h => rayt_core.cuh} | 21 +- timeBuild.ctm | Bin 2980 -> 4724 bytes 13 files changed, 777 insertions(+), 104 deletions(-) create mode 100644 src/base_core.cu create mode 100644 src/base_core.cuh delete mode 100644 src/base_core.h rename src/{base_math.c => base_math.cu} (50%) rename src/{base_math.h => base_math.cuh} (73%) create mode 100644 src/rayt_bvh.cu create mode 100644 src/rayt_bvh.cuh rename src/{rayt_core.c => rayt_core.cu} (58%) rename src/{rayt_core.h => rayt_core.cuh} (59%) diff --git a/.gitignore b/.gitignore index 47e5e73..d26530c 100644 --- a/.gitignore +++ b/.gitignore @@ -115,3 +115,5 @@ build/ *.exr *.bmp *.png +*.sublime* +*.ctm) \ No newline at end of file diff --git a/build.bat b/build.bat index b169f39..92ece1a 100644 --- a/build.bat +++ b/build.bat @@ -15,7 +15,9 @@ set CudaRemoveWarnings=-diag-suppress 177 IF NOT EXIST .\build mkdir .\build pushd .\build -nvcc %CudaSources% %CudaRemoveWarnings% -o program.exe + +@rem nvcc %CudaRemoveWarnings% -G -g -lineinfo -o program.exe %CudaSources% +nvcc %CudaRemoveWarnings% -o program.exe %CudaSources% set LastError=%ERRORLEVEL% popd diff --git a/src/base_core.cu b/src/base_core.cu new file mode 100644 index 0000000..2fb6a77 --- /dev/null +++ b/src/base_core.cu @@ -0,0 +1,33 @@ + +__host__ function F64 +get_cpu_frequency() +{ + LARGE_INTEGER freq; + QueryPerformanceFrequency(&freq); + U64 start_tsc = __rdtsc(); + Sleep(100); + U64 end_tsc = __rdtsc(); + F64 cyclers_per_ms = (F64)(end_tsc - start_tsc) / 100.0; + return cyclers_per_ms * 1000.0; // Cycles per second +} + +__host__ function void +timer() +{ + g_cpu_timer.second_to_last_cycles = g_cpu_timer.last_cycles; + g_cpu_timer.last_cycles = __rdtsc(); +} + +__host__ function F64 +timer_elapsed() +{ + U64 cycles = g_cpu_timer.last_cycles - g_cpu_timer.second_to_last_cycles; + F64 elapsed = (F64)cycles / (g_cpu_timer.cpu_freq/1000.0); + return elapsed; // ms +} + + +function F64 test_function() +{ + return 34.20; +} diff --git a/src/base_core.cuh b/src/base_core.cuh new file mode 100644 index 0000000..e2a890c --- /dev/null +++ b/src/base_core.cuh @@ -0,0 +1,60 @@ +#pragma once + +#define WIN32_LEAN_AND_MEAN +#include +#include + +#include +#include +#include +#include + +#include +#include + +//------------------------------------------------------------------------------------------ +//~ base defines + +#define host_global static +#define function static + +//~ typedefs +typedef int32_t S32; +typedef uint32_t U32; +typedef uint64_t U64; +typedef double F64; +typedef float F32; + +//~ utility defines + +#define CUDA_CHECK(err) do { \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \ + cudaGetErrorString(err), __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define LOG printf + +#define F32_MAX FLT_MAX +#define F32_MIN -FLT_MAX + + +typedef struct CPUTimer CPUTimer; +struct CPUTimer +{ + U64 last_cycles; + U64 second_to_last_cycles; + F64 elapsed; + F64 cpu_freq; +}; + + +//------------------------------------------------------------------------------------------ +//~ timing +__host__ function F64 get_cpu_frequency(); +__host__ function void timer(); +__host__ function F64 timer_elapsed(); +function F64 test_function(); + diff --git a/src/base_core.h b/src/base_core.h deleted file mode 100644 index f4ec8b7..0000000 --- a/src/base_core.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include - -//------------------------------------------------------------------------------------------ -//~ base defines - -#define host_global static -#define function static - -//~ typedefs -typedef int32_t S32; -typedef uint32_t U32; -typedef uint64_t U64; -typedef float F32; - -//~ utility defines - -#define CUDA_CHECK(err) do { \ - if (err != cudaSuccess) { \ - fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \ - cudaGetErrorString(err), __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } \ -} while (0) - -#define LOG printf - -#define F32_MAX FLT_MAX -#define F32_MIN FLT_MIN diff --git a/src/base_math.c b/src/base_math.cu similarity index 50% rename from src/base_math.c rename to src/base_math.cu index f87f443..4cf4205 100644 --- a/src/base_math.c +++ b/src/base_math.cu @@ -1,4 +1,24 @@ +__host__ inline function Vec3F32 +h_max_V3F32(Vec3F32 a, Vec3F32 b) +{ + Vec3F32 out = {0}; + out.x = a.x > b.x ? a.x : b.x; + out.y = a.y > b.y ? a.y : b.y; + out.z = a.z > b.z ? a.z : b.z; + return out; +} + +__host__ inline function Vec3F32 +h_min_V3F32(Vec3F32 a, Vec3F32 b) +{ + Vec3F32 out = {0}; + out.x = a.x < b.x ? a.x : b.x; + out.y = a.y < b.y ? a.y : b.y; + out.z = a.z < b.z ? a.z : b.z; + return out; +} + __host__ __device__ inline function Vec3F32 vec3F32(F32 x, F32 y, F32 z) { @@ -40,6 +60,16 @@ scale_V3F32(F32 s, Vec3F32 v) return out; } +__host__ __device__ inline function Vec3F32 +cross_V3F32(Vec3F32 a, Vec3F32 b) +{ + Vec3F32 out = {0}; + out.x = a.y*b.z-a.z*b.y; + out.y = a.z*b.x-a.x*b.z; + out.z = a.x*b.y-a.y*b.x; + return out; +} + __host__ __device__ inline function F32 dot_V3F32(Vec3F32 a, Vec3F32 b) { @@ -48,9 +78,9 @@ dot_V3F32(Vec3F32 a, Vec3F32 b) __host__ __device__ inline function Vec3F32 -ray_point_F32(F32 t, RayF32 ray) +ray_point_F32(F32 t, RayF32 *ray) { - Vec3F32 out = add_V3F32(ray.origin, scale_V3F32(t, ray.direction)); + Vec3F32 out = add_V3F32(ray->origin, scale_V3F32(t, ray->direction)); return out; } @@ -136,4 +166,66 @@ clamp_V3F32(RngF32 rng, Vec3F32 v) return out; } +__host__ function F32 +rand_uniform_host_F32() +{ + F32 rand_max = (F32)RAND_MAX; + U32 r = rand(); + F32 rf = (F32)r; + F32 out = rf/rand_max; + return out; +} +__host__ function Vec3F32 +vec3_rand_host_F32() +{ + Vec3F32 out = {0}; + out.x = rand_uniform_host_F32(); + out.y = rand_uniform_host_F32(); + out.z = rand_uniform_host_F32(); + return out; +} + +__host__ function U32 +h_intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far) +{ + F32 tx1 = (bmin.x - ray->origin.x) / ray->direction.x; + F32 tx2 = (bmax.x - ray->origin.x) / ray->direction.x; + F32 tmin = MIN(tx1, tx2); + F32 tmax = MAX(tx1, tx2); + + F32 ty1 = (bmin.y - ray->origin.y) / ray->direction.y; + F32 ty2 = (bmax.y - ray->origin.y) / ray->direction.y; + tmin = MAX(tmin, MIN(ty1, ty2)); + tmax = MIN(tmax, MAX(ty1, ty2)); + + F32 tz1 = (bmin.z - ray->origin.z) / ray->direction.z; + F32 tz2 = (bmax.z - ray->origin.z) / ray->direction.z; + tmin = MAX(tmin, MIN(tz1, tz2)); + tmax = MIN(tmax, MAX(tz1, tz2)); + + U32 out = tmax >= tmin && tmin < closest_so_far && tmax > 0.0f; + return out; +} + +__device__ function U32 +intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far) +{ + F32 tx1 = (bmin.x - ray->origin.x) / ray->direction.x; + F32 tx2 = (bmax.x - ray->origin.x) / ray->direction.x; + F32 tmin = fminf(tx1, tx2); + F32 tmax = fmaxf(tx1, tx2); + + F32 ty1 = (bmin.y - ray->origin.y) / ray->direction.y; + F32 ty2 = (bmax.y - ray->origin.y) / ray->direction.y; + tmin = fminf(tmin, fminf(ty1, ty2)); + tmax = fmaxf(tmax, fmaxf(ty1, ty2)); + + F32 tz1 = (bmin.z - ray->origin.z) / ray->direction.z; + F32 tz2 = (bmax.z - ray->origin.z) / ray->direction.z; + tmin = fminf(tmin, fminf(tz1, tz2)); + tmax = fmaxf(tmax, fmaxf(tz1, tz2)); + + U32 out = tmax >= tmin && tmin < closest_so_far && tmax > 0.0f; + return out; +} diff --git a/src/base_math.h b/src/base_math.cuh similarity index 73% rename from src/base_math.h rename to src/base_math.cuh index 960f7c3..ac2d394 100644 --- a/src/base_math.h +++ b/src/base_math.cuh @@ -1,5 +1,9 @@ #pragma once +#define MAX(a, b) (a) > (b) ? (a) : (b) +#define MIN(a, b) (a) > (b) ? (b) : (a) + + //------------------------------------------------------------------------------------------ //~ structs @@ -40,16 +44,21 @@ struct RayF32 //~ forward declarations + +__host__ inline function Vec3F32 h_max_V3F32(Vec3F32 a, Vec3F32 b); +__host__ inline function Vec3F32 h_min_V3F32(Vec3F32 a, Vec3F32 b); + __host__ __device__ inline function Vec3F32 vec3F32(F32 x, F32 y, F32 z); __host__ __device__ inline function Vec3F32 add_V3F32(Vec3F32 a, Vec3F32 b); __host__ __device__ inline function Vec3F32 sub_V3F32(Vec3F32 a, Vec3F32 b); __host__ __device__ inline function Vec3F32 scale_V3F32(F32 s, Vec3F32 v); - +__host__ __device__ inline function Vec3F32 cross_V3F32(Vec3F32 a, Vec3F32 b); __host__ __device__ inline function Vec3F32 ray_point_F32(F32 t, RayF32 *ray); __host__ __device__ inline function F32 mag_V3F32(Vec3F32 a); __host__ __device__ inline function F32 dot_V3F32(Vec3F32 a, Vec3F32 b); __device__ inline function F32 norm_V3F32(Vec3F32 a); +__host__ inline function F32 h_norm_V3F32(Vec3F32 a); __host__ __device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b); @@ -57,7 +66,14 @@ __device__ function Vec3F32 rand_uniform_V3F32(curandState *local_rand_state); __device__ function Vec3F32 rand_uniform_range_V3F32(RngF32 rng, curandState *local_rand_state); __host__ function F32 linear_to_gamma(F32 val); -__host__ inline function F32 h_norm_V3F32(Vec3F32 a); __device__ function F32 clamp_F32(RngF32 rng, F32 val); __device__ function Vec3F32 clamp_V3F32(RngF32 rng, Vec3F32 v); + +__host__ function F32 rand_uniform_host_F32(); +__host__ function Vec3F32 vec3_rand_host_F32(); + +__host__ function U32 +h_intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far); +__device__ function U32 +intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far); diff --git a/src/main.cu b/src/main.cu index 9698786..1a64c38 100644 --- a/src/main.cu +++ b/src/main.cu @@ -1,13 +1,20 @@ -#include "base_core.h" -#include "base_math.h" -#include "rayt_core.h" +#define RENDER_ON_CPU 1 +#define BVH_USE_CPU 1 +#define DEBUG_DRAW_BBOX 0 + +//------------------------------------------------------------------------------------------ +//~ header includes +#include "base_core.cuh" +#include "base_math.cuh" +#include "rayt_core.cuh" +#include "rayt_bvh.cuh" //------------------------------------------------------------------------------------------ //~ Program parameter defines #define NUM_BLOCKS 1 #define NUM_THREADS 32 -#define IMAGE_WIDTH 1920 +#define IMAGE_WIDTH 1024 #define ASPECT_RATIO 1.7778f // 16/9 #define CURAND_SEED 1984 @@ -17,15 +24,30 @@ #define SAMPLES_PER_PIXEL 64 #define MAX_DIFFUSE_DEPTH 8 -#include "base_math.c" -#include "rayt_core.c" - //------------------------------------------------------------------------------------------ //~ host globals +host_global CPUTimer g_cpu_timer; host_global CameraF32 h_camera; host_global ViewportF32 h_viewport; host_global ImageF32 h_image; +host_global Entity *h_entities = 0; +host_global U32 *h_tri_indices = 0; +host_global BVH h_bvh; +//~ device globals +__constant__ CameraF32 camera; +__constant__ ViewportF32 viewport; +__constant__ ImageF32 image; + +//------------------------------------------------------------------------------------------ +//~ implementation includes +#include "base_core.cu" +#include "base_math.cu" +#include "rayt_core.cu" +#include "rayt_bvh.cu" + +//------------------------------------------------------------------------------------------ +//~ routines __global__ void cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state) { @@ -52,15 +74,11 @@ cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state) } -//------------------------------------------------------------------------------------------ -//~ Main -int main() +__host__ function void +set_up_scene_globals() { - cudaError_t cuErr; - ////////////////////////////////////////////////////////////////////////////////////////// - // Define image, camera and viewport on the CPU - // and then copy to constant globals on device + // Define image, camera and viewport on the CPU // ------------- h_image = {0}; h_image.width = IMAGE_WIDTH; @@ -68,21 +86,17 @@ int main() U32 height = U32((F32)h_image.width/h_image.aspect_ratio) + 1; h_image.height = height < 1 ? 1 : height; h_image.total_num_pixels = h_image.width * h_image.height; - cuErr = cudaMemcpyToSymbol(image, &h_image, sizeof(ImageF32), 0, cudaMemcpyHostToDevice); - CUDA_CHECK(cuErr); + LOG("Image size %i x %i, aspect ratio: %.4f \n", h_image.width, h_image.height, h_image.aspect_ratio); // ------------- h_camera = {0}; - h_camera.focal_length = 1.0f; + h_camera.focal_length = 3.0f; + h_camera.center = vec3F32(0.0f, 0.0f, 18.0f); F32 samples_per_pixel = (F32)SAMPLES_PER_PIXEL; h_camera.pixel_sample_scale = 1.0f/samples_per_pixel; - - cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0, - cudaMemcpyHostToDevice); - CUDA_CHECK(cuErr); - + // ------------- h_viewport = {0}; h_viewport.height = 2.0f; @@ -108,43 +122,36 @@ int main() h_viewport.pixel_origin = add_V3F32(viewport_upper_left, scale_V3F32(0.5f, pixel_delta_sum)); - cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0, - cudaMemcpyHostToDevice); - CUDA_CHECK(cuErr); LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n", h_viewport.width, h_viewport.height, h_viewport.aspect_ratio); +} - ////////////////////////////////////////////////////////////////////////////////////////// - // Setup entities and copy to device - U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES; - Entity *h_entities = (Entity *)malloc(entity_list_byte_size); - memset(h_entities, 0, entity_list_byte_size); - for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1) - { - // Init all entities to nil - h_entities[i].kind = EntityKind_Nil; - } +__host__ function void +copy_to_device_and_launch_cuda_main() +{ + cudaError_t cuErr; - // Manual spheres - { - h_entities[0].kind = EntityKind_Sphere; - h_entities[0].center = vec3F32(0.0f, 0.0f, -1.0f); - h_entities[0].radius = 0.5f; + // Copy constants + cuErr = cudaMemcpyToSymbol(image, &h_image, sizeof(ImageF32), 0, cudaMemcpyHostToDevice); + CUDA_CHECK(cuErr); - h_entities[1].kind = EntityKind_Sphere; - h_entities[1].center = vec3F32(0.0f, -100.5f, -1.0f); - h_entities[1].radius = 100.0f; - } + cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0, + cudaMemcpyHostToDevice); + CUDA_CHECK(cuErr); - // Copy to device + cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0, + cudaMemcpyHostToDevice); + CUDA_CHECK(cuErr); + + + // Create and copy buffers to device Entity *entities = 0; + U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES; cuErr = cudaMalloc(&entities, entity_list_byte_size); CUDA_CHECK(cuErr); cuErr = cudaMemcpy(entities, h_entities, entity_list_byte_size, cudaMemcpyHostToDevice); CUDA_CHECK(cuErr); - - ////////////////////////////////////////////////////////////////////////////////////////// // Define grid, blocks, threads and any buffers such as pixel data and random state // ------------ @@ -201,14 +208,241 @@ int main() cudaMemcpyDeviceToHost); CUDA_CHECK(cuErr); - write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height); + write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, "gpu_output.ppm"); - cuErr = cudaFree(pixel_buffer); - CUDA_CHECK(cuErr); - cuErr = cudaFree(entities); - CUDA_CHECK(cuErr); - cuErr = cudaFree(rand_state); - CUDA_CHECK(cuErr); + cuda_free(pixel_buffer); + cuda_free(entities); + cuda_free(rand_state); + + free(h_pixel_buffer); +} + +__host__ function void +set_up_entities() +{ + ////////////////////////////////////////////////////////////////////////////////////////// + // Setup entities + U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES; + h_entities = (Entity *)malloc(entity_list_byte_size); + memset(h_entities, 0, entity_list_byte_size); + +#if 0 + for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1) + { + // Init all entities to nil + h_entities[i].kind = EntityKind_Nil; + } +#endif + + // Random triangles + { + h_tri_indices = (U32 *)malloc(sizeof(U32)*MAX_NUM_ENTITIES); + for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1) + { + Vec3F32 r0 = vec3_rand_host_F32(); + Vec3F32 r1 = vec3_rand_host_F32(); + Vec3F32 r2 = vec3_rand_host_F32(); + // Put the first vertex within a 10x10x10 cube centered on the origin. + Vec3F32 v0 = scale_V3F32(9.0f, r0); + v0 = sub_V3F32(v0, vec3F32(5.0f, 5.0f, 5.0f)); + h_entities[i].kind = EntityKind_Tri; + h_entities[i].vertex0 = v0; + // The other two vertices are relative to the first. + h_entities[i].vertex1 = add_V3F32(v0, r1); + h_entities[i].vertex2 = add_V3F32(v0, r2); + Vec3F32 center = add_V3F32(h_entities[i].vertex0, + add_V3F32(h_entities[i].vertex1, h_entities[i].vertex2)); + center = scale_V3F32(0.3333f, center); + h_entities[i].center = center; + h_tri_indices[i] = i; + #if 0 + LOG("tri index[%i] = %i before bvh construction \n", i, h_tri_indices[i]); + #else + #endif + } + } + +} + +__host__ function void +set_up_bvh() +{ + h_bvh = bvh_build(); + +#if 0 + { + U32 total_leaf_nodes = 0; + for(U32 i = 0; i < h_bvh.max_num_nodes; i+=1) + { + BVHNode *node = &h_bvh.nodes[i]; + if(node->tri_count) + { + LOG("\n----\n"); + LOG("Leaf node with idx %i with tri count %i \n", i, node->tri_count); + total_leaf_nodes += 1; + LOG("Index into triangle index list, node->left_first: %i \n", node->left_first); + LOG("leaf node aabb_min = (%.2f %.2f %.2f) \n", + node->aabb_min.x, node->aabb_min.y, node->aabb_min.z); + LOG("leaf node aabb_max = (%.2f %.2f %.2f) \n", + node->aabb_max.x, node->aabb_max.y, node->aabb_max.z); + Entity *tri = &h_entities[h_tri_indices[node->left_first]]; + LOG("Triangle v0: (%.2f, %.2f %.2f) \n", + tri->vertex0.x, tri->vertex0.y, tri->vertex0.z); + LOG("Triangle v1: (%.2f, %.2f %.2f) \n", + tri->vertex1.x, tri->vertex1.y, tri->vertex1.z); + LOG("Triangle v2: (%.2f, %.2f %.2f) \n", + tri->vertex2.x, tri->vertex2.y, tri->vertex2.z); + LOG("----\n\n"); + } + } + LOG("Total number of leaf nodes %i \n", total_leaf_nodes); + } +#endif +} + +//------------------------------------------------------------------------------------------ +//~ Main +int main() +{ + g_cpu_timer = {0}; + g_cpu_timer.cpu_freq = get_cpu_frequency(); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + set_up_scene_globals(); + set_up_entities(); + set_up_bvh(); + +#if RENDER_ON_CPU + LOG("Starting CPU rendering \n"); + //cudaEventRecord(start, 0); + timer(); + // Render "ground truth" on CPU for validation + { + U64 num_pixels = h_image.width*h_image.height; + U64 pixel_buffer_size = num_pixels*sizeof(Vec3F32); + Vec3F32 *host_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size); + for(U32 y = 0; y < h_image.height; y += 1) + { + for(U32 x = 0; x < h_image.width; x += 1) + { + U32 idx = y * h_image.width + x; + + Vec3F32 px_u = scale_V3F32((F32)x, h_viewport.pixel_delta_u); + Vec3F32 px_v = scale_V3F32((F32)y, h_viewport.pixel_delta_v); + Vec3F32 pixel_center = add_V3F32(h_viewport.pixel_origin, add_V3F32(px_u, px_v)); + + Vec3F32 ray_direction = sub_V3F32(pixel_center, h_camera.center); + RayF32 ray = {0}; + ray.origin = h_camera.center; + ray.direction = ray_direction; + HitRecord hit_rec = {0}; + hit_rec.t = F32_MAX; +#if BVH_USE_CPU + { + bvh_host_intersect(&h_bvh, &ray, &hit_rec, h_bvh.root_index); + if(hit_rec.hit) + { + //LOG("BVH hit triangle! hit_rec->normal: (%.2f, %.2f, %.2f) \n", + // hit_rec.normal.x, hit_rec.normal.y, hit_rec.normal.z); + } + } +#else + { + HitRecord temp_hit_rec = {0}; + temp_hit_rec.t = hit_rec.t; + for (U32 i = 0; i < MAX_NUM_ENTITIES; i+=1) + { + Entity *tri = &h_entities[i]; + hit_triangle_host(&ray, &temp_hit_rec, tri); + if(temp_hit_rec.hit) + { + hit_rec = temp_hit_rec; + } + } + } +#endif + + Vec3F32 pixel_color = {0.0f, 0.0f, 0.0f}; + if(hit_rec.hit) + { + // Paint entity + pixel_color = add_V3F32(hit_rec.normal, vec3F32(1.0f, 1.0f, 1.0f)); + pixel_color = scale_V3F32(0.5f, pixel_color); + } + else + { + // Paint background gradient + F32 norm = h_norm_V3F32(ray.direction); + Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction); + Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f); + Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f); + + // Lerp between white and light blue depending on y position + F32 blend = 0.5f*(unit_dir.y + 1.0f); + + pixel_color = lerp_V3F32(blend, white, light_blue); + + #if DEBUG_DRAW_BBOX + { + U32 do_debug_pixel = 0; + Vec3F32 color_index = {0.0f, 0.0f, 0.0f}; + for(U32 i = 0; i < h_bvh.max_num_nodes; i += 1) + { + BVHNode *node = &h_bvh.nodes[i]; + if(node->tri_count > 0 && h_intersect_aabb(&ray, node->aabb_min, node->aabb_max, F32_MAX)) + { + do_debug_pixel = 1; + } + if(do_debug_pixel) + { + color_index.x = 1.0f; + color_index.y = 0.0f; + color_index.z = 0.0f; + } + } + + if(do_debug_pixel) + { + pixel_color = color_index; + } + } + #endif + + } + + // Debug draw bvh + + + host_pixel_buffer[idx] = pixel_color; + + } + } + //cudaEventRecord(stop, 0); + //cudaEventSynchronize(stop); + timer(); + + { + F32 elapsed = timer_elapsed(); + //cudaEventElapsedTime(&elapsed, start, stop); + LOG("Elapsed time for CPU rendering: %.2f ms \n", elapsed); + U32 bvh_used = 0; +#if BVH_USE_CPU + bvh_used = 1; +#endif + LOG("BVH = %i \n", bvh_used); + } + + cudaEventDestroy(stop); + cudaEventDestroy(start); + + + write_buffer_to_ppm(host_pixel_buffer, h_image.width, h_image.height, "cpu_output.ppm"); + free(host_pixel_buffer); + } +#endif return 0; } diff --git a/src/rayt_bvh.cu b/src/rayt_bvh.cu new file mode 100644 index 0000000..4128924 --- /dev/null +++ b/src/rayt_bvh.cu @@ -0,0 +1,136 @@ + +__host__ function BVH +bvh_build() +{ + U64 max_bvh_nodes = 2 * MAX_NUM_ENTITIES - 1; + BVH bvh = {0}; + bvh.nodes = (BVHNode *)_aligned_malloc(sizeof(BVHNode)*max_bvh_nodes, 64); + bvh.max_num_nodes = max_bvh_nodes; + bvh.used_nodes = 2; // Skip by two, TODO(anton): Comment this. + bvh.minimum_entities_in_leaf = 2; + U32 root_index = 0; + BVHNode *root = &bvh.nodes[root_index]; + root->left_first = 0; + root->tri_count = MAX_NUM_ENTITIES; + + bvh_update_bounds(&bvh, 0); + + bvh_subdivide(&bvh, 0); + + return bvh; +} + +__host__ function void +bvh_subdivide(BVH *bvh, U32 node_idx) +{ + BVHNode *node = &bvh->nodes[node_idx]; + if(node->tri_count <= bvh->minimum_entities_in_leaf) + { + return; + } + + // Split box + Vec3F32 extent = sub_V3F32(node->aabb_max, node->aabb_min); + U32 axis = 0; + if(extent.y > extent.x) axis = 1; + if(extent.z > extent.v[axis]) axis = 2; + F32 split_pos = node->aabb_min.v[axis] + extent.v[axis] * 0.5f; + + // Sorting into left and right partitions + U32 i = node->left_first; + U32 j = node->tri_count + i - 1; + while (i <= j) + { + U32 tri_idx = h_tri_indices[i]; + if(h_entities[tri_idx].center.v[axis] < split_pos) + { + i += 1; + } + else + { + h_tri_indices[i] = h_tri_indices[j]; + h_tri_indices[j] = tri_idx; + j -= 1; + } + } + + U32 left_count = i - node->left_first; + if(left_count == 0 || left_count == node->tri_count) + { + // One of the partitions are empty, don't subdivide further. + return; + } + + // Create child nodes and subdivide + U32 left_child_index = bvh->used_nodes++; + U32 right_child_index = bvh->used_nodes++; + + bvh->nodes[left_child_index].left_first = node->left_first; + bvh->nodes[left_child_index].tri_count = left_count; + bvh->nodes[right_child_index].left_first = i; + bvh->nodes[right_child_index].tri_count = node->tri_count - left_count; + node->left_first = left_child_index; + node->tri_count = 0; + bvh_update_bounds(bvh, left_child_index); + bvh_update_bounds(bvh, right_child_index); + bvh_subdivide(bvh, left_child_index); + bvh_subdivide(bvh, right_child_index); +} + + +__host__ function void +bvh_update_bounds(BVH *bvh, U32 node_idx) +{ + BVHNode *node = &bvh->nodes[node_idx]; + + node->aabb_min = vec3F32(F32_MAX, F32_MAX, F32_MAX); + node->aabb_max = vec3F32(F32_MIN, F32_MIN, F32_MIN); + + U32 first_tri_idx = node->left_first; + for(U32 i = 0; i < node->tri_count; i += 1) + { + U32 leaf_tri_idx = h_tri_indices[first_tri_idx + i]; + Entity *tri = &h_entities[leaf_tri_idx]; + node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex0); + node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex1); + node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex2); + node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex0); + node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex1); + node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex2); + } +} + +__host__ function void +bvh_host_intersect(BVH *bvh, RayF32 *ray, HitRecord *rec, U32 node_idx) +{ + BVHNode *node = &bvh->nodes[node_idx]; + U32 any_hit = 0; + if(h_intersect_aabb(ray, node->aabb_min, node->aabb_max, rec->t)) + { + if(node->tri_count > 0) + { + //LOG("Hit a leaf node %i with tri count %i \n", node_idx, node->tri_count); + for(U32 i = 0; i < node->tri_count; i+=1) + { + U32 tri_index = h_tri_indices[node->left_first + i]; + Entity *tri = &h_entities[tri_index]; + hit_triangle_host(ray, rec, tri); + if(rec->hit) + { + any_hit = 1; + //LOG("got hit in bvh_host_intersect loop \n"); + } + } + } + else + { + bvh_host_intersect(bvh, ray, rec, node->left_first); + bvh_host_intersect(bvh, ray, rec, node->left_first + 1); + } + } + + if(!rec->hit) + { + rec->hit = any_hit; + } +} diff --git a/src/rayt_bvh.cuh b/src/rayt_bvh.cuh new file mode 100644 index 0000000..9575dd0 --- /dev/null +++ b/src/rayt_bvh.cuh @@ -0,0 +1,27 @@ +#pragma once + +// 32 bytes node +typedef struct BVHNode BVHNode; +struct BVHNode +{ + Vec3F32 aabb_min; + Vec3F32 aabb_max; + U32 left_first; + U32 tri_count; +}; + +typedef struct BVH BVH; +struct BVH +{ + BVHNode *nodes; + U32 used_nodes; + U32 root_index; + U32 max_num_nodes; + U32 num_leaf_nodes; + U32 minimum_entities_in_leaf; +}; + +__host__ function BVH bvh_build(); +__host__ function void bvh_update_bounds(BVH *bvh, U32 node_idx); +__host__ function void bvh_subdivide(BVH *bvh, U32 node_idx); +__host__ function void bvh_host_intersect(BVH *bvh, RayF32 *ray, HitRecord *rec, U32 node_idx); diff --git a/src/rayt_core.c b/src/rayt_core.cu similarity index 58% rename from src/rayt_core.c rename to src/rayt_core.cu index 0085b8d..7347466 100644 --- a/src/rayt_core.c +++ b/src/rayt_core.cu @@ -1,16 +1,11 @@ -//~ device globals -__constant__ CameraF32 camera; -__constant__ ViewportF32 viewport; -__constant__ ImageF32 image; - __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, - U32 image_height) + U32 image_height, + const char *filename) { - const char *filename = "output.ppm"; FILE *file = fopen(filename, "w"); if(!file) { @@ -133,6 +128,95 @@ get_sample_color(RayF32 ray, Entity *entities, curandState *local_rand_state) return out; } +// Common function for use on both host and device, +__host__ __device__ inline function void +triangle_intersection_common(RayF32 *ray, HitRecord *rec, Vec3F32 edge1, Vec3F32 edge2, + Entity* triangle) +{ + // Möller-Trumbore intersection algorithm + Vec3F32 h = cross_V3F32(ray->direction, edge2); + F32 closest_so_far = rec->t; + F32 a = dot_V3F32(edge1, h); + if(a <= -0.001f || a >= 0.001f) + { + F32 f = 1.0f/a; + Vec3F32 s = sub_V3F32(ray->origin, triangle->vertex0); + F32 u = f * dot_V3F32(s, h); + if(u >= 0.0f && u <= 1.0f) + { + Vec3F32 q = cross_V3F32(s, edge1); + F32 v = f * dot_V3F32(ray->direction, q); + if(v >= 0.0f && (u+v) <= 1.0f) + { + F32 t = f * dot_V3F32(edge2, q); + if(t > 0.0001f) + { + if(t <= closest_so_far) + { + rec->t = t; + rec->hit = 1; + } + } + } + } + } + +} + +__host__ function void +hit_triangle_host(RayF32 *ray, HitRecord *rec, Entity *triangle) +{ + + Vec3F32 edge1 = sub_V3F32(triangle->vertex1, triangle->vertex0); + Vec3F32 edge2 = sub_V3F32(triangle->vertex2, triangle->vertex0); + rec->hit = 0; + triangle_intersection_common(ray, rec, edge1, edge2, triangle); + + // Set the point of intersection and the normal of the, for now, + // vertex0 of the triangle. We have to get the actual surface normal at some point. + if(rec->hit) + { + Vec3F32 intersection_point = ray_point_F32(rec->t, ray); + Vec3F32 v0_normal = cross_V3F32(edge1, edge2); + F32 norm_inv = 1.0f/h_norm_V3F32(v0_normal); + v0_normal = scale_V3F32(norm_inv, v0_normal); + + F32 front_face = dot_V3F32(ray->direction, v0_normal) < 0.0f; + rec->normal = front_face ? v0_normal : scale_V3F32(-1.0f, v0_normal); + rec->front_face = front_face; + rec->point = intersection_point; + //LOG("Hit triangle in hit_triangle_host! \n"); + } +} +__device__ function HitRecord +hit_triangle(RayF32 *ray, Entity *triangle, F32 closest_so_far) +{ + + Vec3F32 edge1 = sub_V3F32(triangle->vertex1, triangle->vertex0); + Vec3F32 edge2 = sub_V3F32(triangle->vertex2, triangle->vertex0); + HitRecord out = {0}; + triangle_intersection_common(ray, &out, edge1, edge2, triangle); + + // Set the point of intersection and the normal of the, for now, + // vertex0 of the triangle. We have to get the actual surface normal at some point. + if(out.hit) + { + Vec3F32 intersection_point = ray_point_F32(out.t, ray); + Vec3F32 v0_normal = cross_V3F32(edge1, edge2); + F32 norm_inv = 1.0f/norm_V3F32(v0_normal); + v0_normal = scale_V3F32(norm_inv, v0_normal); + + F32 front_face = dot_V3F32(ray->direction, v0_normal) < 0.0f; + out.normal = front_face ? v0_normal : scale_V3F32(-1.0f, v0_normal); + out.front_face = front_face; + out.point = intersection_point; + + } + + return out; +} + + __global__ void cuda_init_state(curandState *rand_state) @@ -148,3 +232,10 @@ cuda_init_state(curandState *rand_state) } } + +__host__ void +cuda_free(void *device_ptr) +{ + cudaError_t cuErr = cudaFree(device_ptr); + CUDA_CHECK(cuErr); +} diff --git a/src/rayt_core.h b/src/rayt_core.cuh similarity index 59% rename from src/rayt_core.h rename to src/rayt_core.cuh index 2ad3b9d..69bf5fc 100644 --- a/src/rayt_core.h +++ b/src/rayt_core.cuh @@ -38,6 +38,7 @@ enum EntityKind { EntityKind_Nil, EntityKind_Sphere, + EntityKind_Tri, Num_EntityKinds }; @@ -56,12 +57,26 @@ struct Entity { EntityKind kind; Vec3F32 center; + Vec3F32 vertex0; + Vec3F32 vertex1; + Vec3F32 vertex2; F32 radius; }; -__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32 image_height); +__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, + U32 image_height, const char *filename); -__device__ function RayF32 ray_get_F32(F32 x, F32 y, Vec3F32 cam_center, curandState *local_rand_state); -__global__ void cuda_init_state(curandState *rand_state); \ No newline at end of file +__device__ function RayF32 ray_get_F32(F32 x, F32 y, Vec3F32 cam_center, + curandState *local_rand_state); + +__host__ __device__ inline function void +triangle_intersection_common(RayF32 *ray, HitRecord *rec, Vec3F32 edge1, Vec3F32 edge2, + Entity* triangle); +__device__ function HitRecord hit_triangle(RayF32 *ray, Entity *triangle, + F32 closest_so_far); +__host__ function void hit_triangle_host(RayF32 *ray, HitRecord *rec, Entity *triangle); + +__global__ void cuda_init_state(curandState *rand_state); +__host__ void cuda_free(void *device_ptr); diff --git a/timeBuild.ctm b/timeBuild.ctm index 57cf5e3932e071b8c26e531b36abfe2b2a2a47e3..8fbcbf7d860eab14d999e1e8b185cde2bd646644 100644 GIT binary patch delta 1765 zcmYk6e@xVM7{@3&-VTE^?5%(-p}*-e7|*J z(@_m|TIJr6dAnY1b@})RAr|@xk=dV7#;ltwM6~@z#0sg05_{eqiD7P@w01HHL@TK4U~4;m@g9_d-{dXIaU0^>Yhku znpni2f=@A3= zR%UGIU+u*?mt;ZIOW8GC#Egcg^uO<8{zof>*;qU*SrJIkPcEIo0SvTLgWLXR{^JBc z(t8_1BO0Wh?oU@Vs83+l(uQ<@ZCuV=LwCGt_wf|wr^JmL!_t_c?}_SL1=i5}6Gm#7 z^noR3_CN^S@h5LbmvVxO^b_Nm<*Z|({!!m8jnWVdxTkCKz__AG(hFZ-9AK5ag!1El z*|C`!8@o`EndfEd?`>#pb{ydVta$fKeHYmKNT@;m~OvU`k38&l+ES^*@cl*3EM~;(%zZ`7(6jFjD$E$bztDV>xr?}S@%_QDQ7-#z2FflCG7 DF4qbg delta 7 OcmeyOvP68t5^ewvegiWA