working bvh on CPU

This commit is contained in:
Anton Ljungdahl 2025-05-02 12:38:43 +02:00
parent be0688fa9f
commit 254cb069a3
13 changed files with 777 additions and 104 deletions

2
.gitignore vendored
View File

@ -115,3 +115,5 @@ build/
*.exr
*.bmp
*.png
*.sublime*
*.ctm)

View File

@ -15,7 +15,9 @@ set CudaRemoveWarnings=-diag-suppress 177
IF NOT EXIST .\build mkdir .\build
pushd .\build
nvcc %CudaSources% %CudaRemoveWarnings% -o program.exe
@rem nvcc %CudaRemoveWarnings% -G -g -lineinfo -o program.exe %CudaSources%
nvcc %CudaRemoveWarnings% -o program.exe %CudaSources%
set LastError=%ERRORLEVEL%
popd

33
src/base_core.cu Normal file
View File

@ -0,0 +1,33 @@
__host__ function F64
get_cpu_frequency()
{
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
U64 start_tsc = __rdtsc();
Sleep(100);
U64 end_tsc = __rdtsc();
F64 cyclers_per_ms = (F64)(end_tsc - start_tsc) / 100.0;
return cyclers_per_ms * 1000.0; // Cycles per second
}
__host__ function void
timer()
{
g_cpu_timer.second_to_last_cycles = g_cpu_timer.last_cycles;
g_cpu_timer.last_cycles = __rdtsc();
}
__host__ function F64
timer_elapsed()
{
U64 cycles = g_cpu_timer.last_cycles - g_cpu_timer.second_to_last_cycles;
F64 elapsed = (F64)cycles / (g_cpu_timer.cpu_freq/1000.0);
return elapsed; // ms
}
function F64 test_function()
{
return 34.20;
}

60
src/base_core.cuh Normal file
View File

@ -0,0 +1,60 @@
#pragma once
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <intrin.h>
#include <stdio.h>
#include <stdint.h>
#include <float.h>
#include <math.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
//------------------------------------------------------------------------------------------
//~ base defines
#define host_global static
#define function static
//~ typedefs
typedef int32_t S32;
typedef uint32_t U32;
typedef uint64_t U64;
typedef double F64;
typedef float F32;
//~ utility defines
#define CUDA_CHECK(err) do { \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \
cudaGetErrorString(err), __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define LOG printf
#define F32_MAX FLT_MAX
#define F32_MIN -FLT_MAX
typedef struct CPUTimer CPUTimer;
struct CPUTimer
{
U64 last_cycles;
U64 second_to_last_cycles;
F64 elapsed;
F64 cpu_freq;
};
//------------------------------------------------------------------------------------------
//~ timing
__host__ function F64 get_cpu_frequency();
__host__ function void timer();
__host__ function F64 timer_elapsed();
function F64 test_function();

View File

@ -1,35 +0,0 @@
#pragma once
#include <stdio.h>
#include <stdint.h>
#include <float.h>
#include <math.h>
#include <curand_kernel.h>
//------------------------------------------------------------------------------------------
//~ base defines
#define host_global static
#define function static
//~ typedefs
typedef int32_t S32;
typedef uint32_t U32;
typedef uint64_t U64;
typedef float F32;
//~ utility defines
#define CUDA_CHECK(err) do { \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \
cudaGetErrorString(err), __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define LOG printf
#define F32_MAX FLT_MAX
#define F32_MIN FLT_MIN

View File

@ -1,4 +1,24 @@
__host__ inline function Vec3F32
h_max_V3F32(Vec3F32 a, Vec3F32 b)
{
Vec3F32 out = {0};
out.x = a.x > b.x ? a.x : b.x;
out.y = a.y > b.y ? a.y : b.y;
out.z = a.z > b.z ? a.z : b.z;
return out;
}
__host__ inline function Vec3F32
h_min_V3F32(Vec3F32 a, Vec3F32 b)
{
Vec3F32 out = {0};
out.x = a.x < b.x ? a.x : b.x;
out.y = a.y < b.y ? a.y : b.y;
out.z = a.z < b.z ? a.z : b.z;
return out;
}
__host__ __device__ inline function Vec3F32
vec3F32(F32 x, F32 y, F32 z)
{
@ -40,6 +60,16 @@ scale_V3F32(F32 s, Vec3F32 v)
return out;
}
__host__ __device__ inline function Vec3F32
cross_V3F32(Vec3F32 a, Vec3F32 b)
{
Vec3F32 out = {0};
out.x = a.y*b.z-a.z*b.y;
out.y = a.z*b.x-a.x*b.z;
out.z = a.x*b.y-a.y*b.x;
return out;
}
__host__ __device__ inline function F32
dot_V3F32(Vec3F32 a, Vec3F32 b)
{
@ -48,9 +78,9 @@ dot_V3F32(Vec3F32 a, Vec3F32 b)
__host__ __device__ inline function Vec3F32
ray_point_F32(F32 t, RayF32 ray)
ray_point_F32(F32 t, RayF32 *ray)
{
Vec3F32 out = add_V3F32(ray.origin, scale_V3F32(t, ray.direction));
Vec3F32 out = add_V3F32(ray->origin, scale_V3F32(t, ray->direction));
return out;
}
@ -136,4 +166,66 @@ clamp_V3F32(RngF32 rng, Vec3F32 v)
return out;
}
__host__ function F32
rand_uniform_host_F32()
{
F32 rand_max = (F32)RAND_MAX;
U32 r = rand();
F32 rf = (F32)r;
F32 out = rf/rand_max;
return out;
}
__host__ function Vec3F32
vec3_rand_host_F32()
{
Vec3F32 out = {0};
out.x = rand_uniform_host_F32();
out.y = rand_uniform_host_F32();
out.z = rand_uniform_host_F32();
return out;
}
__host__ function U32
h_intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far)
{
F32 tx1 = (bmin.x - ray->origin.x) / ray->direction.x;
F32 tx2 = (bmax.x - ray->origin.x) / ray->direction.x;
F32 tmin = MIN(tx1, tx2);
F32 tmax = MAX(tx1, tx2);
F32 ty1 = (bmin.y - ray->origin.y) / ray->direction.y;
F32 ty2 = (bmax.y - ray->origin.y) / ray->direction.y;
tmin = MAX(tmin, MIN(ty1, ty2));
tmax = MIN(tmax, MAX(ty1, ty2));
F32 tz1 = (bmin.z - ray->origin.z) / ray->direction.z;
F32 tz2 = (bmax.z - ray->origin.z) / ray->direction.z;
tmin = MAX(tmin, MIN(tz1, tz2));
tmax = MIN(tmax, MAX(tz1, tz2));
U32 out = tmax >= tmin && tmin < closest_so_far && tmax > 0.0f;
return out;
}
__device__ function U32
intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far)
{
F32 tx1 = (bmin.x - ray->origin.x) / ray->direction.x;
F32 tx2 = (bmax.x - ray->origin.x) / ray->direction.x;
F32 tmin = fminf(tx1, tx2);
F32 tmax = fmaxf(tx1, tx2);
F32 ty1 = (bmin.y - ray->origin.y) / ray->direction.y;
F32 ty2 = (bmax.y - ray->origin.y) / ray->direction.y;
tmin = fminf(tmin, fminf(ty1, ty2));
tmax = fmaxf(tmax, fmaxf(ty1, ty2));
F32 tz1 = (bmin.z - ray->origin.z) / ray->direction.z;
F32 tz2 = (bmax.z - ray->origin.z) / ray->direction.z;
tmin = fminf(tmin, fminf(tz1, tz2));
tmax = fmaxf(tmax, fmaxf(tz1, tz2));
U32 out = tmax >= tmin && tmin < closest_so_far && tmax > 0.0f;
return out;
}

View File

@ -1,5 +1,9 @@
#pragma once
#define MAX(a, b) (a) > (b) ? (a) : (b)
#define MIN(a, b) (a) > (b) ? (b) : (a)
//------------------------------------------------------------------------------------------
//~ structs
@ -40,16 +44,21 @@ struct RayF32
//~ forward declarations
__host__ inline function Vec3F32 h_max_V3F32(Vec3F32 a, Vec3F32 b);
__host__ inline function Vec3F32 h_min_V3F32(Vec3F32 a, Vec3F32 b);
__host__ __device__ inline function Vec3F32 vec3F32(F32 x, F32 y, F32 z);
__host__ __device__ inline function Vec3F32 add_V3F32(Vec3F32 a, Vec3F32 b);
__host__ __device__ inline function Vec3F32 sub_V3F32(Vec3F32 a, Vec3F32 b);
__host__ __device__ inline function Vec3F32 scale_V3F32(F32 s, Vec3F32 v);
__host__ __device__ inline function Vec3F32 cross_V3F32(Vec3F32 a, Vec3F32 b);
__host__ __device__ inline function Vec3F32 ray_point_F32(F32 t, RayF32 *ray);
__host__ __device__ inline function F32 mag_V3F32(Vec3F32 a);
__host__ __device__ inline function F32 dot_V3F32(Vec3F32 a, Vec3F32 b);
__device__ inline function F32 norm_V3F32(Vec3F32 a);
__host__ inline function F32 h_norm_V3F32(Vec3F32 a);
__host__ __device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b);
@ -57,7 +66,14 @@ __device__ function Vec3F32 rand_uniform_V3F32(curandState *local_rand_state);
__device__ function Vec3F32 rand_uniform_range_V3F32(RngF32 rng, curandState *local_rand_state);
__host__ function F32 linear_to_gamma(F32 val);
__host__ inline function F32 h_norm_V3F32(Vec3F32 a);
__device__ function F32 clamp_F32(RngF32 rng, F32 val);
__device__ function Vec3F32 clamp_V3F32(RngF32 rng, Vec3F32 v);
__host__ function F32 rand_uniform_host_F32();
__host__ function Vec3F32 vec3_rand_host_F32();
__host__ function U32
h_intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far);
__device__ function U32
intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far);

View File

@ -1,13 +1,20 @@
#include "base_core.h"
#include "base_math.h"
#include "rayt_core.h"
#define RENDER_ON_CPU 1
#define BVH_USE_CPU 1
#define DEBUG_DRAW_BBOX 0
//------------------------------------------------------------------------------------------
//~ header includes
#include "base_core.cuh"
#include "base_math.cuh"
#include "rayt_core.cuh"
#include "rayt_bvh.cuh"
//------------------------------------------------------------------------------------------
//~ Program parameter defines
#define NUM_BLOCKS 1
#define NUM_THREADS 32
#define IMAGE_WIDTH 1920
#define IMAGE_WIDTH 1024
#define ASPECT_RATIO 1.7778f // 16/9
#define CURAND_SEED 1984
@ -17,15 +24,30 @@
#define SAMPLES_PER_PIXEL 64
#define MAX_DIFFUSE_DEPTH 8
#include "base_math.c"
#include "rayt_core.c"
//------------------------------------------------------------------------------------------
//~ host globals
host_global CPUTimer g_cpu_timer;
host_global CameraF32 h_camera;
host_global ViewportF32 h_viewport;
host_global ImageF32 h_image;
host_global Entity *h_entities = 0;
host_global U32 *h_tri_indices = 0;
host_global BVH h_bvh;
//~ device globals
__constant__ CameraF32 camera;
__constant__ ViewportF32 viewport;
__constant__ ImageF32 image;
//------------------------------------------------------------------------------------------
//~ implementation includes
#include "base_core.cu"
#include "base_math.cu"
#include "rayt_core.cu"
#include "rayt_bvh.cu"
//------------------------------------------------------------------------------------------
//~ routines
__global__ void
cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state)
{
@ -52,15 +74,11 @@ cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state)
}
//------------------------------------------------------------------------------------------
//~ Main
int main()
__host__ function void
set_up_scene_globals()
{
cudaError_t cuErr;
//////////////////////////////////////////////////////////////////////////////////////////
// Define image, camera and viewport on the CPU
// and then copy to constant globals on device
// Define image, camera and viewport on the CPU
// -------------
h_image = {0};
h_image.width = IMAGE_WIDTH;
@ -68,21 +86,17 @@ int main()
U32 height = U32((F32)h_image.width/h_image.aspect_ratio) + 1;
h_image.height = height < 1 ? 1 : height;
h_image.total_num_pixels = h_image.width * h_image.height;
cuErr = cudaMemcpyToSymbol(image, &h_image, sizeof(ImageF32), 0, cudaMemcpyHostToDevice);
CUDA_CHECK(cuErr);
LOG("Image size %i x %i, aspect ratio: %.4f \n",
h_image.width, h_image.height, h_image.aspect_ratio);
// -------------
h_camera = {0};
h_camera.focal_length = 1.0f;
h_camera.focal_length = 3.0f;
h_camera.center = vec3F32(0.0f, 0.0f, 18.0f);
F32 samples_per_pixel = (F32)SAMPLES_PER_PIXEL;
h_camera.pixel_sample_scale = 1.0f/samples_per_pixel;
cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0,
cudaMemcpyHostToDevice);
CUDA_CHECK(cuErr);
// -------------
h_viewport = {0};
h_viewport.height = 2.0f;
@ -108,43 +122,36 @@ int main()
h_viewport.pixel_origin = add_V3F32(viewport_upper_left,
scale_V3F32(0.5f, pixel_delta_sum));
cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0,
cudaMemcpyHostToDevice);
CUDA_CHECK(cuErr);
LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n",
h_viewport.width, h_viewport.height, h_viewport.aspect_ratio);
}
//////////////////////////////////////////////////////////////////////////////////////////
// Setup entities and copy to device
U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES;
Entity *h_entities = (Entity *)malloc(entity_list_byte_size);
memset(h_entities, 0, entity_list_byte_size);
for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
{
// Init all entities to nil
h_entities[i].kind = EntityKind_Nil;
}
__host__ function void
copy_to_device_and_launch_cuda_main()
{
cudaError_t cuErr;
// Manual spheres
{
h_entities[0].kind = EntityKind_Sphere;
h_entities[0].center = vec3F32(0.0f, 0.0f, -1.0f);
h_entities[0].radius = 0.5f;
// Copy constants
cuErr = cudaMemcpyToSymbol(image, &h_image, sizeof(ImageF32), 0, cudaMemcpyHostToDevice);
CUDA_CHECK(cuErr);
h_entities[1].kind = EntityKind_Sphere;
h_entities[1].center = vec3F32(0.0f, -100.5f, -1.0f);
h_entities[1].radius = 100.0f;
}
cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0,
cudaMemcpyHostToDevice);
CUDA_CHECK(cuErr);
// Copy to device
cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0,
cudaMemcpyHostToDevice);
CUDA_CHECK(cuErr);
// Create and copy buffers to device
Entity *entities = 0;
U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES;
cuErr = cudaMalloc(&entities, entity_list_byte_size);
CUDA_CHECK(cuErr);
cuErr = cudaMemcpy(entities, h_entities, entity_list_byte_size, cudaMemcpyHostToDevice);
CUDA_CHECK(cuErr);
//////////////////////////////////////////////////////////////////////////////////////////
// Define grid, blocks, threads and any buffers such as pixel data and random state
// ------------
@ -201,14 +208,241 @@ int main()
cudaMemcpyDeviceToHost);
CUDA_CHECK(cuErr);
write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height);
write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, "gpu_output.ppm");
cuErr = cudaFree(pixel_buffer);
CUDA_CHECK(cuErr);
cuErr = cudaFree(entities);
CUDA_CHECK(cuErr);
cuErr = cudaFree(rand_state);
CUDA_CHECK(cuErr);
cuda_free(pixel_buffer);
cuda_free(entities);
cuda_free(rand_state);
free(h_pixel_buffer);
}
__host__ function void
set_up_entities()
{
//////////////////////////////////////////////////////////////////////////////////////////
// Setup entities
U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES;
h_entities = (Entity *)malloc(entity_list_byte_size);
memset(h_entities, 0, entity_list_byte_size);
#if 0
for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
{
// Init all entities to nil
h_entities[i].kind = EntityKind_Nil;
}
#endif
// Random triangles
{
h_tri_indices = (U32 *)malloc(sizeof(U32)*MAX_NUM_ENTITIES);
for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
{
Vec3F32 r0 = vec3_rand_host_F32();
Vec3F32 r1 = vec3_rand_host_F32();
Vec3F32 r2 = vec3_rand_host_F32();
// Put the first vertex within a 10x10x10 cube centered on the origin.
Vec3F32 v0 = scale_V3F32(9.0f, r0);
v0 = sub_V3F32(v0, vec3F32(5.0f, 5.0f, 5.0f));
h_entities[i].kind = EntityKind_Tri;
h_entities[i].vertex0 = v0;
// The other two vertices are relative to the first.
h_entities[i].vertex1 = add_V3F32(v0, r1);
h_entities[i].vertex2 = add_V3F32(v0, r2);
Vec3F32 center = add_V3F32(h_entities[i].vertex0,
add_V3F32(h_entities[i].vertex1, h_entities[i].vertex2));
center = scale_V3F32(0.3333f, center);
h_entities[i].center = center;
h_tri_indices[i] = i;
#if 0
LOG("tri index[%i] = %i before bvh construction \n", i, h_tri_indices[i]);
#else
#endif
}
}
}
__host__ function void
set_up_bvh()
{
h_bvh = bvh_build();
#if 0
{
U32 total_leaf_nodes = 0;
for(U32 i = 0; i < h_bvh.max_num_nodes; i+=1)
{
BVHNode *node = &h_bvh.nodes[i];
if(node->tri_count)
{
LOG("\n----\n");
LOG("Leaf node with idx %i with tri count %i \n", i, node->tri_count);
total_leaf_nodes += 1;
LOG("Index into triangle index list, node->left_first: %i \n", node->left_first);
LOG("leaf node aabb_min = (%.2f %.2f %.2f) \n",
node->aabb_min.x, node->aabb_min.y, node->aabb_min.z);
LOG("leaf node aabb_max = (%.2f %.2f %.2f) \n",
node->aabb_max.x, node->aabb_max.y, node->aabb_max.z);
Entity *tri = &h_entities[h_tri_indices[node->left_first]];
LOG("Triangle v0: (%.2f, %.2f %.2f) \n",
tri->vertex0.x, tri->vertex0.y, tri->vertex0.z);
LOG("Triangle v1: (%.2f, %.2f %.2f) \n",
tri->vertex1.x, tri->vertex1.y, tri->vertex1.z);
LOG("Triangle v2: (%.2f, %.2f %.2f) \n",
tri->vertex2.x, tri->vertex2.y, tri->vertex2.z);
LOG("----\n\n");
}
}
LOG("Total number of leaf nodes %i \n", total_leaf_nodes);
}
#endif
}
//------------------------------------------------------------------------------------------
//~ Main
int main()
{
g_cpu_timer = {0};
g_cpu_timer.cpu_freq = get_cpu_frequency();
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
set_up_scene_globals();
set_up_entities();
set_up_bvh();
#if RENDER_ON_CPU
LOG("Starting CPU rendering \n");
//cudaEventRecord(start, 0);
timer();
// Render "ground truth" on CPU for validation
{
U64 num_pixels = h_image.width*h_image.height;
U64 pixel_buffer_size = num_pixels*sizeof(Vec3F32);
Vec3F32 *host_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size);
for(U32 y = 0; y < h_image.height; y += 1)
{
for(U32 x = 0; x < h_image.width; x += 1)
{
U32 idx = y * h_image.width + x;
Vec3F32 px_u = scale_V3F32((F32)x, h_viewport.pixel_delta_u);
Vec3F32 px_v = scale_V3F32((F32)y, h_viewport.pixel_delta_v);
Vec3F32 pixel_center = add_V3F32(h_viewport.pixel_origin, add_V3F32(px_u, px_v));
Vec3F32 ray_direction = sub_V3F32(pixel_center, h_camera.center);
RayF32 ray = {0};
ray.origin = h_camera.center;
ray.direction = ray_direction;
HitRecord hit_rec = {0};
hit_rec.t = F32_MAX;
#if BVH_USE_CPU
{
bvh_host_intersect(&h_bvh, &ray, &hit_rec, h_bvh.root_index);
if(hit_rec.hit)
{
//LOG("BVH hit triangle! hit_rec->normal: (%.2f, %.2f, %.2f) \n",
// hit_rec.normal.x, hit_rec.normal.y, hit_rec.normal.z);
}
}
#else
{
HitRecord temp_hit_rec = {0};
temp_hit_rec.t = hit_rec.t;
for (U32 i = 0; i < MAX_NUM_ENTITIES; i+=1)
{
Entity *tri = &h_entities[i];
hit_triangle_host(&ray, &temp_hit_rec, tri);
if(temp_hit_rec.hit)
{
hit_rec = temp_hit_rec;
}
}
}
#endif
Vec3F32 pixel_color = {0.0f, 0.0f, 0.0f};
if(hit_rec.hit)
{
// Paint entity
pixel_color = add_V3F32(hit_rec.normal, vec3F32(1.0f, 1.0f, 1.0f));
pixel_color = scale_V3F32(0.5f, pixel_color);
}
else
{
// Paint background gradient
F32 norm = h_norm_V3F32(ray.direction);
Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction);
Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f);
Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f);
// Lerp between white and light blue depending on y position
F32 blend = 0.5f*(unit_dir.y + 1.0f);
pixel_color = lerp_V3F32(blend, white, light_blue);
#if DEBUG_DRAW_BBOX
{
U32 do_debug_pixel = 0;
Vec3F32 color_index = {0.0f, 0.0f, 0.0f};
for(U32 i = 0; i < h_bvh.max_num_nodes; i += 1)
{
BVHNode *node = &h_bvh.nodes[i];
if(node->tri_count > 0 && h_intersect_aabb(&ray, node->aabb_min, node->aabb_max, F32_MAX))
{
do_debug_pixel = 1;
}
if(do_debug_pixel)
{
color_index.x = 1.0f;
color_index.y = 0.0f;
color_index.z = 0.0f;
}
}
if(do_debug_pixel)
{
pixel_color = color_index;
}
}
#endif
}
// Debug draw bvh
host_pixel_buffer[idx] = pixel_color;
}
}
//cudaEventRecord(stop, 0);
//cudaEventSynchronize(stop);
timer();
{
F32 elapsed = timer_elapsed();
//cudaEventElapsedTime(&elapsed, start, stop);
LOG("Elapsed time for CPU rendering: %.2f ms \n", elapsed);
U32 bvh_used = 0;
#if BVH_USE_CPU
bvh_used = 1;
#endif
LOG("BVH = %i \n", bvh_used);
}
cudaEventDestroy(stop);
cudaEventDestroy(start);
write_buffer_to_ppm(host_pixel_buffer, h_image.width, h_image.height, "cpu_output.ppm");
free(host_pixel_buffer);
}
#endif
return 0;
}

136
src/rayt_bvh.cu Normal file
View File

@ -0,0 +1,136 @@
__host__ function BVH
bvh_build()
{
U64 max_bvh_nodes = 2 * MAX_NUM_ENTITIES - 1;
BVH bvh = {0};
bvh.nodes = (BVHNode *)_aligned_malloc(sizeof(BVHNode)*max_bvh_nodes, 64);
bvh.max_num_nodes = max_bvh_nodes;
bvh.used_nodes = 2; // Skip by two, TODO(anton): Comment this.
bvh.minimum_entities_in_leaf = 2;
U32 root_index = 0;
BVHNode *root = &bvh.nodes[root_index];
root->left_first = 0;
root->tri_count = MAX_NUM_ENTITIES;
bvh_update_bounds(&bvh, 0);
bvh_subdivide(&bvh, 0);
return bvh;
}
__host__ function void
bvh_subdivide(BVH *bvh, U32 node_idx)
{
BVHNode *node = &bvh->nodes[node_idx];
if(node->tri_count <= bvh->minimum_entities_in_leaf)
{
return;
}
// Split box
Vec3F32 extent = sub_V3F32(node->aabb_max, node->aabb_min);
U32 axis = 0;
if(extent.y > extent.x) axis = 1;
if(extent.z > extent.v[axis]) axis = 2;
F32 split_pos = node->aabb_min.v[axis] + extent.v[axis] * 0.5f;
// Sorting into left and right partitions
U32 i = node->left_first;
U32 j = node->tri_count + i - 1;
while (i <= j)
{
U32 tri_idx = h_tri_indices[i];
if(h_entities[tri_idx].center.v[axis] < split_pos)
{
i += 1;
}
else
{
h_tri_indices[i] = h_tri_indices[j];
h_tri_indices[j] = tri_idx;
j -= 1;
}
}
U32 left_count = i - node->left_first;
if(left_count == 0 || left_count == node->tri_count)
{
// One of the partitions are empty, don't subdivide further.
return;
}
// Create child nodes and subdivide
U32 left_child_index = bvh->used_nodes++;
U32 right_child_index = bvh->used_nodes++;
bvh->nodes[left_child_index].left_first = node->left_first;
bvh->nodes[left_child_index].tri_count = left_count;
bvh->nodes[right_child_index].left_first = i;
bvh->nodes[right_child_index].tri_count = node->tri_count - left_count;
node->left_first = left_child_index;
node->tri_count = 0;
bvh_update_bounds(bvh, left_child_index);
bvh_update_bounds(bvh, right_child_index);
bvh_subdivide(bvh, left_child_index);
bvh_subdivide(bvh, right_child_index);
}
__host__ function void
bvh_update_bounds(BVH *bvh, U32 node_idx)
{
BVHNode *node = &bvh->nodes[node_idx];
node->aabb_min = vec3F32(F32_MAX, F32_MAX, F32_MAX);
node->aabb_max = vec3F32(F32_MIN, F32_MIN, F32_MIN);
U32 first_tri_idx = node->left_first;
for(U32 i = 0; i < node->tri_count; i += 1)
{
U32 leaf_tri_idx = h_tri_indices[first_tri_idx + i];
Entity *tri = &h_entities[leaf_tri_idx];
node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex0);
node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex1);
node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex2);
node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex0);
node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex1);
node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex2);
}
}
__host__ function void
bvh_host_intersect(BVH *bvh, RayF32 *ray, HitRecord *rec, U32 node_idx)
{
BVHNode *node = &bvh->nodes[node_idx];
U32 any_hit = 0;
if(h_intersect_aabb(ray, node->aabb_min, node->aabb_max, rec->t))
{
if(node->tri_count > 0)
{
//LOG("Hit a leaf node %i with tri count %i \n", node_idx, node->tri_count);
for(U32 i = 0; i < node->tri_count; i+=1)
{
U32 tri_index = h_tri_indices[node->left_first + i];
Entity *tri = &h_entities[tri_index];
hit_triangle_host(ray, rec, tri);
if(rec->hit)
{
any_hit = 1;
//LOG("got hit in bvh_host_intersect loop \n");
}
}
}
else
{
bvh_host_intersect(bvh, ray, rec, node->left_first);
bvh_host_intersect(bvh, ray, rec, node->left_first + 1);
}
}
if(!rec->hit)
{
rec->hit = any_hit;
}
}

27
src/rayt_bvh.cuh Normal file
View File

@ -0,0 +1,27 @@
#pragma once
// 32 bytes node
typedef struct BVHNode BVHNode;
struct BVHNode
{
Vec3F32 aabb_min;
Vec3F32 aabb_max;
U32 left_first;
U32 tri_count;
};
typedef struct BVH BVH;
struct BVH
{
BVHNode *nodes;
U32 used_nodes;
U32 root_index;
U32 max_num_nodes;
U32 num_leaf_nodes;
U32 minimum_entities_in_leaf;
};
__host__ function BVH bvh_build();
__host__ function void bvh_update_bounds(BVH *bvh, U32 node_idx);
__host__ function void bvh_subdivide(BVH *bvh, U32 node_idx);
__host__ function void bvh_host_intersect(BVH *bvh, RayF32 *ray, HitRecord *rec, U32 node_idx);

View File

@ -1,16 +1,11 @@
//~ device globals
__constant__ CameraF32 camera;
__constant__ ViewportF32 viewport;
__constant__ ImageF32 image;
__host__ function void
write_buffer_to_ppm(Vec3F32 *buffer,
U32 image_width,
U32 image_height)
U32 image_height,
const char *filename)
{
const char *filename = "output.ppm";
FILE *file = fopen(filename, "w");
if(!file)
{
@ -133,6 +128,95 @@ get_sample_color(RayF32 ray, Entity *entities, curandState *local_rand_state)
return out;
}
// Common function for use on both host and device,
__host__ __device__ inline function void
triangle_intersection_common(RayF32 *ray, HitRecord *rec, Vec3F32 edge1, Vec3F32 edge2,
Entity* triangle)
{
// Möller-Trumbore intersection algorithm
Vec3F32 h = cross_V3F32(ray->direction, edge2);
F32 closest_so_far = rec->t;
F32 a = dot_V3F32(edge1, h);
if(a <= -0.001f || a >= 0.001f)
{
F32 f = 1.0f/a;
Vec3F32 s = sub_V3F32(ray->origin, triangle->vertex0);
F32 u = f * dot_V3F32(s, h);
if(u >= 0.0f && u <= 1.0f)
{
Vec3F32 q = cross_V3F32(s, edge1);
F32 v = f * dot_V3F32(ray->direction, q);
if(v >= 0.0f && (u+v) <= 1.0f)
{
F32 t = f * dot_V3F32(edge2, q);
if(t > 0.0001f)
{
if(t <= closest_so_far)
{
rec->t = t;
rec->hit = 1;
}
}
}
}
}
}
__host__ function void
hit_triangle_host(RayF32 *ray, HitRecord *rec, Entity *triangle)
{
Vec3F32 edge1 = sub_V3F32(triangle->vertex1, triangle->vertex0);
Vec3F32 edge2 = sub_V3F32(triangle->vertex2, triangle->vertex0);
rec->hit = 0;
triangle_intersection_common(ray, rec, edge1, edge2, triangle);
// Set the point of intersection and the normal of the, for now,
// vertex0 of the triangle. We have to get the actual surface normal at some point.
if(rec->hit)
{
Vec3F32 intersection_point = ray_point_F32(rec->t, ray);
Vec3F32 v0_normal = cross_V3F32(edge1, edge2);
F32 norm_inv = 1.0f/h_norm_V3F32(v0_normal);
v0_normal = scale_V3F32(norm_inv, v0_normal);
F32 front_face = dot_V3F32(ray->direction, v0_normal) < 0.0f;
rec->normal = front_face ? v0_normal : scale_V3F32(-1.0f, v0_normal);
rec->front_face = front_face;
rec->point = intersection_point;
//LOG("Hit triangle in hit_triangle_host! \n");
}
}
__device__ function HitRecord
hit_triangle(RayF32 *ray, Entity *triangle, F32 closest_so_far)
{
Vec3F32 edge1 = sub_V3F32(triangle->vertex1, triangle->vertex0);
Vec3F32 edge2 = sub_V3F32(triangle->vertex2, triangle->vertex0);
HitRecord out = {0};
triangle_intersection_common(ray, &out, edge1, edge2, triangle);
// Set the point of intersection and the normal of the, for now,
// vertex0 of the triangle. We have to get the actual surface normal at some point.
if(out.hit)
{
Vec3F32 intersection_point = ray_point_F32(out.t, ray);
Vec3F32 v0_normal = cross_V3F32(edge1, edge2);
F32 norm_inv = 1.0f/norm_V3F32(v0_normal);
v0_normal = scale_V3F32(norm_inv, v0_normal);
F32 front_face = dot_V3F32(ray->direction, v0_normal) < 0.0f;
out.normal = front_face ? v0_normal : scale_V3F32(-1.0f, v0_normal);
out.front_face = front_face;
out.point = intersection_point;
}
return out;
}
__global__ void
cuda_init_state(curandState *rand_state)
@ -148,3 +232,10 @@ cuda_init_state(curandState *rand_state)
}
}
__host__ void
cuda_free(void *device_ptr)
{
cudaError_t cuErr = cudaFree(device_ptr);
CUDA_CHECK(cuErr);
}

View File

@ -38,6 +38,7 @@ enum EntityKind
{
EntityKind_Nil,
EntityKind_Sphere,
EntityKind_Tri,
Num_EntityKinds
};
@ -56,12 +57,26 @@ struct Entity
{
EntityKind kind;
Vec3F32 center;
Vec3F32 vertex0;
Vec3F32 vertex1;
Vec3F32 vertex2;
F32 radius;
};
__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32 image_height);
__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width,
U32 image_height, const char *filename);
__device__ function RayF32 ray_get_F32(F32 x, F32 y, Vec3F32 cam_center, curandState *local_rand_state);
__global__ void cuda_init_state(curandState *rand_state);
__device__ function RayF32 ray_get_F32(F32 x, F32 y, Vec3F32 cam_center,
curandState *local_rand_state);
__host__ __device__ inline function void
triangle_intersection_common(RayF32 *ray, HitRecord *rec, Vec3F32 edge1, Vec3F32 edge2,
Entity* triangle);
__device__ function HitRecord hit_triangle(RayF32 *ray, Entity *triangle,
F32 closest_so_far);
__host__ function void hit_triangle_host(RayF32 *ray, HitRecord *rec, Entity *triangle);
__global__ void cuda_init_state(curandState *rand_state);
__host__ void cuda_free(void *device_ptr);

Binary file not shown.