working bvh on CPU
This commit is contained in:
parent
be0688fa9f
commit
254cb069a3
2
.gitignore
vendored
2
.gitignore
vendored
@ -115,3 +115,5 @@ build/
|
||||
*.exr
|
||||
*.bmp
|
||||
*.png
|
||||
*.sublime*
|
||||
*.ctm)
|
||||
@ -15,7 +15,9 @@ set CudaRemoveWarnings=-diag-suppress 177
|
||||
|
||||
IF NOT EXIST .\build mkdir .\build
|
||||
pushd .\build
|
||||
nvcc %CudaSources% %CudaRemoveWarnings% -o program.exe
|
||||
|
||||
@rem nvcc %CudaRemoveWarnings% -G -g -lineinfo -o program.exe %CudaSources%
|
||||
nvcc %CudaRemoveWarnings% -o program.exe %CudaSources%
|
||||
|
||||
set LastError=%ERRORLEVEL%
|
||||
popd
|
||||
|
||||
33
src/base_core.cu
Normal file
33
src/base_core.cu
Normal file
@ -0,0 +1,33 @@
|
||||
|
||||
__host__ function F64
|
||||
get_cpu_frequency()
|
||||
{
|
||||
LARGE_INTEGER freq;
|
||||
QueryPerformanceFrequency(&freq);
|
||||
U64 start_tsc = __rdtsc();
|
||||
Sleep(100);
|
||||
U64 end_tsc = __rdtsc();
|
||||
F64 cyclers_per_ms = (F64)(end_tsc - start_tsc) / 100.0;
|
||||
return cyclers_per_ms * 1000.0; // Cycles per second
|
||||
}
|
||||
|
||||
__host__ function void
|
||||
timer()
|
||||
{
|
||||
g_cpu_timer.second_to_last_cycles = g_cpu_timer.last_cycles;
|
||||
g_cpu_timer.last_cycles = __rdtsc();
|
||||
}
|
||||
|
||||
__host__ function F64
|
||||
timer_elapsed()
|
||||
{
|
||||
U64 cycles = g_cpu_timer.last_cycles - g_cpu_timer.second_to_last_cycles;
|
||||
F64 elapsed = (F64)cycles / (g_cpu_timer.cpu_freq/1000.0);
|
||||
return elapsed; // ms
|
||||
}
|
||||
|
||||
|
||||
function F64 test_function()
|
||||
{
|
||||
return 34.20;
|
||||
}
|
||||
60
src/base_core.cuh
Normal file
60
src/base_core.cuh
Normal file
@ -0,0 +1,60 @@
|
||||
#pragma once
|
||||
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#include <intrin.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <curand_kernel.h>
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//~ base defines
|
||||
|
||||
#define host_global static
|
||||
#define function static
|
||||
|
||||
//~ typedefs
|
||||
typedef int32_t S32;
|
||||
typedef uint32_t U32;
|
||||
typedef uint64_t U64;
|
||||
typedef double F64;
|
||||
typedef float F32;
|
||||
|
||||
//~ utility defines
|
||||
|
||||
#define CUDA_CHECK(err) do { \
|
||||
if (err != cudaSuccess) { \
|
||||
fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \
|
||||
cudaGetErrorString(err), __FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define LOG printf
|
||||
|
||||
#define F32_MAX FLT_MAX
|
||||
#define F32_MIN -FLT_MAX
|
||||
|
||||
|
||||
typedef struct CPUTimer CPUTimer;
|
||||
struct CPUTimer
|
||||
{
|
||||
U64 last_cycles;
|
||||
U64 second_to_last_cycles;
|
||||
F64 elapsed;
|
||||
F64 cpu_freq;
|
||||
};
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//~ timing
|
||||
__host__ function F64 get_cpu_frequency();
|
||||
__host__ function void timer();
|
||||
__host__ function F64 timer_elapsed();
|
||||
function F64 test_function();
|
||||
|
||||
@ -1,35 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <curand_kernel.h>
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//~ base defines
|
||||
|
||||
#define host_global static
|
||||
#define function static
|
||||
|
||||
//~ typedefs
|
||||
typedef int32_t S32;
|
||||
typedef uint32_t U32;
|
||||
typedef uint64_t U64;
|
||||
typedef float F32;
|
||||
|
||||
//~ utility defines
|
||||
|
||||
#define CUDA_CHECK(err) do { \
|
||||
if (err != cudaSuccess) { \
|
||||
fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \
|
||||
cudaGetErrorString(err), __FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define LOG printf
|
||||
|
||||
#define F32_MAX FLT_MAX
|
||||
#define F32_MIN FLT_MIN
|
||||
@ -1,4 +1,24 @@
|
||||
|
||||
__host__ inline function Vec3F32
|
||||
h_max_V3F32(Vec3F32 a, Vec3F32 b)
|
||||
{
|
||||
Vec3F32 out = {0};
|
||||
out.x = a.x > b.x ? a.x : b.x;
|
||||
out.y = a.y > b.y ? a.y : b.y;
|
||||
out.z = a.z > b.z ? a.z : b.z;
|
||||
return out;
|
||||
}
|
||||
|
||||
__host__ inline function Vec3F32
|
||||
h_min_V3F32(Vec3F32 a, Vec3F32 b)
|
||||
{
|
||||
Vec3F32 out = {0};
|
||||
out.x = a.x < b.x ? a.x : b.x;
|
||||
out.y = a.y < b.y ? a.y : b.y;
|
||||
out.z = a.z < b.z ? a.z : b.z;
|
||||
return out;
|
||||
}
|
||||
|
||||
__host__ __device__ inline function Vec3F32
|
||||
vec3F32(F32 x, F32 y, F32 z)
|
||||
{
|
||||
@ -40,6 +60,16 @@ scale_V3F32(F32 s, Vec3F32 v)
|
||||
return out;
|
||||
}
|
||||
|
||||
__host__ __device__ inline function Vec3F32
|
||||
cross_V3F32(Vec3F32 a, Vec3F32 b)
|
||||
{
|
||||
Vec3F32 out = {0};
|
||||
out.x = a.y*b.z-a.z*b.y;
|
||||
out.y = a.z*b.x-a.x*b.z;
|
||||
out.z = a.x*b.y-a.y*b.x;
|
||||
return out;
|
||||
}
|
||||
|
||||
__host__ __device__ inline function F32
|
||||
dot_V3F32(Vec3F32 a, Vec3F32 b)
|
||||
{
|
||||
@ -48,9 +78,9 @@ dot_V3F32(Vec3F32 a, Vec3F32 b)
|
||||
|
||||
|
||||
__host__ __device__ inline function Vec3F32
|
||||
ray_point_F32(F32 t, RayF32 ray)
|
||||
ray_point_F32(F32 t, RayF32 *ray)
|
||||
{
|
||||
Vec3F32 out = add_V3F32(ray.origin, scale_V3F32(t, ray.direction));
|
||||
Vec3F32 out = add_V3F32(ray->origin, scale_V3F32(t, ray->direction));
|
||||
return out;
|
||||
}
|
||||
|
||||
@ -136,4 +166,66 @@ clamp_V3F32(RngF32 rng, Vec3F32 v)
|
||||
return out;
|
||||
}
|
||||
|
||||
__host__ function F32
|
||||
rand_uniform_host_F32()
|
||||
{
|
||||
F32 rand_max = (F32)RAND_MAX;
|
||||
U32 r = rand();
|
||||
F32 rf = (F32)r;
|
||||
F32 out = rf/rand_max;
|
||||
return out;
|
||||
}
|
||||
|
||||
__host__ function Vec3F32
|
||||
vec3_rand_host_F32()
|
||||
{
|
||||
Vec3F32 out = {0};
|
||||
out.x = rand_uniform_host_F32();
|
||||
out.y = rand_uniform_host_F32();
|
||||
out.z = rand_uniform_host_F32();
|
||||
return out;
|
||||
}
|
||||
|
||||
__host__ function U32
|
||||
h_intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far)
|
||||
{
|
||||
F32 tx1 = (bmin.x - ray->origin.x) / ray->direction.x;
|
||||
F32 tx2 = (bmax.x - ray->origin.x) / ray->direction.x;
|
||||
F32 tmin = MIN(tx1, tx2);
|
||||
F32 tmax = MAX(tx1, tx2);
|
||||
|
||||
F32 ty1 = (bmin.y - ray->origin.y) / ray->direction.y;
|
||||
F32 ty2 = (bmax.y - ray->origin.y) / ray->direction.y;
|
||||
tmin = MAX(tmin, MIN(ty1, ty2));
|
||||
tmax = MIN(tmax, MAX(ty1, ty2));
|
||||
|
||||
F32 tz1 = (bmin.z - ray->origin.z) / ray->direction.z;
|
||||
F32 tz2 = (bmax.z - ray->origin.z) / ray->direction.z;
|
||||
tmin = MAX(tmin, MIN(tz1, tz2));
|
||||
tmax = MIN(tmax, MAX(tz1, tz2));
|
||||
|
||||
U32 out = tmax >= tmin && tmin < closest_so_far && tmax > 0.0f;
|
||||
return out;
|
||||
}
|
||||
|
||||
__device__ function U32
|
||||
intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far)
|
||||
{
|
||||
F32 tx1 = (bmin.x - ray->origin.x) / ray->direction.x;
|
||||
F32 tx2 = (bmax.x - ray->origin.x) / ray->direction.x;
|
||||
F32 tmin = fminf(tx1, tx2);
|
||||
F32 tmax = fmaxf(tx1, tx2);
|
||||
|
||||
F32 ty1 = (bmin.y - ray->origin.y) / ray->direction.y;
|
||||
F32 ty2 = (bmax.y - ray->origin.y) / ray->direction.y;
|
||||
tmin = fminf(tmin, fminf(ty1, ty2));
|
||||
tmax = fmaxf(tmax, fmaxf(ty1, ty2));
|
||||
|
||||
F32 tz1 = (bmin.z - ray->origin.z) / ray->direction.z;
|
||||
F32 tz2 = (bmax.z - ray->origin.z) / ray->direction.z;
|
||||
tmin = fminf(tmin, fminf(tz1, tz2));
|
||||
tmax = fmaxf(tmax, fmaxf(tz1, tz2));
|
||||
|
||||
U32 out = tmax >= tmin && tmin < closest_so_far && tmax > 0.0f;
|
||||
return out;
|
||||
}
|
||||
@ -1,5 +1,9 @@
|
||||
#pragma once
|
||||
|
||||
#define MAX(a, b) (a) > (b) ? (a) : (b)
|
||||
#define MIN(a, b) (a) > (b) ? (b) : (a)
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//~ structs
|
||||
|
||||
@ -40,16 +44,21 @@ struct RayF32
|
||||
//~ forward declarations
|
||||
|
||||
|
||||
|
||||
__host__ inline function Vec3F32 h_max_V3F32(Vec3F32 a, Vec3F32 b);
|
||||
__host__ inline function Vec3F32 h_min_V3F32(Vec3F32 a, Vec3F32 b);
|
||||
|
||||
__host__ __device__ inline function Vec3F32 vec3F32(F32 x, F32 y, F32 z);
|
||||
__host__ __device__ inline function Vec3F32 add_V3F32(Vec3F32 a, Vec3F32 b);
|
||||
__host__ __device__ inline function Vec3F32 sub_V3F32(Vec3F32 a, Vec3F32 b);
|
||||
__host__ __device__ inline function Vec3F32 scale_V3F32(F32 s, Vec3F32 v);
|
||||
|
||||
__host__ __device__ inline function Vec3F32 cross_V3F32(Vec3F32 a, Vec3F32 b);
|
||||
|
||||
__host__ __device__ inline function Vec3F32 ray_point_F32(F32 t, RayF32 *ray);
|
||||
__host__ __device__ inline function F32 mag_V3F32(Vec3F32 a);
|
||||
__host__ __device__ inline function F32 dot_V3F32(Vec3F32 a, Vec3F32 b);
|
||||
__device__ inline function F32 norm_V3F32(Vec3F32 a);
|
||||
__host__ inline function F32 h_norm_V3F32(Vec3F32 a);
|
||||
|
||||
__host__ __device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b);
|
||||
|
||||
@ -57,7 +66,14 @@ __device__ function Vec3F32 rand_uniform_V3F32(curandState *local_rand_state);
|
||||
__device__ function Vec3F32 rand_uniform_range_V3F32(RngF32 rng, curandState *local_rand_state);
|
||||
|
||||
__host__ function F32 linear_to_gamma(F32 val);
|
||||
__host__ inline function F32 h_norm_V3F32(Vec3F32 a);
|
||||
|
||||
__device__ function F32 clamp_F32(RngF32 rng, F32 val);
|
||||
__device__ function Vec3F32 clamp_V3F32(RngF32 rng, Vec3F32 v);
|
||||
|
||||
__host__ function F32 rand_uniform_host_F32();
|
||||
__host__ function Vec3F32 vec3_rand_host_F32();
|
||||
|
||||
__host__ function U32
|
||||
h_intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far);
|
||||
__device__ function U32
|
||||
intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far);
|
||||
336
src/main.cu
336
src/main.cu
@ -1,13 +1,20 @@
|
||||
#include "base_core.h"
|
||||
#include "base_math.h"
|
||||
#include "rayt_core.h"
|
||||
#define RENDER_ON_CPU 1
|
||||
#define BVH_USE_CPU 1
|
||||
#define DEBUG_DRAW_BBOX 0
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//~ header includes
|
||||
#include "base_core.cuh"
|
||||
#include "base_math.cuh"
|
||||
#include "rayt_core.cuh"
|
||||
#include "rayt_bvh.cuh"
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//~ Program parameter defines
|
||||
#define NUM_BLOCKS 1
|
||||
#define NUM_THREADS 32
|
||||
|
||||
#define IMAGE_WIDTH 1920
|
||||
#define IMAGE_WIDTH 1024
|
||||
#define ASPECT_RATIO 1.7778f // 16/9
|
||||
|
||||
#define CURAND_SEED 1984
|
||||
@ -17,15 +24,30 @@
|
||||
#define SAMPLES_PER_PIXEL 64
|
||||
#define MAX_DIFFUSE_DEPTH 8
|
||||
|
||||
#include "base_math.c"
|
||||
#include "rayt_core.c"
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//~ host globals
|
||||
host_global CPUTimer g_cpu_timer;
|
||||
host_global CameraF32 h_camera;
|
||||
host_global ViewportF32 h_viewport;
|
||||
host_global ImageF32 h_image;
|
||||
host_global Entity *h_entities = 0;
|
||||
host_global U32 *h_tri_indices = 0;
|
||||
host_global BVH h_bvh;
|
||||
|
||||
//~ device globals
|
||||
__constant__ CameraF32 camera;
|
||||
__constant__ ViewportF32 viewport;
|
||||
__constant__ ImageF32 image;
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//~ implementation includes
|
||||
#include "base_core.cu"
|
||||
#include "base_math.cu"
|
||||
#include "rayt_core.cu"
|
||||
#include "rayt_bvh.cu"
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//~ routines
|
||||
__global__ void
|
||||
cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state)
|
||||
{
|
||||
@ -52,15 +74,11 @@ cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state)
|
||||
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//~ Main
|
||||
int main()
|
||||
__host__ function void
|
||||
set_up_scene_globals()
|
||||
{
|
||||
cudaError_t cuErr;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Define image, camera and viewport on the CPU
|
||||
// and then copy to constant globals on device
|
||||
// -------------
|
||||
h_image = {0};
|
||||
h_image.width = IMAGE_WIDTH;
|
||||
@ -68,21 +86,17 @@ int main()
|
||||
U32 height = U32((F32)h_image.width/h_image.aspect_ratio) + 1;
|
||||
h_image.height = height < 1 ? 1 : height;
|
||||
h_image.total_num_pixels = h_image.width * h_image.height;
|
||||
cuErr = cudaMemcpyToSymbol(image, &h_image, sizeof(ImageF32), 0, cudaMemcpyHostToDevice);
|
||||
CUDA_CHECK(cuErr);
|
||||
|
||||
LOG("Image size %i x %i, aspect ratio: %.4f \n",
|
||||
h_image.width, h_image.height, h_image.aspect_ratio);
|
||||
|
||||
// -------------
|
||||
h_camera = {0};
|
||||
h_camera.focal_length = 1.0f;
|
||||
h_camera.focal_length = 3.0f;
|
||||
h_camera.center = vec3F32(0.0f, 0.0f, 18.0f);
|
||||
F32 samples_per_pixel = (F32)SAMPLES_PER_PIXEL;
|
||||
h_camera.pixel_sample_scale = 1.0f/samples_per_pixel;
|
||||
|
||||
cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0,
|
||||
cudaMemcpyHostToDevice);
|
||||
CUDA_CHECK(cuErr);
|
||||
|
||||
// -------------
|
||||
h_viewport = {0};
|
||||
h_viewport.height = 2.0f;
|
||||
@ -108,43 +122,36 @@ int main()
|
||||
h_viewport.pixel_origin = add_V3F32(viewport_upper_left,
|
||||
scale_V3F32(0.5f, pixel_delta_sum));
|
||||
|
||||
cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0,
|
||||
cudaMemcpyHostToDevice);
|
||||
CUDA_CHECK(cuErr);
|
||||
LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n",
|
||||
h_viewport.width, h_viewport.height, h_viewport.aspect_ratio);
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Setup entities and copy to device
|
||||
U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES;
|
||||
Entity *h_entities = (Entity *)malloc(entity_list_byte_size);
|
||||
memset(h_entities, 0, entity_list_byte_size);
|
||||
for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
|
||||
{
|
||||
// Init all entities to nil
|
||||
h_entities[i].kind = EntityKind_Nil;
|
||||
}
|
||||
|
||||
// Manual spheres
|
||||
__host__ function void
|
||||
copy_to_device_and_launch_cuda_main()
|
||||
{
|
||||
h_entities[0].kind = EntityKind_Sphere;
|
||||
h_entities[0].center = vec3F32(0.0f, 0.0f, -1.0f);
|
||||
h_entities[0].radius = 0.5f;
|
||||
cudaError_t cuErr;
|
||||
|
||||
h_entities[1].kind = EntityKind_Sphere;
|
||||
h_entities[1].center = vec3F32(0.0f, -100.5f, -1.0f);
|
||||
h_entities[1].radius = 100.0f;
|
||||
}
|
||||
// Copy constants
|
||||
cuErr = cudaMemcpyToSymbol(image, &h_image, sizeof(ImageF32), 0, cudaMemcpyHostToDevice);
|
||||
CUDA_CHECK(cuErr);
|
||||
|
||||
// Copy to device
|
||||
cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0,
|
||||
cudaMemcpyHostToDevice);
|
||||
CUDA_CHECK(cuErr);
|
||||
|
||||
cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0,
|
||||
cudaMemcpyHostToDevice);
|
||||
CUDA_CHECK(cuErr);
|
||||
|
||||
|
||||
// Create and copy buffers to device
|
||||
Entity *entities = 0;
|
||||
U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES;
|
||||
cuErr = cudaMalloc(&entities, entity_list_byte_size);
|
||||
CUDA_CHECK(cuErr);
|
||||
cuErr = cudaMemcpy(entities, h_entities, entity_list_byte_size, cudaMemcpyHostToDevice);
|
||||
CUDA_CHECK(cuErr);
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Define grid, blocks, threads and any buffers such as pixel data and random state
|
||||
// ------------
|
||||
@ -201,14 +208,241 @@ int main()
|
||||
cudaMemcpyDeviceToHost);
|
||||
CUDA_CHECK(cuErr);
|
||||
|
||||
write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height);
|
||||
write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, "gpu_output.ppm");
|
||||
|
||||
cuErr = cudaFree(pixel_buffer);
|
||||
CUDA_CHECK(cuErr);
|
||||
cuErr = cudaFree(entities);
|
||||
CUDA_CHECK(cuErr);
|
||||
cuErr = cudaFree(rand_state);
|
||||
CUDA_CHECK(cuErr);
|
||||
cuda_free(pixel_buffer);
|
||||
cuda_free(entities);
|
||||
cuda_free(rand_state);
|
||||
|
||||
free(h_pixel_buffer);
|
||||
}
|
||||
|
||||
__host__ function void
|
||||
set_up_entities()
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Setup entities
|
||||
U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES;
|
||||
h_entities = (Entity *)malloc(entity_list_byte_size);
|
||||
memset(h_entities, 0, entity_list_byte_size);
|
||||
|
||||
#if 0
|
||||
for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
|
||||
{
|
||||
// Init all entities to nil
|
||||
h_entities[i].kind = EntityKind_Nil;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Random triangles
|
||||
{
|
||||
h_tri_indices = (U32 *)malloc(sizeof(U32)*MAX_NUM_ENTITIES);
|
||||
for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
|
||||
{
|
||||
Vec3F32 r0 = vec3_rand_host_F32();
|
||||
Vec3F32 r1 = vec3_rand_host_F32();
|
||||
Vec3F32 r2 = vec3_rand_host_F32();
|
||||
// Put the first vertex within a 10x10x10 cube centered on the origin.
|
||||
Vec3F32 v0 = scale_V3F32(9.0f, r0);
|
||||
v0 = sub_V3F32(v0, vec3F32(5.0f, 5.0f, 5.0f));
|
||||
h_entities[i].kind = EntityKind_Tri;
|
||||
h_entities[i].vertex0 = v0;
|
||||
// The other two vertices are relative to the first.
|
||||
h_entities[i].vertex1 = add_V3F32(v0, r1);
|
||||
h_entities[i].vertex2 = add_V3F32(v0, r2);
|
||||
Vec3F32 center = add_V3F32(h_entities[i].vertex0,
|
||||
add_V3F32(h_entities[i].vertex1, h_entities[i].vertex2));
|
||||
center = scale_V3F32(0.3333f, center);
|
||||
h_entities[i].center = center;
|
||||
h_tri_indices[i] = i;
|
||||
#if 0
|
||||
LOG("tri index[%i] = %i before bvh construction \n", i, h_tri_indices[i]);
|
||||
#else
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__host__ function void
|
||||
set_up_bvh()
|
||||
{
|
||||
h_bvh = bvh_build();
|
||||
|
||||
#if 0
|
||||
{
|
||||
U32 total_leaf_nodes = 0;
|
||||
for(U32 i = 0; i < h_bvh.max_num_nodes; i+=1)
|
||||
{
|
||||
BVHNode *node = &h_bvh.nodes[i];
|
||||
if(node->tri_count)
|
||||
{
|
||||
LOG("\n----\n");
|
||||
LOG("Leaf node with idx %i with tri count %i \n", i, node->tri_count);
|
||||
total_leaf_nodes += 1;
|
||||
LOG("Index into triangle index list, node->left_first: %i \n", node->left_first);
|
||||
LOG("leaf node aabb_min = (%.2f %.2f %.2f) \n",
|
||||
node->aabb_min.x, node->aabb_min.y, node->aabb_min.z);
|
||||
LOG("leaf node aabb_max = (%.2f %.2f %.2f) \n",
|
||||
node->aabb_max.x, node->aabb_max.y, node->aabb_max.z);
|
||||
Entity *tri = &h_entities[h_tri_indices[node->left_first]];
|
||||
LOG("Triangle v0: (%.2f, %.2f %.2f) \n",
|
||||
tri->vertex0.x, tri->vertex0.y, tri->vertex0.z);
|
||||
LOG("Triangle v1: (%.2f, %.2f %.2f) \n",
|
||||
tri->vertex1.x, tri->vertex1.y, tri->vertex1.z);
|
||||
LOG("Triangle v2: (%.2f, %.2f %.2f) \n",
|
||||
tri->vertex2.x, tri->vertex2.y, tri->vertex2.z);
|
||||
LOG("----\n\n");
|
||||
}
|
||||
}
|
||||
LOG("Total number of leaf nodes %i \n", total_leaf_nodes);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//~ Main
|
||||
int main()
|
||||
{
|
||||
g_cpu_timer = {0};
|
||||
g_cpu_timer.cpu_freq = get_cpu_frequency();
|
||||
|
||||
cudaEvent_t start, stop;
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
|
||||
set_up_scene_globals();
|
||||
set_up_entities();
|
||||
set_up_bvh();
|
||||
|
||||
#if RENDER_ON_CPU
|
||||
LOG("Starting CPU rendering \n");
|
||||
//cudaEventRecord(start, 0);
|
||||
timer();
|
||||
// Render "ground truth" on CPU for validation
|
||||
{
|
||||
U64 num_pixels = h_image.width*h_image.height;
|
||||
U64 pixel_buffer_size = num_pixels*sizeof(Vec3F32);
|
||||
Vec3F32 *host_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size);
|
||||
for(U32 y = 0; y < h_image.height; y += 1)
|
||||
{
|
||||
for(U32 x = 0; x < h_image.width; x += 1)
|
||||
{
|
||||
U32 idx = y * h_image.width + x;
|
||||
|
||||
Vec3F32 px_u = scale_V3F32((F32)x, h_viewport.pixel_delta_u);
|
||||
Vec3F32 px_v = scale_V3F32((F32)y, h_viewport.pixel_delta_v);
|
||||
Vec3F32 pixel_center = add_V3F32(h_viewport.pixel_origin, add_V3F32(px_u, px_v));
|
||||
|
||||
Vec3F32 ray_direction = sub_V3F32(pixel_center, h_camera.center);
|
||||
RayF32 ray = {0};
|
||||
ray.origin = h_camera.center;
|
||||
ray.direction = ray_direction;
|
||||
HitRecord hit_rec = {0};
|
||||
hit_rec.t = F32_MAX;
|
||||
#if BVH_USE_CPU
|
||||
{
|
||||
bvh_host_intersect(&h_bvh, &ray, &hit_rec, h_bvh.root_index);
|
||||
if(hit_rec.hit)
|
||||
{
|
||||
//LOG("BVH hit triangle! hit_rec->normal: (%.2f, %.2f, %.2f) \n",
|
||||
// hit_rec.normal.x, hit_rec.normal.y, hit_rec.normal.z);
|
||||
}
|
||||
}
|
||||
#else
|
||||
{
|
||||
HitRecord temp_hit_rec = {0};
|
||||
temp_hit_rec.t = hit_rec.t;
|
||||
for (U32 i = 0; i < MAX_NUM_ENTITIES; i+=1)
|
||||
{
|
||||
Entity *tri = &h_entities[i];
|
||||
hit_triangle_host(&ray, &temp_hit_rec, tri);
|
||||
if(temp_hit_rec.hit)
|
||||
{
|
||||
hit_rec = temp_hit_rec;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
Vec3F32 pixel_color = {0.0f, 0.0f, 0.0f};
|
||||
if(hit_rec.hit)
|
||||
{
|
||||
// Paint entity
|
||||
pixel_color = add_V3F32(hit_rec.normal, vec3F32(1.0f, 1.0f, 1.0f));
|
||||
pixel_color = scale_V3F32(0.5f, pixel_color);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Paint background gradient
|
||||
F32 norm = h_norm_V3F32(ray.direction);
|
||||
Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction);
|
||||
Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f);
|
||||
Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f);
|
||||
|
||||
// Lerp between white and light blue depending on y position
|
||||
F32 blend = 0.5f*(unit_dir.y + 1.0f);
|
||||
|
||||
pixel_color = lerp_V3F32(blend, white, light_blue);
|
||||
|
||||
#if DEBUG_DRAW_BBOX
|
||||
{
|
||||
U32 do_debug_pixel = 0;
|
||||
Vec3F32 color_index = {0.0f, 0.0f, 0.0f};
|
||||
for(U32 i = 0; i < h_bvh.max_num_nodes; i += 1)
|
||||
{
|
||||
BVHNode *node = &h_bvh.nodes[i];
|
||||
if(node->tri_count > 0 && h_intersect_aabb(&ray, node->aabb_min, node->aabb_max, F32_MAX))
|
||||
{
|
||||
do_debug_pixel = 1;
|
||||
}
|
||||
if(do_debug_pixel)
|
||||
{
|
||||
color_index.x = 1.0f;
|
||||
color_index.y = 0.0f;
|
||||
color_index.z = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
if(do_debug_pixel)
|
||||
{
|
||||
pixel_color = color_index;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// Debug draw bvh
|
||||
|
||||
|
||||
host_pixel_buffer[idx] = pixel_color;
|
||||
|
||||
}
|
||||
}
|
||||
//cudaEventRecord(stop, 0);
|
||||
//cudaEventSynchronize(stop);
|
||||
timer();
|
||||
|
||||
{
|
||||
F32 elapsed = timer_elapsed();
|
||||
//cudaEventElapsedTime(&elapsed, start, stop);
|
||||
LOG("Elapsed time for CPU rendering: %.2f ms \n", elapsed);
|
||||
U32 bvh_used = 0;
|
||||
#if BVH_USE_CPU
|
||||
bvh_used = 1;
|
||||
#endif
|
||||
LOG("BVH = %i \n", bvh_used);
|
||||
}
|
||||
|
||||
cudaEventDestroy(stop);
|
||||
cudaEventDestroy(start);
|
||||
|
||||
|
||||
write_buffer_to_ppm(host_pixel_buffer, h_image.width, h_image.height, "cpu_output.ppm");
|
||||
free(host_pixel_buffer);
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
136
src/rayt_bvh.cu
Normal file
136
src/rayt_bvh.cu
Normal file
@ -0,0 +1,136 @@
|
||||
|
||||
__host__ function BVH
|
||||
bvh_build()
|
||||
{
|
||||
U64 max_bvh_nodes = 2 * MAX_NUM_ENTITIES - 1;
|
||||
BVH bvh = {0};
|
||||
bvh.nodes = (BVHNode *)_aligned_malloc(sizeof(BVHNode)*max_bvh_nodes, 64);
|
||||
bvh.max_num_nodes = max_bvh_nodes;
|
||||
bvh.used_nodes = 2; // Skip by two, TODO(anton): Comment this.
|
||||
bvh.minimum_entities_in_leaf = 2;
|
||||
U32 root_index = 0;
|
||||
BVHNode *root = &bvh.nodes[root_index];
|
||||
root->left_first = 0;
|
||||
root->tri_count = MAX_NUM_ENTITIES;
|
||||
|
||||
bvh_update_bounds(&bvh, 0);
|
||||
|
||||
bvh_subdivide(&bvh, 0);
|
||||
|
||||
return bvh;
|
||||
}
|
||||
|
||||
__host__ function void
|
||||
bvh_subdivide(BVH *bvh, U32 node_idx)
|
||||
{
|
||||
BVHNode *node = &bvh->nodes[node_idx];
|
||||
if(node->tri_count <= bvh->minimum_entities_in_leaf)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Split box
|
||||
Vec3F32 extent = sub_V3F32(node->aabb_max, node->aabb_min);
|
||||
U32 axis = 0;
|
||||
if(extent.y > extent.x) axis = 1;
|
||||
if(extent.z > extent.v[axis]) axis = 2;
|
||||
F32 split_pos = node->aabb_min.v[axis] + extent.v[axis] * 0.5f;
|
||||
|
||||
// Sorting into left and right partitions
|
||||
U32 i = node->left_first;
|
||||
U32 j = node->tri_count + i - 1;
|
||||
while (i <= j)
|
||||
{
|
||||
U32 tri_idx = h_tri_indices[i];
|
||||
if(h_entities[tri_idx].center.v[axis] < split_pos)
|
||||
{
|
||||
i += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
h_tri_indices[i] = h_tri_indices[j];
|
||||
h_tri_indices[j] = tri_idx;
|
||||
j -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
U32 left_count = i - node->left_first;
|
||||
if(left_count == 0 || left_count == node->tri_count)
|
||||
{
|
||||
// One of the partitions are empty, don't subdivide further.
|
||||
return;
|
||||
}
|
||||
|
||||
// Create child nodes and subdivide
|
||||
U32 left_child_index = bvh->used_nodes++;
|
||||
U32 right_child_index = bvh->used_nodes++;
|
||||
|
||||
bvh->nodes[left_child_index].left_first = node->left_first;
|
||||
bvh->nodes[left_child_index].tri_count = left_count;
|
||||
bvh->nodes[right_child_index].left_first = i;
|
||||
bvh->nodes[right_child_index].tri_count = node->tri_count - left_count;
|
||||
node->left_first = left_child_index;
|
||||
node->tri_count = 0;
|
||||
bvh_update_bounds(bvh, left_child_index);
|
||||
bvh_update_bounds(bvh, right_child_index);
|
||||
bvh_subdivide(bvh, left_child_index);
|
||||
bvh_subdivide(bvh, right_child_index);
|
||||
}
|
||||
|
||||
|
||||
__host__ function void
|
||||
bvh_update_bounds(BVH *bvh, U32 node_idx)
|
||||
{
|
||||
BVHNode *node = &bvh->nodes[node_idx];
|
||||
|
||||
node->aabb_min = vec3F32(F32_MAX, F32_MAX, F32_MAX);
|
||||
node->aabb_max = vec3F32(F32_MIN, F32_MIN, F32_MIN);
|
||||
|
||||
U32 first_tri_idx = node->left_first;
|
||||
for(U32 i = 0; i < node->tri_count; i += 1)
|
||||
{
|
||||
U32 leaf_tri_idx = h_tri_indices[first_tri_idx + i];
|
||||
Entity *tri = &h_entities[leaf_tri_idx];
|
||||
node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex0);
|
||||
node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex1);
|
||||
node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex2);
|
||||
node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex0);
|
||||
node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex1);
|
||||
node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex2);
|
||||
}
|
||||
}
|
||||
|
||||
__host__ function void
|
||||
bvh_host_intersect(BVH *bvh, RayF32 *ray, HitRecord *rec, U32 node_idx)
|
||||
{
|
||||
BVHNode *node = &bvh->nodes[node_idx];
|
||||
U32 any_hit = 0;
|
||||
if(h_intersect_aabb(ray, node->aabb_min, node->aabb_max, rec->t))
|
||||
{
|
||||
if(node->tri_count > 0)
|
||||
{
|
||||
//LOG("Hit a leaf node %i with tri count %i \n", node_idx, node->tri_count);
|
||||
for(U32 i = 0; i < node->tri_count; i+=1)
|
||||
{
|
||||
U32 tri_index = h_tri_indices[node->left_first + i];
|
||||
Entity *tri = &h_entities[tri_index];
|
||||
hit_triangle_host(ray, rec, tri);
|
||||
if(rec->hit)
|
||||
{
|
||||
any_hit = 1;
|
||||
//LOG("got hit in bvh_host_intersect loop \n");
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
bvh_host_intersect(bvh, ray, rec, node->left_first);
|
||||
bvh_host_intersect(bvh, ray, rec, node->left_first + 1);
|
||||
}
|
||||
}
|
||||
|
||||
if(!rec->hit)
|
||||
{
|
||||
rec->hit = any_hit;
|
||||
}
|
||||
}
|
||||
27
src/rayt_bvh.cuh
Normal file
27
src/rayt_bvh.cuh
Normal file
@ -0,0 +1,27 @@
|
||||
#pragma once
|
||||
|
||||
// 32 bytes node
|
||||
typedef struct BVHNode BVHNode;
|
||||
struct BVHNode
|
||||
{
|
||||
Vec3F32 aabb_min;
|
||||
Vec3F32 aabb_max;
|
||||
U32 left_first;
|
||||
U32 tri_count;
|
||||
};
|
||||
|
||||
typedef struct BVH BVH;
|
||||
struct BVH
|
||||
{
|
||||
BVHNode *nodes;
|
||||
U32 used_nodes;
|
||||
U32 root_index;
|
||||
U32 max_num_nodes;
|
||||
U32 num_leaf_nodes;
|
||||
U32 minimum_entities_in_leaf;
|
||||
};
|
||||
|
||||
__host__ function BVH bvh_build();
|
||||
__host__ function void bvh_update_bounds(BVH *bvh, U32 node_idx);
|
||||
__host__ function void bvh_subdivide(BVH *bvh, U32 node_idx);
|
||||
__host__ function void bvh_host_intersect(BVH *bvh, RayF32 *ray, HitRecord *rec, U32 node_idx);
|
||||
@ -1,16 +1,11 @@
|
||||
//~ device globals
|
||||
__constant__ CameraF32 camera;
|
||||
__constant__ ViewportF32 viewport;
|
||||
__constant__ ImageF32 image;
|
||||
|
||||
|
||||
__host__ function void
|
||||
write_buffer_to_ppm(Vec3F32 *buffer,
|
||||
U32 image_width,
|
||||
U32 image_height)
|
||||
U32 image_height,
|
||||
const char *filename)
|
||||
{
|
||||
|
||||
const char *filename = "output.ppm";
|
||||
FILE *file = fopen(filename, "w");
|
||||
if(!file)
|
||||
{
|
||||
@ -133,6 +128,95 @@ get_sample_color(RayF32 ray, Entity *entities, curandState *local_rand_state)
|
||||
return out;
|
||||
}
|
||||
|
||||
// Common function for use on both host and device,
|
||||
__host__ __device__ inline function void
|
||||
triangle_intersection_common(RayF32 *ray, HitRecord *rec, Vec3F32 edge1, Vec3F32 edge2,
|
||||
Entity* triangle)
|
||||
{
|
||||
// Möller-Trumbore intersection algorithm
|
||||
Vec3F32 h = cross_V3F32(ray->direction, edge2);
|
||||
F32 closest_so_far = rec->t;
|
||||
F32 a = dot_V3F32(edge1, h);
|
||||
if(a <= -0.001f || a >= 0.001f)
|
||||
{
|
||||
F32 f = 1.0f/a;
|
||||
Vec3F32 s = sub_V3F32(ray->origin, triangle->vertex0);
|
||||
F32 u = f * dot_V3F32(s, h);
|
||||
if(u >= 0.0f && u <= 1.0f)
|
||||
{
|
||||
Vec3F32 q = cross_V3F32(s, edge1);
|
||||
F32 v = f * dot_V3F32(ray->direction, q);
|
||||
if(v >= 0.0f && (u+v) <= 1.0f)
|
||||
{
|
||||
F32 t = f * dot_V3F32(edge2, q);
|
||||
if(t > 0.0001f)
|
||||
{
|
||||
if(t <= closest_so_far)
|
||||
{
|
||||
rec->t = t;
|
||||
rec->hit = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__host__ function void
|
||||
hit_triangle_host(RayF32 *ray, HitRecord *rec, Entity *triangle)
|
||||
{
|
||||
|
||||
Vec3F32 edge1 = sub_V3F32(triangle->vertex1, triangle->vertex0);
|
||||
Vec3F32 edge2 = sub_V3F32(triangle->vertex2, triangle->vertex0);
|
||||
rec->hit = 0;
|
||||
triangle_intersection_common(ray, rec, edge1, edge2, triangle);
|
||||
|
||||
// Set the point of intersection and the normal of the, for now,
|
||||
// vertex0 of the triangle. We have to get the actual surface normal at some point.
|
||||
if(rec->hit)
|
||||
{
|
||||
Vec3F32 intersection_point = ray_point_F32(rec->t, ray);
|
||||
Vec3F32 v0_normal = cross_V3F32(edge1, edge2);
|
||||
F32 norm_inv = 1.0f/h_norm_V3F32(v0_normal);
|
||||
v0_normal = scale_V3F32(norm_inv, v0_normal);
|
||||
|
||||
F32 front_face = dot_V3F32(ray->direction, v0_normal) < 0.0f;
|
||||
rec->normal = front_face ? v0_normal : scale_V3F32(-1.0f, v0_normal);
|
||||
rec->front_face = front_face;
|
||||
rec->point = intersection_point;
|
||||
//LOG("Hit triangle in hit_triangle_host! \n");
|
||||
}
|
||||
}
|
||||
__device__ function HitRecord
|
||||
hit_triangle(RayF32 *ray, Entity *triangle, F32 closest_so_far)
|
||||
{
|
||||
|
||||
Vec3F32 edge1 = sub_V3F32(triangle->vertex1, triangle->vertex0);
|
||||
Vec3F32 edge2 = sub_V3F32(triangle->vertex2, triangle->vertex0);
|
||||
HitRecord out = {0};
|
||||
triangle_intersection_common(ray, &out, edge1, edge2, triangle);
|
||||
|
||||
// Set the point of intersection and the normal of the, for now,
|
||||
// vertex0 of the triangle. We have to get the actual surface normal at some point.
|
||||
if(out.hit)
|
||||
{
|
||||
Vec3F32 intersection_point = ray_point_F32(out.t, ray);
|
||||
Vec3F32 v0_normal = cross_V3F32(edge1, edge2);
|
||||
F32 norm_inv = 1.0f/norm_V3F32(v0_normal);
|
||||
v0_normal = scale_V3F32(norm_inv, v0_normal);
|
||||
|
||||
F32 front_face = dot_V3F32(ray->direction, v0_normal) < 0.0f;
|
||||
out.normal = front_face ? v0_normal : scale_V3F32(-1.0f, v0_normal);
|
||||
out.front_face = front_face;
|
||||
out.point = intersection_point;
|
||||
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
|
||||
__global__ void
|
||||
cuda_init_state(curandState *rand_state)
|
||||
@ -148,3 +232,10 @@ cuda_init_state(curandState *rand_state)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__host__ void
|
||||
cuda_free(void *device_ptr)
|
||||
{
|
||||
cudaError_t cuErr = cudaFree(device_ptr);
|
||||
CUDA_CHECK(cuErr);
|
||||
}
|
||||
@ -38,6 +38,7 @@ enum EntityKind
|
||||
{
|
||||
EntityKind_Nil,
|
||||
EntityKind_Sphere,
|
||||
EntityKind_Tri,
|
||||
Num_EntityKinds
|
||||
};
|
||||
|
||||
@ -56,12 +57,26 @@ struct Entity
|
||||
{
|
||||
EntityKind kind;
|
||||
Vec3F32 center;
|
||||
Vec3F32 vertex0;
|
||||
Vec3F32 vertex1;
|
||||
Vec3F32 vertex2;
|
||||
F32 radius;
|
||||
};
|
||||
|
||||
|
||||
|
||||
__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32 image_height);
|
||||
__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width,
|
||||
U32 image_height, const char *filename);
|
||||
|
||||
__device__ function RayF32 ray_get_F32(F32 x, F32 y, Vec3F32 cam_center,
|
||||
curandState *local_rand_state);
|
||||
|
||||
__host__ __device__ inline function void
|
||||
triangle_intersection_common(RayF32 *ray, HitRecord *rec, Vec3F32 edge1, Vec3F32 edge2,
|
||||
Entity* triangle);
|
||||
__device__ function HitRecord hit_triangle(RayF32 *ray, Entity *triangle,
|
||||
F32 closest_so_far);
|
||||
__host__ function void hit_triangle_host(RayF32 *ray, HitRecord *rec, Entity *triangle);
|
||||
|
||||
__device__ function RayF32 ray_get_F32(F32 x, F32 y, Vec3F32 cam_center, curandState *local_rand_state);
|
||||
__global__ void cuda_init_state(curandState *rand_state);
|
||||
__host__ void cuda_free(void *device_ptr);
|
||||
BIN
timeBuild.ctm
BIN
timeBuild.ctm
Binary file not shown.
Loading…
Reference in New Issue
Block a user