working bvh on CPU

2025-05-02 12:38:43 +02:00 · 2025-05-02 12:38:43 +02:00 · 254cb069a3
commit 254cb069a3
parent be0688fa9f
13 changed files with 777 additions and 104 deletions
--- a/.gitignore
+++ b/.gitignore
@ -115,3 +115,5 @@ build/
 *.exr
 *.bmp
 *.png
+*.sublime*
+*.ctm)
--- a/build.bat
+++ b/build.bat
@ -15,7 +15,9 @@ set CudaRemoveWarnings=-diag-suppress 177

 IF NOT EXIST .\build mkdir .\build
 pushd .\build
-nvcc %CudaSources% %CudaRemoveWarnings% -o program.exe
+
+@rem nvcc %CudaRemoveWarnings% -G -g -lineinfo -o program.exe %CudaSources%
+nvcc %CudaRemoveWarnings%  -o program.exe %CudaSources%

 set LastError=%ERRORLEVEL%
 popd
--- a/src/base_core.cu
+++ b/src/base_core.cu
@ -0,0 +1,33 @@
+
+__host__ function F64
+get_cpu_frequency() 
+{
+  LARGE_INTEGER freq;
+  QueryPerformanceFrequency(&freq);
+  U64 start_tsc = __rdtsc();
+  Sleep(100);
+  U64 end_tsc = __rdtsc();
+  F64 cyclers_per_ms = (F64)(end_tsc - start_tsc) / 100.0;
+  return cyclers_per_ms * 1000.0; // Cycles per second
+}
+
+__host__ function void 
+timer()
+{
+  g_cpu_timer.second_to_last_cycles = g_cpu_timer.last_cycles;
+  g_cpu_timer.last_cycles = __rdtsc();
+}
+
+__host__ function F64
+timer_elapsed()
+{
+  U64 cycles = g_cpu_timer.last_cycles - g_cpu_timer.second_to_last_cycles;
+  F64 elapsed = (F64)cycles / (g_cpu_timer.cpu_freq/1000.0);
+  return elapsed; // ms
+}
+
+
+function F64 test_function()
+{
+  return 34.20;
+}
--- a/src/base_core.cuh
+++ b/src/base_core.cuh
@ -0,0 +1,60 @@
+#pragma once 
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <intrin.h>
+
+#include <stdio.h>
+#include <stdint.h>
+#include <float.h>
+#include <math.h>
+
+#include <cuda_runtime.h>
+#include <curand_kernel.h>
+
+//------------------------------------------------------------------------------------------
+//~ base defines
+
+#define host_global static
+#define function static
+
+//~ typedefs 
+typedef int32_t   S32;
+typedef uint32_t  U32;
+typedef uint64_t  U64;
+typedef double    F64;
+typedef float     F32;
+
+//~ utility defines
+
+#define CUDA_CHECK(err) do { \
+  if (err != cudaSuccess) { \
+    fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \
+        cudaGetErrorString(err), __FILE__, __LINE__); \
+    exit(EXIT_FAILURE); \
+  } \
+} while (0)
+
+#define LOG printf
+
+#define F32_MAX FLT_MAX
+#define F32_MIN -FLT_MAX
+
+
+typedef struct CPUTimer CPUTimer;
+struct CPUTimer 
+{
+  U64 last_cycles;
+  U64 second_to_last_cycles;
+  F64 elapsed;
+  F64 cpu_freq;
+};
+
+
+//------------------------------------------------------------------------------------------
+//~ timing
+__host__ function F64 get_cpu_frequency();
+__host__ function void timer();
+__host__ function F64  timer_elapsed();
+function F64 test_function();
+
--- a/src/base_core.h
+++ b/src/base_core.h
@ -1,35 +0,0 @@
-#pragma once 
-
-#include <stdio.h>
-#include <stdint.h>
-#include <float.h>
-#include <math.h>
-
-#include <curand_kernel.h>
-
-//------------------------------------------------------------------------------------------
-//~ base defines
-
-#define host_global static
-#define function static
-
-//~ typedefs 
-typedef int32_t S32;
-typedef uint32_t U32;
-typedef uint64_t U64;
-typedef float F32;
-
-//~ utility defines
-
-#define CUDA_CHECK(err) do { \
-  if (err != cudaSuccess) { \
-    fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \
-        cudaGetErrorString(err), __FILE__, __LINE__); \
-    exit(EXIT_FAILURE); \
-  } \
-} while (0)
-
-#define LOG printf
-
-#define F32_MAX FLT_MAX
-#define F32_MIN FLT_MIN
--- a/src/base_math.cu
+++ b/src/base_math.cu
@ -1,4 +1,24 @@

+__host__ inline function Vec3F32 
+h_max_V3F32(Vec3F32 a, Vec3F32 b)
+{
+  Vec3F32 out = {0};
+  out.x = a.x > b.x ? a.x : b.x;
+  out.y = a.y > b.y ? a.y : b.y;
+  out.z = a.z > b.z ? a.z : b.z;
+  return out;
+}
+
+__host__ inline function Vec3F32 
+h_min_V3F32(Vec3F32 a, Vec3F32 b)
+{
+  Vec3F32 out = {0};
+  out.x = a.x < b.x ? a.x : b.x;
+  out.y = a.y < b.y ? a.y : b.y;
+  out.z = a.z < b.z ? a.z : b.z;
+  return out;
+}
+
 __host__ __device__ inline function Vec3F32 
 vec3F32(F32 x, F32 y, F32 z)
 {
@ -40,6 +60,16 @@ scale_V3F32(F32 s, Vec3F32 v)
  return out;
 }

+__host__ __device__ inline function Vec3F32 
+cross_V3F32(Vec3F32 a, Vec3F32 b)
+{
+  Vec3F32 out = {0};
+  out.x = a.y*b.z-a.z*b.y;
+  out.y = a.z*b.x-a.x*b.z;
+  out.z = a.x*b.y-a.y*b.x;
+  return out;
+}
+
 __host__ __device__ inline function F32 
 dot_V3F32(Vec3F32 a, Vec3F32 b)
 {
@ -48,9 +78,9 @@ dot_V3F32(Vec3F32 a, Vec3F32 b)


 __host__ __device__ inline function Vec3F32 
-ray_point_F32(F32 t, RayF32 ray)
+ray_point_F32(F32 t, RayF32 *ray)
 {
-  Vec3F32 out = add_V3F32(ray.origin, scale_V3F32(t, ray.direction));
+  Vec3F32 out = add_V3F32(ray->origin, scale_V3F32(t, ray->direction));
  return out;
 }

@ -136,4 +166,66 @@ clamp_V3F32(RngF32 rng, Vec3F32 v)
  return out;
 }

+__host__ function F32 
+rand_uniform_host_F32() 
+{
+  F32 rand_max = (F32)RAND_MAX;
+  U32 r = rand();
+  F32 rf = (F32)r;
+  F32 out = rf/rand_max;
+  return out;
+}

+__host__ function Vec3F32 
+vec3_rand_host_F32() 
+{
+  Vec3F32 out = {0};
+  out.x = rand_uniform_host_F32();
+  out.y = rand_uniform_host_F32();
+  out.z = rand_uniform_host_F32();
+  return out;
+}
+
+__host__ function U32 
+h_intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far)
+{
+  F32 tx1 = (bmin.x - ray->origin.x) / ray->direction.x;
+  F32 tx2 = (bmax.x - ray->origin.x) / ray->direction.x;
+  F32 tmin = MIN(tx1, tx2);
+  F32 tmax = MAX(tx1, tx2);
+  
+  F32 ty1 = (bmin.y - ray->origin.y) / ray->direction.y;
+  F32 ty2 = (bmax.y - ray->origin.y) / ray->direction.y;
+  tmin = MAX(tmin, MIN(ty1, ty2));
+  tmax = MIN(tmax, MAX(ty1, ty2));
+  
+  F32 tz1 = (bmin.z - ray->origin.z) / ray->direction.z;
+  F32 tz2 = (bmax.z - ray->origin.z) / ray->direction.z;
+  tmin = MAX(tmin, MIN(tz1, tz2));
+  tmax = MIN(tmax, MAX(tz1, tz2));
+
+  U32 out = tmax >= tmin && tmin < closest_so_far && tmax > 0.0f;
+  return out;
+}
+
+__device__ function U32
+intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far)
+{
+  F32 tx1 = (bmin.x - ray->origin.x) / ray->direction.x;
+  F32 tx2 = (bmax.x - ray->origin.x) / ray->direction.x;
+  F32 tmin = fminf(tx1, tx2);
+  F32 tmax = fmaxf(tx1, tx2);
+  
+  F32 ty1 = (bmin.y - ray->origin.y) / ray->direction.y;
+  F32 ty2 = (bmax.y - ray->origin.y) / ray->direction.y;
+  tmin = fminf(tmin, fminf(ty1, ty2));
+  tmax = fmaxf(tmax, fmaxf(ty1, ty2));
+  
+  F32 tz1 = (bmin.z - ray->origin.z) / ray->direction.z;
+  F32 tz2 = (bmax.z - ray->origin.z) / ray->direction.z;
+  tmin = fminf(tmin, fminf(tz1, tz2));
+  tmax = fmaxf(tmax, fmaxf(tz1, tz2));
+
+  U32 out = tmax >= tmin && tmin < closest_so_far && tmax > 0.0f;
+  return out;
+}
--- a/src/base_math.cuh
+++ b/src/base_math.cuh
@ -1,5 +1,9 @@
 #pragma once

+#define MAX(a, b) (a) > (b) ? (a) : (b)
+#define MIN(a, b) (a) > (b) ? (b) : (a)
+
+
 //------------------------------------------------------------------------------------------
 //~ structs

@ -40,16 +44,21 @@ struct RayF32
 //~ forward declarations


+
+__host__ inline function Vec3F32 h_max_V3F32(Vec3F32 a, Vec3F32 b);
+__host__ inline function Vec3F32 h_min_V3F32(Vec3F32 a, Vec3F32 b);
+  
 __host__ __device__ inline function Vec3F32 vec3F32(F32 x, F32 y, F32 z);
 __host__ __device__ inline function Vec3F32 add_V3F32(Vec3F32 a, Vec3F32 b);
 __host__ __device__ inline function Vec3F32 sub_V3F32(Vec3F32 a, Vec3F32 b);
 __host__ __device__ inline function Vec3F32 scale_V3F32(F32 s, Vec3F32 v);
-
+__host__ __device__ inline function Vec3F32 cross_V3F32(Vec3F32 a, Vec3F32 b);

 __host__ __device__ inline function Vec3F32 ray_point_F32(F32 t, RayF32 *ray);
 __host__ __device__ inline function F32 mag_V3F32(Vec3F32 a);
 __host__ __device__ inline function F32 dot_V3F32(Vec3F32 a, Vec3F32 b);
 __device__  inline function F32 norm_V3F32(Vec3F32 a);
+__host__ inline function F32 h_norm_V3F32(Vec3F32 a);

 __host__ __device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b);

@ -57,7 +66,14 @@ __device__ function Vec3F32 rand_uniform_V3F32(curandState *local_rand_state);
 __device__ function Vec3F32 rand_uniform_range_V3F32(RngF32 rng, curandState *local_rand_state);

 __host__ function F32 linear_to_gamma(F32 val);
-__host__ inline function F32 h_norm_V3F32(Vec3F32 a);

 __device__ function F32 clamp_F32(RngF32 rng, F32 val);
 __device__ function Vec3F32 clamp_V3F32(RngF32 rng, Vec3F32 v);
+
+__host__ function F32 rand_uniform_host_F32();
+__host__ function Vec3F32 vec3_rand_host_F32();
+
+__host__ function U32 
+h_intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far);
+__device__ function U32
+intersect_aabb(RayF32 *ray, Vec3F32 bmin, Vec3F32 bmax, F32 closest_so_far);
--- a/src/main.cu
+++ b/src/main.cu
@ -1,13 +1,20 @@
-#include "base_core.h"
-#include "base_math.h"
-#include "rayt_core.h"
+#define RENDER_ON_CPU 1
+#define BVH_USE_CPU 1
+#define DEBUG_DRAW_BBOX 0
+
+//------------------------------------------------------------------------------------------
+//~ header includes
+#include "base_core.cuh"
+#include "base_math.cuh"
+#include "rayt_core.cuh"
+#include "rayt_bvh.cuh"

 //------------------------------------------------------------------------------------------
 //~ Program parameter defines
 #define NUM_BLOCKS   1
 #define NUM_THREADS  32

-#define IMAGE_WIDTH 1920
+#define IMAGE_WIDTH 1024
 #define ASPECT_RATIO 1.7778f // 16/9

 #define CURAND_SEED 1984
@ -17,15 +24,30 @@
 #define SAMPLES_PER_PIXEL 64
 #define MAX_DIFFUSE_DEPTH 8

-#include "base_math.c"
-#include "rayt_core.c"
-
 //------------------------------------------------------------------------------------------
 //~ host globals
+host_global CPUTimer g_cpu_timer;
 host_global CameraF32 h_camera;
 host_global ViewportF32 h_viewport;
 host_global ImageF32 h_image;
+host_global Entity *h_entities = 0;
+host_global U32 *h_tri_indices = 0;
+host_global BVH h_bvh;

+//~ device globals
+__constant__ CameraF32 camera;
+__constant__ ViewportF32 viewport;
+__constant__ ImageF32 image;
+
+//------------------------------------------------------------------------------------------
+//~ implementation includes
+#include "base_core.cu"
+#include "base_math.cu"
+#include "rayt_core.cu"
+#include "rayt_bvh.cu" 
+
+//------------------------------------------------------------------------------------------
+//~ routines
 __global__ void 
 cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state) 
 {
@ -52,15 +74,11 @@ cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state)

 }

-//------------------------------------------------------------------------------------------
-//~ Main
-int main() 
+__host__ function void 
+set_up_scene_globals() 
 {
-  cudaError_t cuErr;
-
  //////////////////////////////////////////////////////////////////////////////////////////
  // Define image, camera and viewport on the CPU
-  // and then copy to constant globals on device
  // -------------
  h_image = {0};
  h_image.width = IMAGE_WIDTH;
@ -68,21 +86,17 @@ int main()
  U32 height = U32((F32)h_image.width/h_image.aspect_ratio) + 1;
  h_image.height = height < 1 ? 1 : height;
  h_image.total_num_pixels = h_image.width * h_image.height;
-  cuErr = cudaMemcpyToSymbol(image, &h_image, sizeof(ImageF32), 0, cudaMemcpyHostToDevice);
-  CUDA_CHECK(cuErr);
+  
  LOG("Image size %i x %i, aspect ratio: %.4f \n", 
      h_image.width, h_image.height, h_image.aspect_ratio);

  // -------------
  h_camera = {0};
-  h_camera.focal_length = 1.0f;
+  h_camera.focal_length = 3.0f;
+  h_camera.center = vec3F32(0.0f, 0.0f, 18.0f);
  F32 samples_per_pixel = (F32)SAMPLES_PER_PIXEL;
  h_camera.pixel_sample_scale = 1.0f/samples_per_pixel;
  
-  cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0, 
-      cudaMemcpyHostToDevice);
-  CUDA_CHECK(cuErr);
-
  // -------------
  h_viewport = {0};
  h_viewport.height = 2.0f;
@ -108,43 +122,36 @@ int main()
  h_viewport.pixel_origin = add_V3F32(viewport_upper_left, 
      scale_V3F32(0.5f, pixel_delta_sum));

-  cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0, 
-      cudaMemcpyHostToDevice);
-  CUDA_CHECK(cuErr);
  LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n", 
      h_viewport.width, h_viewport.height, h_viewport.aspect_ratio);

-
-  //////////////////////////////////////////////////////////////////////////////////////////
-  // Setup entities and copy to device
-  U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES;
-  Entity *h_entities = (Entity *)malloc(entity_list_byte_size);
-  memset(h_entities, 0, entity_list_byte_size);
-  for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
-  {
-    // Init all entities to nil
-    h_entities[i].kind = EntityKind_Nil;
 }

-  // Manual spheres
+__host__ function void
+copy_to_device_and_launch_cuda_main()
 {
-    h_entities[0].kind = EntityKind_Sphere;
-    h_entities[0].center = vec3F32(0.0f, 0.0f, -1.0f);
-    h_entities[0].radius = 0.5f;
+  cudaError_t cuErr;

-    h_entities[1].kind = EntityKind_Sphere;
-    h_entities[1].center = vec3F32(0.0f, -100.5f, -1.0f);
-    h_entities[1].radius = 100.0f;
-  }
+  // Copy constants
+  cuErr = cudaMemcpyToSymbol(image, &h_image, sizeof(ImageF32), 0, cudaMemcpyHostToDevice);
+  CUDA_CHECK(cuErr);

-  // Copy to device
+  cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0, 
+      cudaMemcpyHostToDevice);
+  CUDA_CHECK(cuErr);
+
+  cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0, 
+      cudaMemcpyHostToDevice);
+  CUDA_CHECK(cuErr);
+
+
+  // Create and copy buffers to device
  Entity *entities = 0;
+  U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES;
  cuErr = cudaMalloc(&entities, entity_list_byte_size);
  CUDA_CHECK(cuErr);
  cuErr = cudaMemcpy(entities, h_entities, entity_list_byte_size, cudaMemcpyHostToDevice);
  CUDA_CHECK(cuErr);
-
-
  //////////////////////////////////////////////////////////////////////////////////////////
  // Define grid, blocks, threads and any buffers such as pixel data and random state
  // ------------
@ -201,14 +208,241 @@ int main()
      cudaMemcpyDeviceToHost);
  CUDA_CHECK(cuErr);

-  write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height);
+  write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, "gpu_output.ppm");

-  cuErr = cudaFree(pixel_buffer);
-  CUDA_CHECK(cuErr);
-  cuErr = cudaFree(entities);
-  CUDA_CHECK(cuErr);
-  cuErr = cudaFree(rand_state);
-  CUDA_CHECK(cuErr);
+  cuda_free(pixel_buffer);
+  cuda_free(entities);
+  cuda_free(rand_state);
+
+  free(h_pixel_buffer);
+}
+
+__host__ function void
+set_up_entities() 
+{
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // Setup entities
+  U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES;
+  h_entities = (Entity *)malloc(entity_list_byte_size);
+  memset(h_entities, 0, entity_list_byte_size);
+
+#if 0  
+  for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
+  {
+    // Init all entities to nil
+    h_entities[i].kind = EntityKind_Nil;
+  }
+#endif
+
+  // Random triangles
+  {
+    h_tri_indices = (U32 *)malloc(sizeof(U32)*MAX_NUM_ENTITIES);
+    for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
+    {
+      Vec3F32 r0 = vec3_rand_host_F32();
+      Vec3F32 r1 = vec3_rand_host_F32();
+      Vec3F32 r2 = vec3_rand_host_F32();
+      // Put the first vertex within a 10x10x10 cube centered on the origin.
+      Vec3F32 v0 = scale_V3F32(9.0f, r0);
+      v0 = sub_V3F32(v0, vec3F32(5.0f, 5.0f, 5.0f));
+      h_entities[i].kind = EntityKind_Tri;
+      h_entities[i].vertex0 = v0;
+      // The other two vertices are relative to the first.
+      h_entities[i].vertex1 = add_V3F32(v0, r1);
+      h_entities[i].vertex2 = add_V3F32(v0, r2);
+      Vec3F32 center = add_V3F32(h_entities[i].vertex0, 
+          add_V3F32(h_entities[i].vertex1, h_entities[i].vertex2));
+      center = scale_V3F32(0.3333f, center); 
+      h_entities[i].center = center;
+      h_tri_indices[i] = i;
+      #if 0
+        LOG("tri index[%i] = %i before bvh construction \n", i, h_tri_indices[i]);
+      #else
+      #endif
+    }
+  }
+
+}
+
+__host__ function void
+set_up_bvh()
+{
+  h_bvh = bvh_build();
+
+#if 0
+  {
+    U32 total_leaf_nodes = 0;
+    for(U32 i = 0; i < h_bvh.max_num_nodes; i+=1)
+    {
+      BVHNode *node = &h_bvh.nodes[i];
+      if(node->tri_count)
+      {
+        LOG("\n----\n");
+        LOG("Leaf node with idx %i with tri count %i \n", i, node->tri_count);
+        total_leaf_nodes += 1;
+        LOG("Index into triangle index list, node->left_first: %i \n", node->left_first);
+        LOG("leaf node aabb_min = (%.2f %.2f %.2f) \n", 
+          node->aabb_min.x, node->aabb_min.y, node->aabb_min.z);
+        LOG("leaf node aabb_max = (%.2f %.2f %.2f) \n", 
+          node->aabb_max.x, node->aabb_max.y, node->aabb_max.z);
+        Entity *tri = &h_entities[h_tri_indices[node->left_first]];
+        LOG("Triangle v0: (%.2f, %.2f %.2f) \n", 
+          tri->vertex0.x, tri->vertex0.y, tri->vertex0.z);
+        LOG("Triangle v1: (%.2f, %.2f %.2f) \n", 
+          tri->vertex1.x, tri->vertex1.y, tri->vertex1.z);
+        LOG("Triangle v2: (%.2f, %.2f %.2f) \n", 
+          tri->vertex2.x, tri->vertex2.y, tri->vertex2.z);
+        LOG("----\n\n");
+      }
+    }
+    LOG("Total number of leaf nodes %i \n", total_leaf_nodes);
+  }
+#endif
+}
+
+//------------------------------------------------------------------------------------------
+//~ Main
+int main() 
+{
+  g_cpu_timer = {0};
+  g_cpu_timer.cpu_freq = get_cpu_frequency();
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  set_up_scene_globals();
+  set_up_entities();
+  set_up_bvh();
+
+#if RENDER_ON_CPU
+  LOG("Starting CPU rendering \n");
+  //cudaEventRecord(start, 0);
+  timer();
+  // Render "ground truth" on CPU for validation
+  {
+    U64 num_pixels = h_image.width*h_image.height;
+    U64 pixel_buffer_size = num_pixels*sizeof(Vec3F32);
+    Vec3F32 *host_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size);
+    for(U32 y = 0; y < h_image.height; y += 1)
+    {
+      for(U32 x = 0; x < h_image.width; x += 1)
+      {
+        U32 idx = y * h_image.width + x;
+
+        Vec3F32 px_u = scale_V3F32((F32)x, h_viewport.pixel_delta_u);
+        Vec3F32 px_v = scale_V3F32((F32)y, h_viewport.pixel_delta_v);
+        Vec3F32 pixel_center = add_V3F32(h_viewport.pixel_origin, add_V3F32(px_u, px_v));
+
+        Vec3F32 ray_direction = sub_V3F32(pixel_center, h_camera.center);
+        RayF32 ray = {0};
+        ray.origin = h_camera.center;
+        ray.direction = ray_direction;
+        HitRecord hit_rec = {0};
+        hit_rec.t = F32_MAX;
+#if BVH_USE_CPU
+        {
+          bvh_host_intersect(&h_bvh, &ray, &hit_rec, h_bvh.root_index);
+          if(hit_rec.hit)
+          {
+            //LOG("BVH hit triangle! hit_rec->normal: (%.2f, %.2f, %.2f) \n",
+            //    hit_rec.normal.x, hit_rec.normal.y, hit_rec.normal.z);
+          }
+        }
+#else
+        {
+          HitRecord temp_hit_rec = {0};
+          temp_hit_rec.t = hit_rec.t;
+          for (U32 i = 0; i < MAX_NUM_ENTITIES; i+=1)
+          {
+            Entity *tri = &h_entities[i];
+            hit_triangle_host(&ray, &temp_hit_rec, tri);
+            if(temp_hit_rec.hit)
+            {
+              hit_rec = temp_hit_rec;
+            }
+          }
+        }
+#endif
+
+        Vec3F32 pixel_color = {0.0f, 0.0f, 0.0f};
+        if(hit_rec.hit)
+        {
+          // Paint entity
+          pixel_color = add_V3F32(hit_rec.normal, vec3F32(1.0f, 1.0f, 1.0f));
+          pixel_color = scale_V3F32(0.5f, pixel_color);
+        }
+        else 
+        {
+          // Paint background gradient
+          F32 norm = h_norm_V3F32(ray.direction);
+          Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction);
+          Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f);
+          Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f);
+
+          // Lerp between white and light blue depending on y position
+          F32 blend = 0.5f*(unit_dir.y + 1.0f);
+
+          pixel_color = lerp_V3F32(blend, white, light_blue);
+
+          #if DEBUG_DRAW_BBOX
+          {
+            U32 do_debug_pixel = 0;
+            Vec3F32 color_index = {0.0f, 0.0f, 0.0f};
+            for(U32 i = 0; i < h_bvh.max_num_nodes; i += 1)
+            {
+              BVHNode *node = &h_bvh.nodes[i];
+              if(node->tri_count > 0 && h_intersect_aabb(&ray, node->aabb_min, node->aabb_max, F32_MAX))
+              {
+                do_debug_pixel = 1;
+              }
+              if(do_debug_pixel) 
+              {
+                color_index.x = 1.0f;
+                color_index.y = 0.0f;
+                color_index.z = 0.0f;
+              }
+            }
+
+            if(do_debug_pixel)
+            {
+              pixel_color = color_index;
+            }
+          }
+          #endif
+
+        }
+        
+        // Debug draw bvh
+
+
+        host_pixel_buffer[idx] = pixel_color;
+
+      }
+    }
+    //cudaEventRecord(stop, 0);
+    //cudaEventSynchronize(stop);
+    timer();
+    
+    {
+      F32 elapsed = timer_elapsed();
+      //cudaEventElapsedTime(&elapsed, start, stop);
+      LOG("Elapsed time for CPU rendering: %.2f ms \n", elapsed);
+      U32 bvh_used = 0;
+#if BVH_USE_CPU
+      bvh_used = 1;
+#endif
+      LOG("BVH = %i \n", bvh_used);
+    }
+
+    cudaEventDestroy(stop);
+    cudaEventDestroy(start);
+
+    
+    write_buffer_to_ppm(host_pixel_buffer, h_image.width, h_image.height, "cpu_output.ppm");
+    free(host_pixel_buffer);
+  }
+#endif

  return 0;
 }
--- a/src/rayt_bvh.cu
+++ b/src/rayt_bvh.cu
@ -0,0 +1,136 @@
+
+__host__ function BVH
+bvh_build()
+{
+  U64 max_bvh_nodes = 2 * MAX_NUM_ENTITIES - 1;
+  BVH bvh = {0};
+  bvh.nodes = (BVHNode *)_aligned_malloc(sizeof(BVHNode)*max_bvh_nodes, 64);
+  bvh.max_num_nodes = max_bvh_nodes;
+  bvh.used_nodes = 2; // Skip by two, TODO(anton): Comment this.
+  bvh.minimum_entities_in_leaf = 2;
+  U32 root_index = 0;
+  BVHNode *root = &bvh.nodes[root_index];
+  root->left_first = 0; 
+  root->tri_count = MAX_NUM_ENTITIES;
+
+  bvh_update_bounds(&bvh, 0);
+  
+  bvh_subdivide(&bvh, 0);
+
+  return bvh;
+}
+
+__host__ function void
+bvh_subdivide(BVH *bvh, U32 node_idx)
+{
+  BVHNode *node = &bvh->nodes[node_idx];
+  if(node->tri_count <= bvh->minimum_entities_in_leaf) 
+  {
+    return;
+  }
+
+  // Split box
+  Vec3F32 extent = sub_V3F32(node->aabb_max, node->aabb_min);
+  U32 axis = 0;
+  if(extent.y > extent.x) axis = 1;
+  if(extent.z > extent.v[axis]) axis = 2;
+  F32 split_pos = node->aabb_min.v[axis] + extent.v[axis] * 0.5f;
+
+  // Sorting into left and right partitions
+  U32 i = node->left_first;
+  U32 j = node->tri_count + i - 1;
+  while (i <= j)
+  {
+    U32 tri_idx = h_tri_indices[i];
+    if(h_entities[tri_idx].center.v[axis] < split_pos)
+    {
+      i += 1;
+    }
+    else
+    {
+      h_tri_indices[i] = h_tri_indices[j];
+      h_tri_indices[j] = tri_idx;
+      j -= 1;
+    }
+  }
+
+  U32 left_count = i - node->left_first;
+  if(left_count == 0 || left_count == node->tri_count)
+  {
+    // One of the partitions are empty, don't subdivide further.
+    return;
+  }
+
+  // Create child nodes and subdivide
+  U32 left_child_index  = bvh->used_nodes++;
+  U32 right_child_index = bvh->used_nodes++;
+
+  bvh->nodes[left_child_index].left_first = node->left_first;
+  bvh->nodes[left_child_index].tri_count = left_count;
+  bvh->nodes[right_child_index].left_first = i;
+  bvh->nodes[right_child_index].tri_count = node->tri_count - left_count;
+  node->left_first = left_child_index;
+  node->tri_count = 0;
+  bvh_update_bounds(bvh, left_child_index);
+  bvh_update_bounds(bvh, right_child_index);
+  bvh_subdivide(bvh, left_child_index);
+  bvh_subdivide(bvh, right_child_index);
+}
+
+
+__host__ function void 
+bvh_update_bounds(BVH *bvh, U32 node_idx)
+{
+  BVHNode *node = &bvh->nodes[node_idx];
+
+  node->aabb_min = vec3F32(F32_MAX, F32_MAX, F32_MAX);
+  node->aabb_max = vec3F32(F32_MIN, F32_MIN, F32_MIN);
+  
+  U32 first_tri_idx = node->left_first;
+  for(U32 i = 0; i < node->tri_count; i += 1)
+  {
+    U32 leaf_tri_idx = h_tri_indices[first_tri_idx + i];
+    Entity *tri = &h_entities[leaf_tri_idx];
+    node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex0);
+    node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex1);
+    node->aabb_min = h_min_V3F32(node->aabb_min, tri->vertex2);
+    node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex0);
+    node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex1);
+    node->aabb_max = h_max_V3F32(node->aabb_max, tri->vertex2);
+  } 
+}
+
+__host__ function void
+bvh_host_intersect(BVH *bvh, RayF32 *ray, HitRecord *rec, U32 node_idx)
+{
+  BVHNode *node = &bvh->nodes[node_idx];
+  U32 any_hit = 0;
+  if(h_intersect_aabb(ray, node->aabb_min, node->aabb_max, rec->t))
+  {
+    if(node->tri_count > 0)
+    {
+      //LOG("Hit a leaf node %i with tri count %i \n", node_idx, node->tri_count);
+      for(U32 i = 0; i < node->tri_count; i+=1)
+      {
+        U32 tri_index = h_tri_indices[node->left_first + i];
+        Entity *tri = &h_entities[tri_index];
+        hit_triangle_host(ray, rec, tri);
+        if(rec->hit)
+        { 
+          any_hit = 1;
+          //LOG("got hit in bvh_host_intersect loop \n");
+        }
+      }
+    }
+    else
+    {
+      bvh_host_intersect(bvh, ray, rec, node->left_first);
+      bvh_host_intersect(bvh, ray, rec, node->left_first + 1);
+    }
+  }
+
+  if(!rec->hit) 
+  {
+     rec->hit = any_hit;
+  }
+}
--- a/src/rayt_bvh.cuh
+++ b/src/rayt_bvh.cuh
@ -0,0 +1,27 @@
+#pragma once
+
+// 32 bytes node
+typedef struct BVHNode BVHNode;
+struct BVHNode
+{
+  Vec3F32 aabb_min;
+  Vec3F32 aabb_max;
+  U32 left_first;
+  U32 tri_count;
+};
+
+typedef struct BVH BVH;
+struct BVH
+{
+  BVHNode *nodes;
+  U32 used_nodes;
+  U32 root_index;
+  U32 max_num_nodes;
+  U32 num_leaf_nodes;
+  U32 minimum_entities_in_leaf;                    
+};
+
+__host__ function BVH bvh_build();
+__host__ function void bvh_update_bounds(BVH *bvh, U32 node_idx);
+__host__ function void bvh_subdivide(BVH *bvh, U32 node_idx);
+__host__ function void bvh_host_intersect(BVH *bvh, RayF32 *ray, HitRecord *rec, U32 node_idx);
--- a/src/rayt_core.cu
+++ b/src/rayt_core.cu
@ -1,16 +1,11 @@
-//~ device globals
-__constant__ CameraF32 camera;
-__constant__ ViewportF32 viewport;
-__constant__ ImageF32 image;
-

 __host__ function void 
 write_buffer_to_ppm(Vec3F32 *buffer, 
                    U32 image_width, 
-                    U32 image_height)
+                    U32 image_height,
+                    const char *filename)
 {

-  const char *filename = "output.ppm";
  FILE *file = fopen(filename, "w");
  if(!file) 
  {
@ -133,6 +128,95 @@ get_sample_color(RayF32 ray, Entity *entities, curandState *local_rand_state)
  return out;
 }

+// Common function for use on both host and device, 
+__host__ __device__ inline function void
+triangle_intersection_common(RayF32 *ray, HitRecord *rec, Vec3F32 edge1, Vec3F32 edge2, 
+                             Entity* triangle)
+{
+  // Möller-Trumbore intersection algorithm
+  Vec3F32 h = cross_V3F32(ray->direction, edge2);
+  F32 closest_so_far = rec->t;
+  F32 a = dot_V3F32(edge1, h);
+  if(a <= -0.001f || a >= 0.001f)
+  {
+    F32 f = 1.0f/a;
+    Vec3F32 s = sub_V3F32(ray->origin, triangle->vertex0);
+    F32 u = f * dot_V3F32(s, h);
+    if(u >= 0.0f && u <= 1.0f)
+    {
+      Vec3F32 q = cross_V3F32(s, edge1);
+      F32 v = f * dot_V3F32(ray->direction, q);
+      if(v >= 0.0f && (u+v) <= 1.0f)
+      {
+        F32 t = f * dot_V3F32(edge2, q);
+        if(t > 0.0001f) 
+        {
+          if(t <= closest_so_far)
+          {
+            rec->t = t;
+            rec->hit = 1;
+          }
+        }
+      }
+    }
+  }
+
+}
+
+__host__ function void
+hit_triangle_host(RayF32 *ray, HitRecord *rec, Entity *triangle)
+{
+
+  Vec3F32 edge1 = sub_V3F32(triangle->vertex1, triangle->vertex0);
+  Vec3F32 edge2 = sub_V3F32(triangle->vertex2, triangle->vertex0);
+  rec->hit = 0;
+  triangle_intersection_common(ray, rec, edge1, edge2, triangle);
+
+  // Set the point of intersection and the normal of the, for now,
+  // vertex0 of the triangle. We have to get the actual surface normal at some point.
+  if(rec->hit)
+  {
+    Vec3F32 intersection_point = ray_point_F32(rec->t, ray);
+    Vec3F32 v0_normal = cross_V3F32(edge1, edge2);
+    F32 norm_inv = 1.0f/h_norm_V3F32(v0_normal);
+    v0_normal = scale_V3F32(norm_inv, v0_normal);
+
+    F32 front_face = dot_V3F32(ray->direction, v0_normal) < 0.0f;
+    rec->normal = front_face ? v0_normal : scale_V3F32(-1.0f, v0_normal);
+    rec->front_face = front_face;
+    rec->point = intersection_point;
+    //LOG("Hit triangle in hit_triangle_host! \n");
+  }
+}
+__device__ function HitRecord 
+hit_triangle(RayF32 *ray, Entity *triangle, F32 closest_so_far)
+{
+
+  Vec3F32 edge1 = sub_V3F32(triangle->vertex1, triangle->vertex0);
+  Vec3F32 edge2 = sub_V3F32(triangle->vertex2, triangle->vertex0);
+  HitRecord out = {0};
+  triangle_intersection_common(ray, &out, edge1, edge2, triangle);
+
+  // Set the point of intersection and the normal of the, for now,
+  // vertex0 of the triangle. We have to get the actual surface normal at some point.
+  if(out.hit)
+  {
+    Vec3F32 intersection_point = ray_point_F32(out.t, ray);
+    Vec3F32 v0_normal = cross_V3F32(edge1, edge2);
+    F32 norm_inv = 1.0f/norm_V3F32(v0_normal);
+    v0_normal = scale_V3F32(norm_inv, v0_normal);
+
+    F32 front_face = dot_V3F32(ray->direction, v0_normal) < 0.0f;
+    out.normal = front_face ? v0_normal : scale_V3F32(-1.0f, v0_normal);
+    out.front_face = front_face;
+    out.point = intersection_point;
+
+  }
+
+  return out;
+}
+
+

 __global__ void 
 cuda_init_state(curandState *rand_state) 
@ -148,3 +232,10 @@ cuda_init_state(curandState *rand_state)
  }

 }
+
+__host__ void
+cuda_free(void *device_ptr)
+{
+  cudaError_t cuErr = cudaFree(device_ptr);
+  CUDA_CHECK(cuErr);
+}
--- a/src/rayt_core.cuh
+++ b/src/rayt_core.cuh
@ -38,6 +38,7 @@ enum EntityKind
 {
  EntityKind_Nil,
  EntityKind_Sphere,
+  EntityKind_Tri,
  Num_EntityKinds
 };

@ -56,12 +57,26 @@ struct Entity
 {
  EntityKind kind;
  Vec3F32 center;
+  Vec3F32 vertex0;
+  Vec3F32 vertex1;
+  Vec3F32 vertex2;
  F32 radius;
 };



-__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32 image_height);
+__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, 
+                                           U32 image_height, const char *filename);
+
+__device__ function RayF32 ray_get_F32(F32 x, F32 y, Vec3F32 cam_center, 
+                                       curandState *local_rand_state);
+
+__host__ __device__ inline function void
+triangle_intersection_common(RayF32 *ray, HitRecord *rec, Vec3F32 edge1, Vec3F32 edge2, 
+                             Entity* triangle);
+__device__ function HitRecord hit_triangle(RayF32 *ray, Entity *triangle, 
+                                           F32 closest_so_far);
+__host__ function void hit_triangle_host(RayF32 *ray, HitRecord *rec, Entity *triangle);

-__device__ function RayF32 ray_get_F32(F32 x, F32 y, Vec3F32 cam_center, curandState *local_rand_state);
 __global__ void cuda_init_state(curandState *rand_state);
+__host__ void cuda_free(void *device_ptr);
--- a/timeBuild.ctm
+++ b/timeBuild.ctm