refactor before doing bvh triangles

2025-04-29 19:57:13 +02:00 · 2025-04-29 19:57:13 +02:00 · 8025e73db4
commit 8025e73db4
parent 3e41274f45
10 changed files with 2728 additions and 589 deletions
--- a/build.bat
+++ b/build.bat
@ -11,9 +11,11 @@ set cuda_root=D:/lib/cudatoolkit/lib/x64

 set CudaSources=../src/main.cu

+set CudaRemoveWarnings=-diag-suppress 177
+
 IF NOT EXIST .\build mkdir .\build
 pushd .\build
-nvcc %CudaSources% -o program.exe
+nvcc %CudaSources% %CudaRemoveWarnings% -o program.exe

 set LastError=%ERRORLEVEL%
 popd
--- a/ray_ws.sublime-workspace
+++ b/ray_ws.sublime-workspace
--- a/run.bat
+++ b/run.bat
@ -0,0 +1,3 @@
+cd build
+program.exe
+cd ..
--- a/src/base_core.h
+++ b/src/base_core.h
@ -0,0 +1,35 @@
+#pragma once 
+
+#include <stdio.h>
+#include <stdint.h>
+#include <float.h>
+#include <math.h>
+
+#include <curand_kernel.h>
+
+//------------------------------------------------------------------------------------------
+//~ base defines
+
+#define host_global static
+#define function static
+
+//~ typedefs 
+typedef int32_t S32;
+typedef uint32_t U32;
+typedef uint64_t U64;
+typedef float F32;
+
+//~ utility defines
+
+#define CUDA_CHECK(err) do { \
+  if (err != cudaSuccess) { \
+    fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \
+        cudaGetErrorString(err), __FILE__, __LINE__); \
+    exit(EXIT_FAILURE); \
+  } \
+} while (0)
+
+#define LOG printf
+
+#define F32_MAX FLT_MAX
+#define F32_MIN FLT_MIN
--- a/src/base_math.c
+++ b/src/base_math.c
@ -0,0 +1,139 @@
+
+__host__ __device__ inline function Vec3F32 
+vec3F32(F32 x, F32 y, F32 z)
+{
+  Vec3F32 out = {0};
+  out.x = x;
+  out.y = y;
+  out.z = z;
+  return out;
+}
+
+
+__host__ __device__ inline function Vec3F32 
+add_V3F32(Vec3F32 a, Vec3F32 b)
+{
+  Vec3F32 out = {0};
+  out.x = a.x + b.x;
+  out.y = a.y + b.y;
+  out.z = a.z + b.z;
+  return out;
+}
+
+__host__ __device__ inline function Vec3F32 
+sub_V3F32(Vec3F32 a, Vec3F32 b)
+{
+  Vec3F32 out = {0};
+  out.x = a.x-b.x;
+  out.y = a.y-b.y;
+  out.z = a.z-b.z;
+  return out;
+}
+
+__host__ __device__ inline function Vec3F32 
+scale_V3F32(F32 s, Vec3F32 v)
+{
+  Vec3F32 out = {0};
+  out.x = s*v.x;
+  out.y = s*v.y;
+  out.z = s*v.z;
+  return out;
+}
+
+__host__ __device__ inline function F32 
+dot_V3F32(Vec3F32 a, Vec3F32 b)
+{
+  return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+
+__host__ __device__ inline function Vec3F32 
+ray_point_F32(F32 t, RayF32 ray)
+{
+  Vec3F32 out = add_V3F32(ray.origin, scale_V3F32(t, ray.direction));
+  return out;
+}
+
+__host__ __device__ inline function F32 
+mag_V3F32(Vec3F32 a)
+{
+  return dot_V3F32(a, a);
+}
+
+__host__ function F32
+h_norm_V3F32(Vec3F32 a)
+{
+  F32 mag = mag_V3F32(a);
+  return sqrtf(mag);
+}
+__device__ function F32 
+norm_V3F32(Vec3F32 a)
+{
+   F32 mag = mag_V3F32(a);
+   return __fsqrt_rn(mag);
+}
+
+__host__ __device__ function Vec3F32 
+lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b)
+{
+  Vec3F32 lerp_term1 = scale_V3F32(1.0f-s, a);
+  Vec3F32 lerp_term2 = scale_V3F32(s, b);
+  Vec3F32 lerp_result = add_V3F32(lerp_term1, lerp_term2);
+
+  return lerp_result;
+}
+
+__device__ function Vec3F32 
+rand_uniform_V3F32(curandState *local_rand_state)
+{
+  Vec3F32 out = {0};
+  out.x = curand_uniform(local_rand_state);
+  out.y = curand_uniform(local_rand_state);
+  out.z = curand_uniform(local_rand_state);
+  return out;
+}
+
+__device__ function Vec3F32 
+rand_uniform_range_V3F32(RngF32 rng, curandState *local_rand_state)
+{
+  Vec3F32 out = {0};
+  out.x = rng.min + (rng.max-rng.min) * curand_uniform(local_rand_state);
+  out.y = rng.min + (rng.max-rng.min) * curand_uniform(local_rand_state);
+  out.z = rng.min + (rng.max-rng.min) * curand_uniform(local_rand_state);
+  return out;
+}
+
+
+__host__ function F32 
+linear_to_gamma(F32 val)
+{
+  // We assume that the input value is in linear space, and 
+  // we transform it to approximate srgb space by taking the sqrt
+  F32 out = val;
+  if (val > 0.0f)
+  {
+    out = sqrtf(val);
+  }
+  return out;
+}
+
+
+__device__ function F32
+clamp_F32(RngF32 rng, F32 val)
+{
+  F32 out = fmaxf(rng.min, val);
+  out = fminf(val, rng.max);
+  return out;
+}
+
+__device__ function Vec3F32 
+clamp_V3F32(RngF32 rng, Vec3F32 v)
+{
+  Vec3F32 out = {0};
+  out.x = clamp_F32(rng, v.x);
+  out.y = clamp_F32(rng, v.y);
+  out.z = clamp_F32(rng, v.z);
+  return out;
+}
+
+
--- a/src/base_math.h
+++ b/src/base_math.h
@ -0,0 +1,63 @@
+#pragma once
+
+//------------------------------------------------------------------------------------------
+//~ structs
+
+typedef union Vec3F32 Vec3F32;
+union Vec3F32
+{
+  struct 
+  {
+    F32 x;
+    F32 y;
+    F32 z;
+  };
+  struct 
+  {
+    F32 r;
+    F32 g;
+    F32 b;
+  };
+  F32 v[3];
+};
+
+typedef struct RngF32 RngF32;
+struct RngF32 
+{
+  F32 min;
+  F32 max;
+};
+
+typedef struct RayF32 RayF32;
+struct RayF32
+{
+  Vec3F32 origin;
+  Vec3F32 direction;
+};
+
+
+//------------------------------------------------------------------------------------------
+//~ forward declarations
+
+
+__host__ __device__ inline function Vec3F32 vec3F32(F32 x, F32 y, F32 z);
+__host__ __device__ inline function Vec3F32 add_V3F32(Vec3F32 a, Vec3F32 b);
+__host__ __device__ inline function Vec3F32 sub_V3F32(Vec3F32 a, Vec3F32 b);
+__host__ __device__ inline function Vec3F32 scale_V3F32(F32 s, Vec3F32 v);
+
+
+__host__ __device__ inline function Vec3F32 ray_point_F32(F32 t, RayF32 *ray);
+__host__ __device__ inline function F32 mag_V3F32(Vec3F32 a);
+__host__ __device__ inline function F32 dot_V3F32(Vec3F32 a, Vec3F32 b);
+__device__  inline function F32 norm_V3F32(Vec3F32 a);
+
+__host__ __device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b);
+
+__device__ function Vec3F32 rand_uniform_V3F32(curandState *local_rand_state);
+__device__ function Vec3F32 rand_uniform_range_V3F32(RngF32 rng, curandState *local_rand_state);
+
+__host__ function F32 linear_to_gamma(F32 val);
+__host__ inline function F32 h_norm_V3F32(Vec3F32 a);
+
+__device__ function F32 clamp_F32(RngF32 rng, F32 val);
+__device__ function Vec3F32 clamp_V3F32(RngF32 rng, Vec3F32 v);
--- a/src/main.cu
+++ b/src/main.cu
@ -1,36 +1,6 @@
-#include <stdio.h>
-#include <stdint.h>
-#include <float.h>
-
-#include <curand_kernel.h>
-
-//------------------------------------------------------------------------------------------
-//~ base defines
-
-#define host_global static
-#define function static
-
-//~ typedefs 
-typedef int32_t S32;
-typedef uint32_t U32;
-typedef uint64_t U64;
-typedef float F32;
-
-//~ utility defines
-
-#define CUDA_CHECK(err) do { \
-  if (err != cudaSuccess) { \
-    fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \
-        cudaGetErrorString(err), __FILE__, __LINE__); \
-    exit(EXIT_FAILURE); \
-  } \
-} while (0)
-
-#define LOG printf
-
-#define F32_MAX FLT_MAX
-#define F32_MIN FLT_MIN
-
+#include "base_core.h"
+#include "base_math.h"
+#include "rayt_core.h"

 //------------------------------------------------------------------------------------------
 //~ Program parameter defines
@ -47,520 +17,14 @@ typedef float F32;
 #define SAMPLES_PER_PIXEL 64
 #define MAX_DIFFUSE_DEPTH 8

-//------------------------------------------------------------------------------------------
-//~ structs
-
-typedef union Vec3F32 Vec3F32;
-union Vec3F32
-{
-  struct 
-  {
-    F32 x;
-    F32 y;
-    F32 z;
-  };
-  struct 
-  {
-    F32 r;
-    F32 g;
-    F32 b;
-  };
-  F32 v[3];
-};
-
-typedef struct RngF32 RngF32;
-struct RngF32 
-{
-  F32 min;
-  F32 max;
-};
-
-typedef struct RayF32 RayF32;
-struct RayF32
-{
-  Vec3F32 origin;
-  Vec3F32 direction;
-};
-
-
-typedef struct ViewportF32 ViewportF32;
-
-struct ViewportF32 
-{
-  F32 width;
-  F32 height;
-  F32 aspect_ratio;
-  Vec3F32 u; // along horizontal edge, right from top left corner
-  Vec3F32 v; // along vertical edge, down from top left corner
-  Vec3F32 upper_left;
-  Vec3F32 pixel_origin;
-  Vec3F32 pixel_delta_u;
-  Vec3F32 pixel_delta_v;
-};
-
-typedef struct CameraF32 CameraF32;
-struct CameraF32 
-{
-  Vec3F32 center;
-  Vec3F32 up;
-  F32 focal_length;
-  F32 pixel_sample_scale;
-};
-
-typedef struct ImageF32 ImageF32;
-struct ImageF32 
-{
-  U32 width;
-  U32 height;
-  F32 aspect_ratio;
-  U32 total_num_pixels;
-};
-
-enum EntityKind 
-{
-  EntityKind_Nil,
-  EntityKind_Sphere,
-  Num_EntityKinds
-};
-
-typedef struct HitRecord HitRecord;
-struct HitRecord 
-{
-  Vec3F32 point;
-  Vec3F32 normal;
-  F32 t; // Root parameter for hit sphere
-  F32 hit; // Hit true or false
-  F32 front_face;
-};
-
-typedef struct Entity Entity;
-struct Entity
-{
-  EntityKind kind;
-  Vec3F32 center;
-  F32 radius;
-};
+#include "base_math.c"
+#include "rayt_core.c"

 //------------------------------------------------------------------------------------------
 //~ host globals
-host_global Entity nil_entity = {EntityKind_Nil, {0.0f, 0.0f, 0.0f}, 0.0f};
-//~ device globals
-
-__constant__ CameraF32 camera;
-__constant__ ViewportF32 viewport;
-__constant__ ImageF32 image;
-
-//------------------------------------------------------------------------------------------
-//~ routines
-
-
-__host__ __device__ function Vec3F32 vec3F32(F32 x, F32 y, F32 z)
-{
-  Vec3F32 out = {0};
-  out.x = x;
-  out.y = y;
-  out.z = z;
-  return out;
-}
-
-
-__host__ __device__ function Vec3F32 add_V3F32(Vec3F32 a, Vec3F32 b)
-{
-  Vec3F32 out = {0};
-  out.x = a.x + b.x;
-  out.y = a.y + b.y;
-  out.z = a.z + b.z;
-  return out;
-}
-
-__host__ __device__ function Vec3F32 sub_V3F32(Vec3F32 a, Vec3F32 b)
-{
-  Vec3F32 out = {0};
-  out.x = a.x-b.x;
-  out.y = a.y-b.y;
-  out.z = a.z-b.z;
-  return out;
-}
-
-
-
-__host__ __device__ function Vec3F32 scale_V3F32(F32 s, Vec3F32 v)
-{
-  Vec3F32 out = {0};
-  out.x = s*v.x;
-  out.y = s*v.y;
-  out.z = s*v.z;
-  return out;
-}
-
-__device__ function F32 dot_V3F32(Vec3F32 a, Vec3F32 b)
-{
-  return a.x*b.x + a.y*b.y + a.z*b.z;
-}
-
-
-__device__ function Vec3F32 ray_point_F32(F32 t, RayF32 ray)
-{
-  Vec3F32 out = add_V3F32(ray.origin, scale_V3F32(t, ray.direction));
-  return out;
-}
-
-__device__ function F32 mag_V3F32(Vec3F32 a)
-{
-  return dot_V3F32(a, a);
-}
-
-__device__ function F32 norm_V3F32(Vec3F32 a)
-{
-   F32 mag = mag_V3F32(a);
-   return __fsqrt_rn(mag);
-}
-
-__device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b)
-{
-  Vec3F32 lerp_term1 = scale_V3F32(1.0f-s, a);
-  Vec3F32 lerp_term2 = scale_V3F32(s, b);
-  Vec3F32 lerp_result = add_V3F32(lerp_term1, lerp_term2);
-
-  return lerp_result;
-}
-
-__device__ function F32 surrounds_RngF32(RngF32 rng, F32 val)
-{
-  F32 out = (rng.min < val) && (val < rng.max); 
-  return out;
-}
-
-//
-//__device__ function F32 contains_RngF32(RngF32 rng, F32 val)
-//{
-//  F32 out = (rng.min <= val) && (val <= rng.max);
-//  return out;
-//}
-//
-//__device__ function F32 size_RngF32(RngF32 rng)
-//{
-//  return rng.max-rng.min;
-//}
-//
-
-__device__ function Vec3F32 
-rand_uniform_V3F32(curandState *local_rand_state)
-{
-  Vec3F32 out = {0};
-  out.x = curand_uniform(local_rand_state);
-  out.y = curand_uniform(local_rand_state);
-  out.z = curand_uniform(local_rand_state);
-  return out;
-}
-
-__device__ function Vec3F32 
-rand_uniform_rng_V3F32(RngF32 rng, curandState *local_rand_state)
-{
-  Vec3F32 out = {0};
-  out.x = rng.min + (rng.max-rng.min) * curand_uniform(local_rand_state);
-  out.y = rng.min + (rng.max-rng.min) * curand_uniform(local_rand_state);
-  out.z = rng.min + (rng.max-rng.min) * curand_uniform(local_rand_state);
-  return out;
-}
-
-__device__ function Vec3F32
-rand_unit_vector_on_sphere_F32(curandState *local_rand_state)
-{
-  Vec3F32 out = {0};
-  RngF32 range = {-1.0f, 1.0f}; // Cube bounding the unit sphere
-  F32 inner_bound = 1e-8f; // Don't want too small vectors
-  for(U32 i = 0; i < MAX_RANDOM_UNIT_VECTOR_ITERATIONS; i += 1)
-  {
-     out = rand_uniform_rng_V3F32(range, local_rand_state);
-     F32 normsqrd = dot_V3F32(out, out);
-     if(inner_bound < normsqrd && normsqrd <= 1.0f)
-     {
-       F32 norm = __fsqrt_rn(normsqrd);
-       out = scale_V3F32(1.0f/norm, out);
-       break;
-     }
-  }
-
-  return out;
-}
-
-__device__ function Vec3F32
-rand_unit_vector_on_hemisphere_F32(curandState *local_rand_state, Vec3F32 normal)
-{
-  
-  Vec3F32 out = {0};
-  Vec3F32 vec_on_unit_sphere = rand_unit_vector_on_sphere_F32(local_rand_state);
-  if(dot_V3F32(vec_on_unit_sphere, normal) > 0.0f)
-  {
-    // same hemisphere
-    out = vec_on_unit_sphere;
-  }
-  else 
-  {
-    out = scale_V3F32(-1.0f, vec_on_unit_sphere);
-  }
-
-  return out;
-}
-
-__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, 
-                                           U32 image_width, 
-                                           U32 image_height)
-{
-
-  const char *filename = "output.ppm";
-  FILE *file = fopen(filename, "w");
-  if(!file) 
-  {
-    LOG("Error opening file %s \n", filename);
-  }
-
-  // Write PPM header. First it has "P3" by itself to indicate ASCII colors,
-  fprintf(file, "P3\n");
-  // The row below will say the dimensions of the image:
-  // (width, height) <-> (num columns, num rows)
-  fprintf(file, "%i %i\n", image_width, image_height);
-  // Then we have a value for the maximum pixel color
-  fprintf(file, "255\n");
-  // Then we have all the lines with pixel data, 
-  // it will be three values for each column j on a row i,
-  // corresponding to a pixel with index (i,j).
-  for(U32 i = 0; i < image_height; i += 1)
-  {
-    for(U32 j = 0; j < image_width; j +=1)
-    {
-      // We represent RGB values by floats internally and scale to integer values
-      U32 idx = i * image_width + j;
-
-      F32 r = buffer[idx].r;
-      F32 g = buffer[idx].g;
-      F32 b = buffer[idx].b;
-
-      U32 ir = int(255.999f * r);
-      U32 ig = int(255.999f * g);
-      U32 ib = int(255.999f * b);
-
-      fprintf(file, "%i %i %i ", ir, ig, ib);
-    }
-    fprintf(file, "\n");
-  }
-
-  fclose(file);
-}
-
-__device__ function F32
-clamp_F32(RngF32 rng, F32 val)
-{
-  F32 out = fmaxf(rng.min, val);
-  out = fminf(val, rng.max);
-  return out;
-}
-
-__device__ function Vec3F32 
-clamp_V3F32(RngF32 rng, Vec3F32 v)
-{
-  Vec3F32 out = {0};
-  out.x = clamp_F32(rng, v.x);
-  out.y = clamp_F32(rng, v.y);
-  out.z = clamp_F32(rng, v.z);
-  return out;
-}
-
-__device__ function HitRecord 
-hit_sphere(Vec3F32 center, F32 radius, RayF32 ray, RngF32 range)
-{
-  HitRecord out = {0};
-  // We take the quadratic formula -b/2a +- sqrt(b*b-4ac) / 2a,
-  // and we calculate only the sqrt part. If there is a hit with the sphere we either
-  // have two solutions (positive sqrt), one solution (zero sqrt) 
-  // or no solution (negative sqrt).
-  // If we have no solution we have no hit on
-  // the sphere centered at center, with the given radius.
-
-  // Note that we can simplify this, since we always get b = -2(D . (C-Q)), and if
-  // we say b = -2h in the quadradic formula, we get
-  // -(-2h)/2a +- sqrt((-2h)**2 - 4ac) / 2a which expands to
-  // 2h/2a +- 2sqrt(h*h - ac)/2a, simplifying to (h +- sqrt(h*h - ac))/a.
-  // So we use this simplification to optimise away some operations
-
-  // Compare lines with RTIOW
-  // (C-Q)
-  Vec3F32 oc = sub_V3F32(center, ray.origin);
-  // a = D.D
-  F32 a = dot_V3F32(ray.direction, ray.direction);
-  // h = D . (C-Q)
-  F32 h = dot_V3F32(ray.direction, oc);
-  // c = (C-Q) . (C-Q) - r*r
-  F32 c = dot_V3F32(oc, oc) - radius*radius;
-
-  F32 discriminant = h*h - a*c;
-
-  // We are actually solving for the parameter t in the expression of a point P(t) that 
-  // intersects the sphere. This is the quadratic problem we get by solving for t in 
-  // (C - P(t)) . (C - P(t)) = r*r, r being the radius and P(t) = tD+Q,
-  // where D is the direction of the ray and Q the origin of the ray.
-  F32 hit_true = 0.0f;
-
-  // Branching version
-  // TODO(anton): Maybe try to make a branchless version
-  F32 root = 0.0f;
-  if(discriminant < 0.0f)
-  {
-    hit_true = 0.0f;
-  } 
-  else
-  {
-    // t = (h += sqrt(h*h-ac))/a, and here we take the smallest solution to get the point 
-    // on the sphere closest to the ray origin.
-    F32 sqrtd = __fsqrt_rn(discriminant);
-    root = (h - sqrtd)/a;
-    if(!surrounds_RngF32(range, root))
-    {
-      root = (h + sqrtd)/a; 
-      if(!surrounds_RngF32(range, root))
-      {
-        hit_true = 0.0f;
-      } 
-      else 
-      {
-        hit_true = 1.0f;
-      }
-    } 
-    else
-    {
-      hit_true = 1.0f;
-    }
-  }
-
-  out.hit = hit_true;
-  out.t = root;
-
-  // t is the parameter of the (closest) sphere-ray intersection point P(t) = tD+Q,
-  // where Q is the ray origin and D the ray direction.
-  out.point = ray_point_F32(out.t, ray); // intersection point
-  Vec3F32 N = sub_V3F32(out.point, center);
-  N = scale_V3F32(1.0f/radius, N);
-
-  F32 front_face = dot_V3F32(ray.direction, N) < 0.0f;
-  out.normal = front_face ? N : scale_V3F32(-1.0f, N);
-  out.front_face = front_face;
-
-  return out;
-}
-
-__device__ function RayF32 
-ray_get_F32(F32 x, F32 y, Vec3F32 cam_center, curandState *local_rand_state)
-{
-  
-    RayF32 out = {0};
-    
-    // We have unit vectors delta_u and delta_v in the horizontal and vertical viewport directions.
-    Vec3F32 px_u = scale_V3F32(x, viewport.pixel_delta_u);
-    Vec3F32 px_v = scale_V3F32(y, viewport.pixel_delta_v);
-    Vec3F32 pixel_center = add_V3F32(viewport.pixel_origin, add_V3F32(px_u, px_v));
-
-    // To get anti-aliasing we make a random offset from the pixel center
-    F32 rand_u = curand_uniform(local_rand_state) - 0.5f;
-    F32 rand_v = curand_uniform(local_rand_state) - 0.5f;
-    // the rand u and rand v are offsets from a pixel in the [-0.5, 0.5] square.
-    // We need to put that into the world space of our viewport
-    Vec3F32 offset_u = scale_V3F32(rand_u, viewport.pixel_delta_u);
-    Vec3F32 offset_v = scale_V3F32(rand_v, viewport.pixel_delta_v);
-
-    // Then we shift the pixel center with the offsets in both directions
-    Vec3F32 pixel_sample = add_V3F32(pixel_center, add_V3F32(offset_u, offset_v));
-    // With a randomised point around the pixel center we can define the ray direction
-    // as the vector from the camera center to the point on the viewport.
-    Vec3F32 ray_direction = sub_V3F32(pixel_sample, camera.center);
-
-    out.origin = camera.center;
-    out.direction = ray_direction;
-    return out;
-}
-
-// Trace a ray and get a pixel color sample
-__device__ function Vec3F32 
-get_sample_color(RayF32 ray, Entity *entities, curandState *local_rand_state)
-{
-
-  RayF32 current_ray = ray;
-  Vec3F32 out = {0};
-
-  F32 current_attenuation = 1.0f;
-  F32 attenuation_factor = 0.5f;
-  Vec3F32 sample_pixel_color = vec3F32(0.0f, 0.0f, 0.0f);
-  for(U32 bounce_idx = 0; 
-      bounce_idx < MAX_DIFFUSE_DEPTH; 
-      bounce_idx += 1)
-  {
-
-    RngF32 hit_range = {0.001f, F32_MAX};
-    HitRecord hit_rec = {0};
-    for(U32 entity_idx = 0; entity_idx < MAX_NUM_ENTITIES; entity_idx += 1)
-    {
-      Entity *entity = &entities[entity_idx];
-      switch(entity->kind)
-      {
-        case EntityKind_Nil:
-          {
-            // no op
-          } break;
-
-        case EntityKind_Sphere:
-          {
-            HitRecord temp_hit_rec = hit_sphere(entity->center, entity->radius, 
-                current_ray, hit_range);
-            if(temp_hit_rec.hit) 
-            {
-              hit_rec = temp_hit_rec;
-              hit_range.max = hit_rec.t;
-            }
-
-          } break;
-      } // end switch entity kind
-
-    } 
-
-    if(hit_rec.hit)
-    {
-      // "Paint entity"
-      // For a diffuse color we actually just update the attenuation here and 
-      // bounce rays around... Then when we are not hitting anything anymore we will sample
-      // the background gradient and use the computed attenuation. Since the rays are
-      // bouncing diffusely this will shade nicely.
-      Vec3F32 rand_dir = rand_unit_vector_on_hemisphere_F32(local_rand_state, hit_rec.normal);
-      current_attenuation = current_attenuation * attenuation_factor;
-
-      current_ray.origin = hit_rec.point;
-      current_ray.direction = rand_dir;
-      //sample_pixel_color = add_V3F32(hit_rec.normal, vec3F32(1.0f, 1.0f, 1.0f));
-      //sample_pixel_color = scale_V3F32(0.5f, sample_pixel_color);
-      // debug
-      //sample_pixel_color = vec3F32(1.0f, 0.0f, 0.0f);
-    }
-    else 
-    {
-      // Paint background gradient
-      F32 norm = norm_V3F32(ray.direction);
-      Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction);
-      Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f);
-      Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f);
-
-      // Lerp between white and light blue depending on y position
-      F32 blend = 0.5f*(unit_dir.y + 1.0f);
-
-      sample_pixel_color = lerp_V3F32(blend, white, light_blue);
-      // Scale by the current attenuation for diffuse shading using background color
-      sample_pixel_color = scale_V3F32(current_attenuation, sample_pixel_color);
-      break;
-    }
-
-  }
-
-  out = sample_pixel_color;
-  return out;
-}
+host_global CameraF32 h_camera;
+host_global ViewportF32 h_viewport;
+host_global ImageF32 h_image;

 __global__ void 
 cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state) 
@ -582,44 +46,12 @@ cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state)
    // we initialise the color for this pixel to black.
    // Loop over all pixel samples
    Vec3F32 pixel_color = vec3F32(0.0f, 0.0f, 0.0f);
-    for(U32 sample_idx = 0; sample_idx < SAMPLES_PER_PIXEL; sample_idx += 1)
-    {
   
-      // TODO(anton): Maybe we can randomise things directly here as the
-      // nvidia accelerated version, where we just put the x, y indices with a 
-      // randomised shift and normalise to viewport space by dividing by max x, max y
-      RayF32 ray = ray_get_F32((F32)x, (F32)y, camera.center, &local_rand_state);
-
-      Vec3F32 sample_pixel_color = get_sample_color(ray, entities, &local_rand_state);
-
-      F32 debug_sample = curand_uniform(&rand_state[idx]);
-      Vec3F32 debug = vec3F32(debug_sample, debug_sample, debug_sample);
-      //pixel_color = add_V3F32(pixel_color, debug);
-      pixel_color = add_V3F32(pixel_color, sample_pixel_color);
-    }
-
-    pixel_color = scale_V3F32(1.0f/(F32)SAMPLES_PER_PIXEL, pixel_color);
-    RngF32 clamp_range = {0.0f, 1.0f};
-    //pixel_color = clamp_V3F32(clamp_range, pixel_color);
    pixelbuffer[idx] = pixel_color;
  }

 }

-__global__ void cuda_init_state(curandState *rand_state) 
-{
-
-  U32 x = threadIdx.x + blockIdx.x * blockDim.x;
-  U32 y = threadIdx.y + blockIdx.y * blockDim.y;
-
-  if(x < image.width && y < image.height)
-  {
-    U32 idx = y * image.width + x;
-    curand_init(CURAND_SEED, idx, 0, &rand_state[idx]);
-  }
-
-}
-
 //------------------------------------------------------------------------------------------
 //~ Main
 int main() 
@ -630,7 +62,7 @@ int main()
  // Define image, camera and viewport on the CPU 
  // and then copy to constant globals on device
  // -------------
-  ImageF32 h_image = {0};
+  h_image = {0};
  h_image.width = IMAGE_WIDTH;
  h_image.aspect_ratio = ASPECT_RATIO;
  U32 height = U32((F32)h_image.width/h_image.aspect_ratio) + 1;
@ -642,7 +74,7 @@ int main()
      h_image.width, h_image.height, h_image.aspect_ratio);

  // -------------
-  CameraF32 h_camera = {0};
+  h_camera = {0};
  h_camera.focal_length = 1.0f;
  F32 samples_per_pixel = (F32)SAMPLES_PER_PIXEL;
  h_camera.pixel_sample_scale = 1.0f/samples_per_pixel;
@ -652,7 +84,7 @@ int main()
  CUDA_CHECK(cuErr);

  // -------------
-  ViewportF32 h_viewport = {0};
+  h_viewport = {0};
  h_viewport.height = 2.0f;
  h_viewport.width = h_viewport.height * ((F32)h_image.width/(F32)h_image.height);
  h_viewport.aspect_ratio = h_viewport.width/h_viewport.height;
@ -685,14 +117,13 @@ int main()

  //////////////////////////////////////////////////////////////////////////////////////////
  // Setup entities and copy to device
-  U64 entity_list_size = sizeof(Entity)*MAX_NUM_ENTITIES;
-  Entity *h_entities = (Entity *)malloc(entity_list_size);
+  U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES;
+  Entity *h_entities = (Entity *)malloc(entity_list_byte_size);
+  memset(h_entities, 0, entity_list_byte_size);
  for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
  {
    // Init all entities to nil
-    //h_entities[i] = {0};
-    //h_entities[i].kind = EntityKind_Nil;
-    h_entities[i] = nil_entity;
+    h_entities[i].kind = EntityKind_Nil;
  }

  // Manual spheres
@ -708,9 +139,9 @@ int main()

  // Copy to device
  Entity *entities = 0;
-  cuErr = cudaMalloc(&entities, entity_list_size);
+  cuErr = cudaMalloc(&entities, entity_list_byte_size);
  CUDA_CHECK(cuErr);
-  cuErr = cudaMemcpy(entities, h_entities, entity_list_size, cudaMemcpyHostToDevice);
+  cuErr = cudaMemcpy(entities, h_entities, entity_list_byte_size, cudaMemcpyHostToDevice);
  CUDA_CHECK(cuErr);


--- a/src/rayt_core.c
+++ b/src/rayt_core.c
@ -0,0 +1,150 @@
+//~ device globals
+__constant__ CameraF32 camera;
+__constant__ ViewportF32 viewport;
+__constant__ ImageF32 image;
+
+
+__host__ function void 
+write_buffer_to_ppm(Vec3F32 *buffer, 
+                    U32 image_width, 
+                    U32 image_height)
+{
+
+  const char *filename = "output.ppm";
+  FILE *file = fopen(filename, "w");
+  if(!file) 
+  {
+    LOG("Error opening file %s \n", filename);
+  }
+
+  // Write PPM header. First it has "P3" by itself to indicate ASCII colors,
+  fprintf(file, "P3\n");
+  // The row below will say the dimensions of the image:
+  // (width, height) <-> (num columns, num rows)
+  fprintf(file, "%i %i\n", image_width, image_height);
+  // Then we have a value for the maximum pixel color
+  fprintf(file, "255\n");
+  // Then we have all the lines with pixel data, 
+  // it will be three values for each column j on a row i,
+  // corresponding to a pixel with index (i,j).
+  for(U32 i = 0; i < image_height; i += 1)
+  {
+    for(U32 j = 0; j < image_width; j +=1)
+    {
+      // We represent RGB values by floats internally and scale to integer values
+      U32 idx = i * image_width + j;
+
+      F32 r = buffer[idx].r;
+      F32 g = buffer[idx].g;
+      F32 b = buffer[idx].b;
+      
+      r = linear_to_gamma(r);
+      g = linear_to_gamma(g);
+      b = linear_to_gamma(b);
+
+      U32 ir = int(255.999f * r);
+      U32 ig = int(255.999f * g);
+      U32 ib = int(255.999f * b);
+
+      fprintf(file, "%i %i %i ", ir, ig, ib);
+    }
+    fprintf(file, "\n");
+
+    
+  }
+
+  fclose(file);
+}
+
+__device__ function RayF32 
+ray_get_F32(F32 x, F32 y, Vec3F32 cam_center, curandState *local_rand_state)
+{
+  
+    RayF32 out = {0};
+    
+    // We have unit vectors delta_u and delta_v in the horizontal and vertical viewport directions.
+    Vec3F32 px_u = scale_V3F32(x, viewport.pixel_delta_u);
+    Vec3F32 px_v = scale_V3F32(y, viewport.pixel_delta_v);
+    Vec3F32 pixel_center = add_V3F32(viewport.pixel_origin, add_V3F32(px_u, px_v));
+
+    // To get anti-aliasing we make a random offset from the pixel center
+    F32 rand_u = curand_uniform(local_rand_state) - 0.5f;
+    F32 rand_v = curand_uniform(local_rand_state) - 0.5f;
+    // the rand u and rand v are offsets from a pixel in the [-0.5, 0.5] square.
+    // We need to put that into the world space of our viewport
+    Vec3F32 offset_u = scale_V3F32(rand_u, viewport.pixel_delta_u);
+    Vec3F32 offset_v = scale_V3F32(rand_v, viewport.pixel_delta_v);
+
+    // Then we shift the pixel center with the offsets in both directions
+    Vec3F32 pixel_sample = add_V3F32(pixel_center, add_V3F32(offset_u, offset_v));
+    // With a randomised point around the pixel center we can define the ray direction
+    // as the vector from the camera center to the point on the viewport.
+    Vec3F32 ray_direction = sub_V3F32(pixel_sample, camera.center);
+
+    out.origin = camera.center;
+    out.direction = ray_direction;
+    return out;
+}
+
+// Trace a ray and get a pixel color sample
+__device__ function Vec3F32 
+get_sample_color(RayF32 ray, Entity *entities, curandState *local_rand_state)
+{
+
+  RayF32 current_ray = ray;
+  Vec3F32 out = {0};
+
+  F32 current_attenuation = 1.0f;
+  F32 attenuation_factor = 0.5f;
+  Vec3F32 sample_pixel_color = vec3F32(0.0f, 0.0f, 0.0f);
+  for(U32 bounce_idx = 0; 
+      bounce_idx < MAX_DIFFUSE_DEPTH; 
+      bounce_idx += 1)
+  {
+
+    RngF32 hit_range = {0.001f, F32_MAX};
+    HitRecord hit_rec = {0};
+     
+
+    if(hit_rec.hit)
+    {
+      
+    }
+    else 
+    {
+      // Paint background gradient
+      F32 norm = norm_V3F32(ray.direction);
+      Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction);
+      Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f);
+      Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f);
+
+      // Lerp between white and light blue depending on y position
+      F32 blend = 0.5f*(unit_dir.y + 1.0f);
+
+      sample_pixel_color = lerp_V3F32(blend, white, light_blue);
+      // Scale by the current attenuation for diffuse shading using background color
+      sample_pixel_color = scale_V3F32(current_attenuation, sample_pixel_color);
+      break;
+    }
+
+  }
+
+  out = sample_pixel_color;
+  return out;
+}
+
+
+__global__ void 
+cuda_init_state(curandState *rand_state) 
+{
+
+  U32 x = threadIdx.x + blockIdx.x * blockDim.x;
+  U32 y = threadIdx.y + blockIdx.y * blockDim.y;
+
+  if(x < image.width && y < image.height)
+  {
+    U32 idx = y * image.width + x;
+    curand_init(CURAND_SEED, idx, 0, &rand_state[idx]);
+  }
+
+}
--- a/src/rayt_core.h
+++ b/src/rayt_core.h
@ -0,0 +1,67 @@
+#pragma once
+
+
+
+typedef struct ViewportF32 ViewportF32;
+struct ViewportF32 
+{
+  F32 width;
+  F32 height;
+  F32 aspect_ratio;
+  Vec3F32 u; // along horizontal edge, right from top left corner
+  Vec3F32 v; // along vertical edge, down from top left corner
+  Vec3F32 upper_left;
+  Vec3F32 pixel_origin;
+  Vec3F32 pixel_delta_u;
+  Vec3F32 pixel_delta_v;
+};
+
+typedef struct CameraF32 CameraF32;
+struct CameraF32 
+{
+  Vec3F32 center;
+  Vec3F32 up;
+  F32 focal_length;
+  F32 pixel_sample_scale;
+};
+
+typedef struct ImageF32 ImageF32;
+struct ImageF32 
+{
+  U32 width;
+  U32 height;
+  F32 aspect_ratio;
+  U32 total_num_pixels;
+};
+
+enum EntityKind 
+{
+  EntityKind_Nil,
+  EntityKind_Sphere,
+  Num_EntityKinds
+};
+
+typedef struct HitRecord HitRecord;
+struct HitRecord 
+{
+  Vec3F32 point;
+  Vec3F32 normal;
+  F32 t; // Root parameter for hit sphere
+  F32 hit; // Hit true or false
+  F32 front_face;
+};
+
+typedef struct Entity Entity;
+struct Entity
+{
+  EntityKind kind;
+  Vec3F32 center;
+  F32 radius;
+};
+
+
+
+__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32 image_height);
+
+__device__ function RayF32 ray_get_F32(F32 x, F32 y, Vec3F32 cam_center, curandState *local_rand_state);
+__global__ void cuda_init_state(curandState *rand_state);
--- a/timeBuild.ctm
+++ b/timeBuild.ctm