diff --git a/src/main.cu b/src/main.cu
index 3de8a4a..88c8c0b 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -1,12 +1,13 @@
 #include <stdio.h>
 #include <stdint.h>
+#include <float.h>
 
 #include <curand_kernel.h>
 
 //------------------------------------------------------------------------------------------
 //~ base defines
 
-#define global static
+#define host_global static
 #define function static
 
 //~ typedefs 
@@ -27,6 +28,8 @@ typedef float F32;
 
 #define LOG printf
 
+#define F32_MAX FLT_MAX
+#define F32_MIN FLT_MIN
 
 //~ test defines
 #define NUM_BLOCKS   1
@@ -38,6 +41,9 @@ typedef float F32;
 
 #define CURAND_SEED 1984
 
+
+#define MAX_NUM_ENTITIES 64
+
 //------------------------------------------------------------------------------------------
 //~ structs
 
@@ -59,6 +65,12 @@ union Vec3F32
   F32 v[3];
 };
 
+typedef struct RngF32 RngF32;
+struct RngF32 
+{
+  F32 min;
+  F32 max;
+};
 
 typedef struct RayF32 RayF32;
 struct RayF32
@@ -100,9 +112,34 @@ struct ImageF32
   U32 total_num_pixels;
 };
 
+enum EntityKind 
+{
+  EntityKind_Nil,
+  EntityKind_Sphere,
+  Num_EntityKinds
+};
+
+typedef struct HitRecord HitRecord;
+struct HitRecord 
+{
+  Vec3F32 point;
+  Vec3F32 normal;
+  F32 t; // Root parameter for hit sphere
+  F32 hit; // Hit true or false
+  F32 front_face;
+};
+
+typedef struct Entity Entity;
+struct Entity
+{
+  EntityKind kind;
+  Vec3F32 center;
+  F32 radius;
+};
+
 //------------------------------------------------------------------------------------------
 //~ host globals
-
+host_global Entity nil_entity = {EntityKind_Nil, {0.0f, 0.0f, 0.0f}, 0.0f};
 //~ device globals
 
 __constant__ CameraF32 camera;
@@ -184,6 +221,22 @@ __device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b)
   return lerp_result;
 }
 
+__device__ function F32 surrounds_RngF32(RngF32 rng, F32 val)
+{
+  F32 out = (rng.min < val) && (val < rng.max); 
+  return out;
+}
+
+__device__ function F32 contains_RngF32(RngF32 rng, F32 val)
+{
+  F32 out = (rng.min <= val) && (val <= rng.max);
+  return out;
+}
+
+__device__ function F32 size_RngF32(RngF32 rng)
+{
+  return rng.max-rng.min;
+}
 
 __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, 
                                            U32 image_width, 
@@ -233,8 +286,11 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer,
   fclose(file);
 }
 
-__device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r)
+
+__device__ function HitRecord hit_sphere(Vec3F32 center, F32 radius, 
+    RayF32 ray, RngF32 range)
 {
+  HitRecord out = {0};
   // We take the quadratic formula -b/2a +- sqrt(b*b-4ac) / 2a,
   // and we calculate only the sqrt part. If there is a hit with the sphere we either
   // have two solutions (positive sqrt), one solution (zero sqrt) 
@@ -250,11 +306,11 @@ __device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r)
 
   // Compare lines with RTIOW
   // (C-Q)
-  Vec3F32 oc = sub_V3F32(center, r.origin);
+  Vec3F32 oc = sub_V3F32(center, ray.origin);
   // a = D.D
-  F32 a = dot_V3F32(r.direction, r.direction);
+  F32 a = dot_V3F32(ray.direction, ray.direction);
   // h = D . (C-Q)
-  F32 h = dot_V3F32(r.direction, oc);
+  F32 h = dot_V3F32(ray.direction, oc);
   // c = (C-Q) . (C-Q) - r*r
   F32 c = dot_V3F32(oc, oc) - radius*radius;
 
@@ -264,24 +320,58 @@ __device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r)
   // intersects the sphere. This is the quadratic problem we get by solving for t in 
   // (C - P(t)) . (C - P(t)) = r*r, r being the radius and P(t) = tD+Q,
   // where D is the direction of the ray and Q the origin of the ray.
-  F32 out = 0.0f;
+  F32 hit_true = 0.0f;
+
+  // Branching version
+  // TODO(anton): Maybe try to make a branchless version
+  F32 root = 0.0f;
   if(discriminant < 0.0f)
   {
-    out = -1.0f;
-  }
+    hit_true = 0.0f;
+  } 
   else
   {
     // t = (h += sqrt(h*h-ac))/a, and here we take the smallest solution to get the point 
     // on the sphere closest to the ray origin.
-    out = (h - __fsqrt_rn(discriminant))/a; 
+    F32 sqrtd = __fsqrt_rn(discriminant);
+    root = (h - sqrtd)/a;
+    if(!surrounds_RngF32(range, root))
+    {
+      root = (h + sqrtd)/a; 
+      if(!surrounds_RngF32(range, root))
+      {
+        hit_true = 0.0f;
+      } 
+      else 
+      {
+        hit_true = 1.0f;
+      }
+    } 
+    else
+    {
+      hit_true = 1.0f;
+    }
   }
 
+  out.hit = hit_true;
+  out.t = root;
+
+  // t is the parameter of the (closest) sphere-ray intersection point P(t) = tD+Q,
+  // where Q is the ray origin and D the ray direction.
+  out.point = ray_point_F32(out.t, ray); // intersection point
+  Vec3F32 N = sub_V3F32(out.point, center);
+  N = scale_V3F32(1.0f/radius, N);
+
+  F32 front_face = dot_V3F32(ray.direction, N) < 0.0f;
+  out.normal = front_face ? N : scale_V3F32(-1.0f, N);
+  out.front_face = front_face;
+
   return out;
 }
 
 
 
-__global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer) 
+__global__ void cuda_main(Entity *entities, Vec3F32 *pixelbuffer, U32 *idxbuffer) 
 {
 
   U32 x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -293,37 +383,59 @@ __global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer)
     Vec3F32 px_u = scale_V3F32((F32)x, viewport.pixel_delta_u);
     Vec3F32 px_v = scale_V3F32((F32)y, viewport.pixel_delta_v);
     Vec3F32 pixel_center = add_V3F32(viewport.pixel_origin, add_V3F32(px_u, px_v));
-    
+
     // TODO(anton): Maybe we dont need some ray structure here..
     Vec3F32 ray_direction = sub_V3F32(pixel_center, camera.center);
-    RayF32 r = {0};
-    r.origin = camera.center;
-    r.direction = ray_direction;
+    RayF32 ray = {0};
+    ray.origin = camera.center;
+    ray.direction = ray_direction;
 
-    F32 norm = norm_V3F32(r.direction);
-    Vec3F32 unit_dir = scale_V3F32(1.0f/norm, r.direction);
-    Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f);
-    Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f);
 
-    // Lerp between white and light blue depending on y position
-    F32 blend = 0.5f*(unit_dir.y + 1.0f);
-    Vec3F32 pixel_color = {0};
-
-    Vec3F32 sphere_center = vec3F32(0.0f, 0.0f, -1.0f);
-    F32 sphere_radius = 0.5f;
-
-    // t is the parameter of the (closest) sphere-ray intersection point P(t) = tD+Q,
-    // where Q is the ray origin and D the ray direction.
-    F32 t = hit_sphere(sphere_center, sphere_radius, r);
-    if(t > 0.0f)
+    RngF32 hit_range = {F32_MIN, F32_MAX};
+    HitRecord hit_rec = {0};
+    for(U32 entity_idx = 0; entity_idx < MAX_NUM_ENTITIES; entity_idx += 1)
     {
-      Vec3F32 intersection_point = ray_point_F32(t, r);
-      Vec3F32 N = sub_V3F32(intersection_point, sphere_center);
-      N = scale_V3F32(1.0f/sphere_radius, N);
-      pixel_color = scale_V3F32(0.5f, add_V3F32(N, vec3F32(1.0f, 1.0f, 1.0f)));
+      Entity *entity = &entities[entity_idx];
+      switch(entity->kind)
+      {
+        case EntityKind_Nil:
+        {
+          // no op
+        } break;
+
+        case EntityKind_Sphere:
+        {
+          HitRecord temp_hit_rec = hit_sphere(entity->center, entity->radius, 
+                                              ray, hit_range);
+          if(temp_hit_rec.hit) 
+          {
+            hit_rec = temp_hit_rec;
+            hit_range.max = hit_rec.t;
+          }
+
+        } break;
+      } // end switch entity kind
+
     } 
-    else
+
+    Vec3F32 pixel_color = {0.0f, 0.0f, 0.0f};
+    if(hit_rec.hit)
     {
+      // Paint entity
+      pixel_color = add_V3F32(hit_rec.normal, vec3F32(1.0f, 1.0f, 1.0f));
+      pixel_color = scale_V3F32(0.5f, pixel_color);
+    }
+    else 
+    {
+      // Paint background gradient
+      F32 norm = norm_V3F32(ray.direction);
+      Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction);
+      Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f);
+      Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f);
+
+      // Lerp between white and light blue depending on y position
+      F32 blend = 0.5f*(unit_dir.y + 1.0f);
+
       pixel_color = lerp_V3F32(blend, white, light_blue);
     }
 
@@ -338,7 +450,7 @@ __global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer)
 
 }
 
-__global__ function void cuda_init_state(curandState *rand_state) 
+__global__ void cuda_init_state(curandState *rand_state) 
 {
 
   U32 x = threadIdx.x + blockIdx.x * blockDim.x;
@@ -349,7 +461,7 @@ __global__ function void cuda_init_state(curandState *rand_state)
     U32 idx = y * image.width + x;
     curand_init(CURAND_SEED, idx, 0, &rand_state[idx]);
   }
-  
+
 }
 
 //------------------------------------------------------------------------------------------
@@ -377,9 +489,9 @@ int main()
   CameraF32 h_camera = {0};
   h_camera.focal_length = 1.0f;
   cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0, 
-                             cudaMemcpyHostToDevice);
+      cudaMemcpyHostToDevice);
   CUDA_CHECK(cuErr);
-  
+
   // -------------
   ViewportF32 h_viewport = {0};
   h_viewport.height = 2.0f;
@@ -387,30 +499,62 @@ int main()
   h_viewport.aspect_ratio = h_viewport.width/h_viewport.height;
   h_viewport.u = vec3F32(h_viewport.width, 0.0f, 0.0f);
   h_viewport.v = vec3F32(0.0f, -h_viewport.height, 0.0f);
-  
+
   F32 width_inverse = 1.0f/(F32)h_image.width;
   F32 height_inverse = 1.0f/(F32)h_image.height;
   h_viewport.pixel_delta_u = scale_V3F32(width_inverse, h_viewport.u);
   h_viewport.pixel_delta_v = scale_V3F32(height_inverse, h_viewport.v);
-  
+
   // upper_left = camera - vec3(0,0,focal_length) - viewport_u/2 - viewport_v/2
   Vec3F32 viewport_upper_left = sub_V3F32(h_camera.center, 
-                                          vec3F32(0.0f, 0.0f, h_camera.focal_length));
+      vec3F32(0.0f, 0.0f, h_camera.focal_length));
   viewport_upper_left = sub_V3F32(viewport_upper_left, scale_V3F32(0.5f, h_viewport.u));
   viewport_upper_left = sub_V3F32(viewport_upper_left, scale_V3F32(0.5f, h_viewport.v));
   h_viewport.upper_left = viewport_upper_left;
-  
+
   // pixel_origin = upper_left + 0.5 * (delta u + delta v)
   Vec3F32 pixel_delta_sum = add_V3F32(h_viewport.pixel_delta_u, h_viewport.pixel_delta_v);
   h_viewport.pixel_origin = add_V3F32(viewport_upper_left, 
-                                      scale_V3F32(0.5f, pixel_delta_sum));
+      scale_V3F32(0.5f, pixel_delta_sum));
 
   cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0, 
-                             cudaMemcpyHostToDevice);
+      cudaMemcpyHostToDevice);
   CUDA_CHECK(cuErr);
   LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n", 
       h_viewport.width, h_viewport.height, h_viewport.aspect_ratio);
 
+
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // Setup entities and copy to device
+  U64 entity_list_size = sizeof(Entity)*MAX_NUM_ENTITIES;
+  Entity *h_entities = (Entity *)malloc(entity_list_size);
+  for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
+  {
+    // Init all entities to nil
+    //h_entities[i] = {0};
+    //h_entities[i].kind = EntityKind_Nil;
+    h_entities[i] = nil_entity;
+  }
+
+  // Manual spheres
+  {
+    h_entities[0].kind = EntityKind_Sphere;
+    h_entities[0].center = vec3F32(0.0f, 0.0f, -1.0f);
+    h_entities[0].radius = 0.5f;
+
+    h_entities[1].kind = EntityKind_Sphere;
+    h_entities[1].center = vec3F32(0.0f, -100.5f, -1.0f);
+    h_entities[1].radius = 100.0f;
+  }
+
+  // Copy to device
+  Entity *entities = 0;
+  cuErr = cudaMalloc(&entities, entity_list_size);
+  CUDA_CHECK(cuErr);
+  cuErr = cudaMemcpy(entities, h_entities, entity_list_size, cudaMemcpyHostToDevice);
+  CUDA_CHECK(cuErr);
+
+
   //////////////////////////////////////////////////////////////////////////////////////////
   // Define grid, blocks, threads and any buffers such as pixel data and random state
   // ------------
@@ -436,6 +580,8 @@ int main()
   cuErr = cudaMalloc(&d_rand_state, num_pixels*sizeof(curandState));
   CUDA_CHECK(cuErr);
 
+
+
   //////////////////////////////////////////////////////////////////////////////////////////
   // Initialise CUDA state such as random number states per thread.
   // This is separate for performance measurements
@@ -456,7 +602,7 @@ int main()
   LOG("threads per block: (%i, %i %i) \n", 
       threads_per_block.x, threads_per_block.y, threads_per_block.z);
 
-  cuda_main<<<blocks_per_grid, threads_per_block>>>(pixel_buffer, idxbuffer);
+  cuda_main<<<blocks_per_grid, threads_per_block>>>(entities, pixel_buffer, idxbuffer);
   cuErr = cudaGetLastError();
   CUDA_CHECK(cuErr);
   cuErr = cudaDeviceSynchronize();
@@ -468,19 +614,19 @@ int main()
   // ------------
   Vec3F32 *h_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size);
   cuErr = cudaMemcpy(h_pixel_buffer, pixel_buffer, pixel_buffer_size,
-                     cudaMemcpyDeviceToHost);
+      cudaMemcpyDeviceToHost);
   CUDA_CHECK(cuErr);
 
   // TODO(anton): remove debug buffer
   U32 *h_idxbuffer = (U32 *)malloc(num_pixels*sizeof(U32));
   cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32), 
-                     cudaMemcpyDeviceToHost);
+      cudaMemcpyDeviceToHost);
 
   write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, h_idxbuffer);
- 
+
   cuErr = cudaFree(pixel_buffer);
   CUDA_CHECK(cuErr);
-  
+
   return 0;
 }
 
diff --git a/timeBuild.ctm b/timeBuild.ctm
index 13b36a6..a58225b 100644
Binary files a/timeBuild.ctm and b/timeBuild.ctm differ