From fe726e9f493f09733db06d377484423dc54d9722 Mon Sep 17 00:00:00 2001
From: Anton Ljungdahl <anton.ljungdahl@fysik.su.se>
Date: Fri, 25 Apr 2025 22:15:17 +0200
Subject: [PATCH] antialising working!

---
 src/main.cu   | 257 +++++++++++++++++++++++++++++++-------------------
 timeBuild.ctm | Bin 1268 -> 2228 bytes
 2 files changed, 159 insertions(+), 98 deletions(-)

diff --git a/src/main.cu b/src/main.cu
index 88c8c0b..ce3bccc 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -31,18 +31,19 @@ typedef float F32;
 #define F32_MAX FLT_MAX
 #define F32_MIN FLT_MIN
 
-//~ test defines
+
+//------------------------------------------------------------------------------------------
+//~ Program parameter defines
 #define NUM_BLOCKS   1
 #define NUM_THREADS  32
 
-
 #define IMAGE_WIDTH 1920
 #define ASPECT_RATIO 1.7778f // 16/9
 
 #define CURAND_SEED 1984
 
-
 #define MAX_NUM_ENTITIES 64
+#define SAMPLES_PER_PIXEL 32
 
 //------------------------------------------------------------------------------------------
 //~ structs
@@ -101,6 +102,7 @@ struct CameraF32
   Vec3F32 center;
   Vec3F32 up;
   F32 focal_length;
+  F32 pixel_sample_scale;
 };
 
 typedef struct ImageF32 ImageF32;
@@ -227,21 +229,22 @@ __device__ function F32 surrounds_RngF32(RngF32 rng, F32 val)
   return out;
 }
 
-__device__ function F32 contains_RngF32(RngF32 rng, F32 val)
-{
-  F32 out = (rng.min <= val) && (val <= rng.max);
-  return out;
-}
-
-__device__ function F32 size_RngF32(RngF32 rng)
-{
-  return rng.max-rng.min;
-}
+//
+//__device__ function F32 contains_RngF32(RngF32 rng, F32 val)
+//{
+//  F32 out = (rng.min <= val) && (val <= rng.max);
+//  return out;
+//}
+//
+//__device__ function F32 size_RngF32(RngF32 rng)
+//{
+//  return rng.max-rng.min;
+//}
+//
 
 __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, 
                                            U32 image_width, 
-                                           U32 image_height, 
-                                           U32 *idx_buffer) 
+                                           U32 image_height)
 {
 
   const char *filename = "output.ppm";
@@ -267,9 +270,7 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer,
     {
       // We represent RGB values by floats internally and scale to integer values
       U32 idx = i * image_width + j;
-      if(idx_buffer[idx] != 0) {
-        //LOG("idx %i, idxbuffer[idx] = %i \n", idx, idx_buffer[idx]);
-      }
+
       F32 r = buffer[idx].r;
       F32 g = buffer[idx].g;
       F32 b = buffer[idx].b;
@@ -286,9 +287,26 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer,
   fclose(file);
 }
 
+__device__ function F32
+clamp_F32(RngF32 rng, F32 val)
+{
+  F32 out = fmaxf(rng.min, val);
+  out = fminf(val, rng.max);
+  return out;
+}
 
-__device__ function HitRecord hit_sphere(Vec3F32 center, F32 radius, 
-    RayF32 ray, RngF32 range)
+__device__ function Vec3F32 
+clamp_V3F32(RngF32 rng, Vec3F32 v)
+{
+  Vec3F32 out = {0};
+  out.x = clamp_F32(rng, v.x);
+  out.y = clamp_F32(rng, v.y);
+  out.z = clamp_F32(rng, v.z);
+  return out;
+}
+
+__device__ function HitRecord 
+hit_sphere(Vec3F32 center, F32 radius, RayF32 ray, RngF32 range)
 {
   HitRecord out = {0};
   // We take the quadratic formula -b/2a +- sqrt(b*b-4ac) / 2a,
@@ -369,9 +387,98 @@ __device__ function HitRecord hit_sphere(Vec3F32 center, F32 radius,
   return out;
 }
 
+__device__ function RayF32 
+ray_get_F32(F32 x, F32 y, Vec3F32 cam_center, curandState local_rand_state)
+{
+  
+    RayF32 out = {0};
+    
+    // We have unit vectors delta_u and delta_v in the horizontal and vertical viewport directions.
+    Vec3F32 px_u = scale_V3F32(x, viewport.pixel_delta_u);
+    Vec3F32 px_v = scale_V3F32(y, viewport.pixel_delta_v);
+    Vec3F32 pixel_center = add_V3F32(viewport.pixel_origin, add_V3F32(px_u, px_v));
 
+    // To get anti-aliasing we make a random offset from the pixel center
+    F32 rand_u = curand_uniform(&local_rand_state) - 0.5f;
+    F32 rand_v = curand_uniform(&local_rand_state) - 0.5f;
+    // the rand u and rand v are offsets from a pixel in the [-0.5, 0.5] square.
+    // We need to put that into the world space of our viewport
+    Vec3F32 offset_u = scale_V3F32(rand_u, viewport.pixel_delta_u);
+    Vec3F32 offset_v = scale_V3F32(rand_v, viewport.pixel_delta_v);
 
-__global__ void cuda_main(Entity *entities, Vec3F32 *pixelbuffer, U32 *idxbuffer) 
+    // Then we shift the pixel center with the offsets in both directions
+    Vec3F32 pixel_sample = add_V3F32(pixel_center, add_V3F32(offset_u, offset_v));
+    // With a randomised point around the pixel center we can define the ray direction
+    // as the vector from the camera center to the point on the viewport.
+    Vec3F32 ray_direction = sub_V3F32(pixel_sample, camera.center);
+
+    out.origin = camera.center;
+    out.direction = ray_direction;
+    return out;
+}
+
+// Trace a ray and get a pixel color sample
+__device__ function Vec3F32 
+get_sample_color(RayF32 ray, Entity *entities)
+{
+  Vec3F32 out = {0};
+
+  RngF32 hit_range = {F32_MIN, F32_MAX};
+  HitRecord hit_rec = {0};
+  for(U32 entity_idx = 0; entity_idx < MAX_NUM_ENTITIES; entity_idx += 1)
+  {
+    Entity *entity = &entities[entity_idx];
+    switch(entity->kind)
+    {
+      case EntityKind_Nil:
+        {
+          // no op
+        } break;
+
+      case EntityKind_Sphere:
+        {
+          HitRecord temp_hit_rec = hit_sphere(entity->center, entity->radius, 
+              ray, hit_range);
+          if(temp_hit_rec.hit) 
+          {
+            hit_rec = temp_hit_rec;
+            hit_range.max = hit_rec.t;
+          }
+
+        } break;
+    } // end switch entity kind
+
+  } 
+
+  Vec3F32 sample_pixel_color = vec3F32(0.0f, 0.0f, 0.0f);
+  if(hit_rec.hit)
+  {
+    // Paint entity
+    sample_pixel_color = add_V3F32(hit_rec.normal, vec3F32(1.0f, 1.0f, 1.0f));
+    sample_pixel_color = scale_V3F32(0.5f, sample_pixel_color);
+    // debug
+    //sample_pixel_color = vec3F32(1.0f, 0.0f, 0.0f);
+  }
+  else 
+  {
+    // Paint background gradient
+    F32 norm = norm_V3F32(ray.direction);
+    Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction);
+    Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f);
+    Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f);
+
+    // Lerp between white and light blue depending on y position
+    F32 blend = 0.5f*(unit_dir.y + 1.0f);
+
+    sample_pixel_color = lerp_V3F32(blend, white, light_blue);
+  }
+
+  out = sample_pixel_color;
+  return out;
+}
+
+  __global__ void 
+cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state) 
 {
 
   U32 x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -380,72 +487,30 @@ __global__ void cuda_main(Entity *entities, Vec3F32 *pixelbuffer, U32 *idxbuffer
 
   if(x < image.width && y < image.height)
   {
-    Vec3F32 px_u = scale_V3F32((F32)x, viewport.pixel_delta_u);
-    Vec3F32 px_v = scale_V3F32((F32)y, viewport.pixel_delta_v);
-    Vec3F32 pixel_center = add_V3F32(viewport.pixel_origin, add_V3F32(px_u, px_v));
 
-    // TODO(anton): Maybe we dont need some ray structure here..
-    Vec3F32 ray_direction = sub_V3F32(pixel_center, camera.center);
-    RayF32 ray = {0};
-    ray.origin = camera.center;
-    ray.direction = ray_direction;
-
-
-    RngF32 hit_range = {F32_MIN, F32_MAX};
-    HitRecord hit_rec = {0};
-    for(U32 entity_idx = 0; entity_idx < MAX_NUM_ENTITIES; entity_idx += 1)
+    // We are adding all samples and then dividing by num samples to get the mean, so 
+    // we initialise the color for this pixel to black.
+    // Loop over all pixel samples
+    Vec3F32 pixel_color = vec3F32(0.0f, 0.0f, 0.0f);
+    for(U32 sample_idx = 0; sample_idx < SAMPLES_PER_PIXEL; sample_idx += 1)
     {
-      Entity *entity = &entities[entity_idx];
-      switch(entity->kind)
-      {
-        case EntityKind_Nil:
-        {
-          // no op
-        } break;
 
-        case EntityKind_Sphere:
-        {
-          HitRecord temp_hit_rec = hit_sphere(entity->center, entity->radius, 
-                                              ray, hit_range);
-          if(temp_hit_rec.hit) 
-          {
-            hit_rec = temp_hit_rec;
-            hit_range.max = hit_rec.t;
-          }
+      // TODO(anton): Maybe we can randomise things directly here as the
+      // nvidia accelerated version, where we just put the x, y indices with a 
+      // randomised shift and normalise to viewport space by dividing by max x, max y
+      RayF32 ray = ray_get_F32((F32)x, (F32)y, camera.center, rand_state[idx]);
+      
+      Vec3F32 sample_pixel_color = get_sample_color(ray, entities);
 
-        } break;
-      } // end switch entity kind
-
-    } 
-
-    Vec3F32 pixel_color = {0.0f, 0.0f, 0.0f};
-    if(hit_rec.hit)
-    {
-      // Paint entity
-      pixel_color = add_V3F32(hit_rec.normal, vec3F32(1.0f, 1.0f, 1.0f));
-      pixel_color = scale_V3F32(0.5f, pixel_color);
-    }
-    else 
-    {
-      // Paint background gradient
-      F32 norm = norm_V3F32(ray.direction);
-      Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction);
-      Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f);
-      Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f);
-
-      // Lerp between white and light blue depending on y position
-      F32 blend = 0.5f*(unit_dir.y + 1.0f);
-
-      pixel_color = lerp_V3F32(blend, white, light_blue);
+      F32 debug_sample = curand_uniform(&rand_state[idx]);
+      Vec3F32 debug = vec3F32(debug_sample, debug_sample, debug_sample);
+      //pixel_color = add_V3F32(pixel_color, debug);
+      pixel_color = add_V3F32(pixel_color, sample_pixel_color);
     }
 
-    pixelbuffer[idx] = pixel_color;
-
-    //pixelbuffer[idx].x = (F32)x/(F32)image.width;
-    //pixelbuffer[idx].y = (F32)y/(F32)image.height;
-    //pixelbuffer[idx].z = 0.0f;
-
-    idxbuffer[idx] = idx;
+    pixel_color = scale_V3F32(1.0f/(F32)SAMPLES_PER_PIXEL, pixel_color);
+    RngF32 clamp_range = {0.0f, 1.0f};
+    pixelbuffer[idx] = clamp_V3F32(clamp_range, pixel_color);
   }
 
 }
@@ -488,6 +553,9 @@ int main()
   // -------------
   CameraF32 h_camera = {0};
   h_camera.focal_length = 1.0f;
+  F32 samples_per_pixel = (F32)SAMPLES_PER_PIXEL;
+  h_camera.pixel_sample_scale = 1.0f/samples_per_pixel;
+
   cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0, 
       cudaMemcpyHostToDevice);
   CUDA_CHECK(cuErr);
@@ -515,7 +583,7 @@ int main()
   // pixel_origin = upper_left + 0.5 * (delta u + delta v)
   Vec3F32 pixel_delta_sum = add_V3F32(h_viewport.pixel_delta_u, h_viewport.pixel_delta_v);
   h_viewport.pixel_origin = add_V3F32(viewport_upper_left, 
-      scale_V3F32(0.5f, pixel_delta_sum));
+                                      scale_V3F32(0.5f, pixel_delta_sum));
 
   cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0, 
       cudaMemcpyHostToDevice);
@@ -571,22 +639,16 @@ int main()
   cuErr = cudaMalloc(&pixel_buffer, pixel_buffer_size);
   CUDA_CHECK(cuErr);
 
-  // This is just a debug buffer, TODO(anton): remove
-  U32 *idxbuffer = 0;
-  cuErr = cudaMalloc(&idxbuffer, sizeof(U32)*num_pixels);
+  curandState *rand_state = 0;
+  cuErr = cudaMalloc(&rand_state, num_pixels*sizeof(curandState));
   CUDA_CHECK(cuErr);
 
-  curandState *d_rand_state = 0;
-  cuErr = cudaMalloc(&d_rand_state, num_pixels*sizeof(curandState));
-  CUDA_CHECK(cuErr);
-
-
 
   //////////////////////////////////////////////////////////////////////////////////////////
   // Initialise CUDA state such as random number states per thread.
   // This is separate for performance measurements
   // ------------
-  cuda_init_state<<<blocks_per_grid, threads_per_block>>>(d_rand_state);
+  cuda_init_state<<<blocks_per_grid, threads_per_block>>>(rand_state);
   cuErr = cudaGetLastError();
   CUDA_CHECK(cuErr);
   cuErr = cudaDeviceSynchronize();
@@ -602,7 +664,7 @@ int main()
   LOG("threads per block: (%i, %i %i) \n", 
       threads_per_block.x, threads_per_block.y, threads_per_block.z);
 
-  cuda_main<<<blocks_per_grid, threads_per_block>>>(entities, pixel_buffer, idxbuffer);
+  cuda_main<<<blocks_per_grid, threads_per_block>>>(entities, pixel_buffer, rand_state);
   cuErr = cudaGetLastError();
   CUDA_CHECK(cuErr);
   cuErr = cudaDeviceSynchronize();
@@ -617,15 +679,14 @@ int main()
       cudaMemcpyDeviceToHost);
   CUDA_CHECK(cuErr);
 
-  // TODO(anton): remove debug buffer
-  U32 *h_idxbuffer = (U32 *)malloc(num_pixels*sizeof(U32));
-  cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32), 
-      cudaMemcpyDeviceToHost);
-
-  write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, h_idxbuffer);
+  write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height);
 
   cuErr = cudaFree(pixel_buffer);
   CUDA_CHECK(cuErr);
+  cuErr = cudaFree(entities);
+  CUDA_CHECK(cuErr);
+  cuErr = cudaFree(rand_state);
+  CUDA_CHECK(cuErr);
 
   return 0;
 }
diff --git a/timeBuild.ctm b/timeBuild.ctm
index a58225bb5bbf00a5a1f80e30d98c73d82bd01449..521cfba5c6f63d073b537242589e73b04e1dc9ae 100644
GIT binary patch
delta 975
zcmYk)Ur1AN6bJBQGxJ`})K>W?(}|ct)@T+H!jf=%DN7h=wAvK+5GZOQH5lHaC<*Ek
z+ruDm21R23_8>|SPKlsPU|_4jy_6v%VG?t#6+=5`{SNu~Ip5#;{m$?HZbF(U3MR4@
zYjUzjnv7&+0e~+D;PR6{r)6Q|W&rQJ)K1o69+Lrdhb=*NEBc%F+*@9M9M|`H{0p4-
zVeWiYQ<f+Co#@5K=Z)+p85D3w>|dxr0~m0FxNXzf0?`o22Z+|r46^5NgSGW=jQt#~
zjSY`%6YEv<s>P}=6rM(#9)*vwX`pUbZyQ^c6@c|%<>LPWPP}D&ph#>$2iT;VDEkFE
z{q8`h7;_rffdkiHa=&5EX9K(pc{KI}df0sGJ^MBKny);<u2OJ=#<`&=73dSw4N98_
zTDJ?+#F3v{`#1Wl>9<EO1~`NLV+}2L$kb2ke>LoT$i9khcS@r>M6RGK`){4t$)>`|
zap@u#sG-qiOOfe@^be@HN7I~FV}3UluihncGv>)YLmin0Sd{?kx_q5%d}An>x*lND
zKX5R0dq!BvOgv(^_lXzK0Qv`pB{qGL{P@iuY<iNG6;qr|KVgqWO_cDB(9yk}ZKc8;
zXqP<XW77aR{;*b-$?^T^0DHx=5B~}Pbi>q<(Ef6<0S$D2I;64#=-Ka%r))YPz0jnw
z=>UKC=#AZ?-<=0=VfjjuO}TxgxUhn9t9bL}k88D=0-6r=mehaVXoq&lyhm(EPi7pn
IG_!T+zaIbuJOBUy

delta 7
OcmdlY_=R)B7Zv~xJp*e1