From fbf9421843afc69750ae4b2534026bc962e0d7d1 Mon Sep 17 00:00:00 2001
From: Anton Ljungdahl <anton.ljungdahl@fysik.su.se>
Date: Wed, 23 Apr 2025 22:57:31 +0200
Subject: [PATCH] random numbers

---
 src/main.cu   | 236 +++++++++++++++++++-------------------------------
 timeBuild.ctm | Bin 1124 -> 1252 bytes
 2 files changed, 89 insertions(+), 147 deletions(-)

diff --git a/src/main.cu b/src/main.cu
index 0e9c1bc..3de8a4a 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -1,6 +1,9 @@
 #include <stdio.h>
 #include <stdint.h>
 
+#include <curand_kernel.h>
+
+//------------------------------------------------------------------------------------------
 //~ base defines
 
 #define global static
@@ -12,9 +15,6 @@ typedef uint32_t U32;
 typedef uint64_t U64;
 typedef float F32;
 
-
-
-
 //~ utility defines
 
 #define CUDA_CHECK(err) do { \
@@ -36,7 +36,9 @@ typedef float F32;
 #define IMAGE_WIDTH 1920
 #define ASPECT_RATIO 1.7778f // 16/9
 
-//------------------------------------------------------------------------------
+#define CURAND_SEED 1984
+
+//------------------------------------------------------------------------------------------
 //~ structs
 
 typedef union Vec3F32 Vec3F32;
@@ -98,7 +100,7 @@ struct ImageF32
   U32 total_num_pixels;
 };
 
-//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------
 //~ host globals
 
 //~ device globals
@@ -107,7 +109,7 @@ __constant__ CameraF32 camera;
 __constant__ ViewportF32 viewport;
 __constant__ ImageF32 image;
 
-//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------
 //~ routines
 
 
@@ -183,7 +185,10 @@ __device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b)
 }
 
 
-__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32 image_height, U32 *idx_buffer) 
+__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, 
+                                           U32 image_width, 
+                                           U32 image_height, 
+                                           U32 *idx_buffer) 
 {
 
   const char *filename = "output.ppm";
@@ -195,12 +200,14 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32
 
   // Write PPM header. First it has "P3" by itself to indicate ASCII colors,
   fprintf(file, "P3\n");
-  // The row below will say the dimensions of the image: (width, height) <-> (num columns, num rows)
+  // The row below will say the dimensions of the image:
+  // (width, height) <-> (num columns, num rows)
   fprintf(file, "%i %i\n", image_width, image_height);
   // Then we have a value for the maximum pixel color
   fprintf(file, "255\n");
-  // Then we have all the lines with pixel data, it will be three values for each column j on a row i, corresponding
-  // to a pixel with index (i,j).
+  // Then we have all the lines with pixel data, 
+  // it will be three values for each column j on a row i,
+  // corresponding to a pixel with index (i,j).
   for(U32 i = 0; i < image_height; i += 1)
   {
     for(U32 j = 0; j < image_width; j +=1)
@@ -228,22 +235,30 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32
 
 __device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r)
 {
-  // We take the quadratic formula -b +- sqrt(b*b-4ac) / 2a,
+  // We take the quadratic formula -b/2a +- sqrt(b*b-4ac) / 2a,
   // and we calculate only the sqrt part. If there is a hit with the sphere we either
-  // have two solutions (positive sqrt), one solution (zero sqrt) or no solution (negative sqrt).
-  // If we have no solution we have no hit on the sphere centered at center, with the given radius.
+  // have two solutions (positive sqrt), one solution (zero sqrt) 
+  // or no solution (negative sqrt).
+  // If we have no solution we have no hit on
+  // the sphere centered at center, with the given radius.
+
+  // Note that we can simplify this, since we always get b = -2(D . (C-Q)), and if
+  // we say b = -2h in the quadradic formula, we get
+  // -(-2h)/2a +- sqrt((-2h)**2 - 4ac) / 2a which expands to
+  // 2h/2a +- 2sqrt(h*h - ac)/2a, simplifying to (h +- sqrt(h*h - ac))/a.
+  // So we use this simplification to optimise away some operations
 
   // Compare lines with RTIOW
   // (C-Q)
   Vec3F32 oc = sub_V3F32(center, r.origin);
   // a = D.D
   F32 a = dot_V3F32(r.direction, r.direction);
-  // b = -2D . (C-Q)
-  F32 b = dot_V3F32(scale_V3F32(-2.0f, r.direction), oc);
+  // h = D . (C-Q)
+  F32 h = dot_V3F32(r.direction, oc);
   // c = (C-Q) . (C-Q) - r*r
   F32 c = dot_V3F32(oc, oc) - radius*radius;
 
-  F32 discriminant = b*b - 4*a*c;
+  F32 discriminant = h*h - a*c;
 
   // We are actually solving for the parameter t in the expression of a point P(t) that 
   // intersects the sphere. This is the quadratic problem we get by solving for t in 
@@ -256,9 +271,9 @@ __device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r)
   }
   else
   {
-    // t = (-b += sqrt(b*b-4ac))/2a, and here we take the smallest solution to get the point 
+    // t = (h += sqrt(h*h-ac))/a, and here we take the smallest solution to get the point 
     // on the sphere closest to the ray origin.
-    out = (-b - __fsqrt_rn(discriminant))/(2*a); 
+    out = (h - __fsqrt_rn(discriminant))/a; 
   }
 
   return out;
@@ -273,8 +288,7 @@ __global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer)
   U32 y = blockIdx.y * blockDim.y + threadIdx.y;
   U32 idx = y * image.width + x;
 
-  if(x >= image.width || y >= image.height) return;
-
+  if(x < image.width && y < image.height)
   {
     Vec3F32 px_u = scale_V3F32((F32)x, viewport.pixel_delta_u);
     Vec3F32 px_v = scale_V3F32((F32)y, viewport.pixel_delta_v);
@@ -324,11 +338,27 @@ __global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer)
 
 }
 
+__global__ function void cuda_init_state(curandState *rand_state) 
+{
 
+  U32 x = threadIdx.x + blockIdx.x * blockDim.x;
+  U32 y = threadIdx.y + blockIdx.y * blockDim.y;
+
+  if(x < image.width && y < image.height)
+  {
+    U32 idx = y * image.width + x;
+    curand_init(CURAND_SEED, idx, 0, &rand_state[idx]);
+  }
+  
+}
+
+//------------------------------------------------------------------------------------------
+//~ Main
 int main() 
 {
   cudaError_t cuErr;
 
+  //////////////////////////////////////////////////////////////////////////////////////////
   // Define image, camera and viewport on the CPU 
   // and then copy to constant globals on device
   // -------------
@@ -381,7 +411,9 @@ int main()
   LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n", 
       h_viewport.width, h_viewport.height, h_viewport.aspect_ratio);
 
-  // Define grid, blocks, threads and pixel buffers
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // Define grid, blocks, threads and any buffers such as pixel data and random state
+  // ------------
   U32 num_pixels = h_image.total_num_pixels;
   U64 pixel_buffer_size = num_pixels*sizeof(Vec3F32);
 
@@ -395,26 +427,54 @@ int main()
   cuErr = cudaMalloc(&pixel_buffer, pixel_buffer_size);
   CUDA_CHECK(cuErr);
 
+  // This is just a debug buffer, TODO(anton): remove
   U32 *idxbuffer = 0;
   cuErr = cudaMalloc(&idxbuffer, sizeof(U32)*num_pixels);
+  CUDA_CHECK(cuErr);
+
+  curandState *d_rand_state = 0;
+  cuErr = cudaMalloc(&d_rand_state, num_pixels*sizeof(curandState));
+  CUDA_CHECK(cuErr);
+
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // Initialise CUDA state such as random number states per thread.
+  // This is separate for performance measurements
+  // ------------
+  cuda_init_state<<<blocks_per_grid, threads_per_block>>>(d_rand_state);
+  cuErr = cudaGetLastError();
+  CUDA_CHECK(cuErr);
+  cuErr = cudaDeviceSynchronize();
+  CUDA_CHECK(cuErr);
+
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // Launch the main CUDA kernel, each thread will color a pixel and store it 
+  // in the pixel buffer.
+  // ------------
 
-  
   LOG("Launching main kernel with \n blocks per grid: (%i, %i, %i) \n",
       blocks_per_grid.x, blocks_per_grid.y, blocks_per_grid.z);
   LOG("threads per block: (%i, %i %i) \n", 
       threads_per_block.x, threads_per_block.y, threads_per_block.z);
 
   cuda_main<<<blocks_per_grid, threads_per_block>>>(pixel_buffer, idxbuffer);
-  cudaDeviceSynchronize();
-
-
-
-  Vec3F32 *h_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size);
-  cuErr = cudaMemcpy(h_pixel_buffer, pixel_buffer, pixel_buffer_size, cudaMemcpyDeviceToHost);
+  cuErr = cudaGetLastError();
+  CUDA_CHECK(cuErr);
+  cuErr = cudaDeviceSynchronize();
   CUDA_CHECK(cuErr);
 
+
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // Copy the pixel buffer back from the device and write it to an image file.
+  // ------------
+  Vec3F32 *h_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size);
+  cuErr = cudaMemcpy(h_pixel_buffer, pixel_buffer, pixel_buffer_size,
+                     cudaMemcpyDeviceToHost);
+  CUDA_CHECK(cuErr);
+
+  // TODO(anton): remove debug buffer
   U32 *h_idxbuffer = (U32 *)malloc(num_pixels*sizeof(U32));
-  cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32), cudaMemcpyDeviceToHost);
+  cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32), 
+                     cudaMemcpyDeviceToHost);
 
   write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, h_idxbuffer);
  
@@ -424,121 +484,3 @@ int main()
   return 0;
 }
 
-/**
-
-
-
-function void write_test_ppm(U32 image_width, U32 image_height) 
-{
-
-  const char *filename = "test_output.ppm";
-  FILE *file = fopen(filename, "w");
-  if(!file) 
-  {
-    LOG("Error opening file %s \n", filename);
-  }
-
-  // Write PPM header. First it has "P3" by itself to indicate ASCII colors,
-  fprintf(file, "P3\n");
-  // The row below will say the dimensions of the image: (width, height) <-> (num columns, num rows)
-  fprintf(file, "%i %i\n", image_width, image_height);
-  // Then we have a value for the maximum pixel color
-  fprintf(file, "255\n");
-  // Then we have all the lines with pixel data, it will be three values for each column j on a row i, corresponding
-  // to a pixel with index (i,j).
-  for(U32 i = 0; i < image_height; i += 1)
-  {
-    for(U32 j = 0; j < image_width; j +=1)
-    {
-      // We represent RGB values by floats internally and scale to integer values
-      F32 r = float(j) / (float(image_width)-1.0f);
-      F32 g = float(i) / (float(image_height)-1.0f);
-      F32 b = 0.0f;// - (float(j)/(float(image_width)-1.0f);
-
-      U32 ir = int(255.999f * r);
-      U32 ig = int(255.999f * g);
-      U32 ib = int(255.999f * b);
-
-      fprintf(file, "%i %i %i ", ir, ig, ib);
-    }
-    fprintf(file, "\n");
-  }
-
-  fclose(file);
-}
-
-__device__ void modify_array(U32 *arr, U32 N) 
-{
-  U32 i = threadIdx.x;
-  if(i < N)
-  {
-    arr[i] = i;
-  }
-
-  return;
-}
-
-
-__global__ void hello_from_device(U32 *arr, U32 N) 
-{
-
-  modify_array(arr, N);
-
-
-  return;
-}
-
-
-
-int hello_cuda() 
-{
-
-  LOG("Hello from cpu\n");
-  
-  
-  U32 N = NUM_THREADS;
-  U32 arr_size = N * sizeof(U32);
-
-
-  U32 *arr = (U32 *)malloc(arr_size);
-  memset(arr, 0, arr_size);
-
-  cudaError_t cuErr;
-  U32 *d_arr = 0;
-
-
-
-  cuErr = cudaMalloc(&d_arr, arr_size);
-  CUDA_CHECK(cuErr);
-
-  cuErr = cudaMemcpy(d_arr, arr, arr_size, cudaMemcpyHostToDevice);
-  CUDA_CHECK(cuErr);
-
-  LOG("Array before CUDA \n");
-  for(U32 i = 0; i < N; i += 1) 
-  {
-    LOG("%i: %i \n", i, arr[i]);
-  }
-  LOG("\n");
-
-
-  hello_from_device<<<NUM_BLOCKS, NUM_THREADS>>>(d_arr, N);
-  
-  cuErr = cudaMemcpy(arr, d_arr, arr_size, cudaMemcpyDeviceToHost);
-  CUDA_CHECK(cuErr);
-
-  cuErr = cudaFree(d_arr);
-  CUDA_CHECK(cuErr);
-
-  LOG("Array after CUDA \n");
-  for(U32 i = 0; i < N; i += 1) 
-  {
-    LOG("%i \n", arr[i]);
-  }
-  LOG("\n");
-
-
-  return 0;
-}
-
-*/
diff --git a/timeBuild.ctm b/timeBuild.ctm
index 20697c4b1c19a07a1849d6bd59a92dc4b798b5ed..13b36a67cc0b1ad03bad977818f8f67166b21f2b 100644
GIT binary patch
delta 137
zcmaFD@q}|j3QIl1{j;~rw%lf9W?*2r&(6T`ZS%eZ)lmKgAm8LYvqT+`&j^&C!ot9?
w^`)yKjK3Mkcli)&4C903jpwTsBlFKT$<K!ITY&P<nL`hQ_#pcdxEL5209c?YJpcdz

delta 7
OcmaFD`GjLb3JU-Y=>qZq