working gradient on gpu

2025-04-23 15:23:20 +02:00 · 2025-04-23 15:23:20 +02:00 · a49709d3de
commit a49709d3de
parent 4500fcbb18
3 changed files with 1461 additions and 7 deletions
--- a/build/output.ppm
+++ b/build/output.ppm
--- a/src/main.cu
+++ b/src/main.cu
@ -1,11 +1,22 @@
 #include <stdio.h>
+#include <stdint.h>

+//~ base defines

-typedef int S32;
-typedef unsigned int U32;
+#define global static
+#define function static
+
+//~ typedefs 
+typedef int32_t S32;
+typedef uint32_t U32;
+typedef uint64_t U64;
 typedef float F32;


+
+
+//~ utility defines
+
 #define CUDA_CHECK(err) do { \
  if (err != cudaSuccess) { \
    fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \
@ -14,18 +25,378 @@ typedef float F32;
  } \
 } while (0)

-
-
 #define LOG printf


-
+//~ test defines
 #define NUM_BLOCKS   1
 #define NUM_THREADS  32


+#define IMAGE_WIDTH 1920
+#define ASPECT_RATIO 1.7778f // 16/9
+
+//------------------------------------------------------------------------------
+//~ structs
+
+typedef union Vec3F32 Vec3F32;
+union Vec3F32
+{
+  struct 
+  {
+    F32 x;
+    F32 y;
+    F32 z;
+  };
+  F32 v[3];
+};


+typedef struct RayF32 RayF32;
+struct RayF32
+{
+  Vec3F32 origin;
+  Vec3F32 direction;
+};
+
+
+typedef struct ViewportF32 ViewportF32;
+
+struct ViewportF32 
+{
+  F32 width;
+  F32 height;
+  F32 aspect_ratio;
+  Vec3F32 u; // along horizontal edge, right from top left corner
+  Vec3F32 v; // along vertical edge, down from top left corner
+  Vec3F32 upper_left;
+  Vec3F32 pixel_origin;
+  Vec3F32 pixel_delta_u;
+  Vec3F32 pixel_delta_v;
+};
+
+typedef struct CameraF32 CameraF32;
+struct CameraF32 
+{
+  Vec3F32 center;
+  Vec3F32 up;
+  F32 focal_length;
+};
+
+typedef struct ImageF32 ImageF32;
+struct ImageF32 
+{
+  U32 width;
+  U32 height;
+  F32 aspect_ratio;
+  U32 total_num_pixels;
+};
+
+//------------------------------------------------------------------------------
+//~ host globals
+
+//~ device globals
+
+__constant__ CameraF32 camera;
+__constant__ ViewportF32 viewport;
+__constant__ ImageF32 image;
+
+//------------------------------------------------------------------------------
+//~ routines
+
+
+__host__ __device__ function Vec3F32 vec3F32(F32 x, F32 y, F32 z)
+{
+  Vec3F32 out = {0};
+  out.x = x;
+  out.y = y;
+  out.z = z;
+  return out;
+}
+
+
+__host__ __device__ function Vec3F32 add_V3F32(Vec3F32 a, Vec3F32 b)
+{
+  Vec3F32 out = {0};
+  out.x = a.x + b.x;
+  out.y = a.y + b.y;
+  out.z = a.z + b.z;
+  return out;
+}
+
+__host__ __device__ function Vec3F32 sub_V3F32(Vec3F32 a, Vec3F32 b)
+{
+  Vec3F32 out = {0};
+  out.x = a.x-b.x;
+  out.y = a.y-b.y;
+  out.z = a.z-b.z;
+  return out;
+}
+
+
+
+__host__ __device__ function Vec3F32 scale_V3F32(F32 s, Vec3F32 v)
+{
+  Vec3F32 out = {0};
+  out.x = s*v.x;
+  out.y = s*v.y;
+  out.z = s*v.z;
+  return out;
+}
+
+__device__ function Vec3F32 ray_point_F32(F32 t, RayF32 *ray)
+{
+  Vec3F32 out = add_V3F32(ray->origin, scale_V3F32(t, ray->direction));
+  return out;
+}
+
+__device__ function F32 mag_V3F32(Vec3F32 a)
+{
+  return a.x*a.x + a.y*a.y + a.z*a.z;
+}
+
+__device__ function F32 norm_V3F32(Vec3F32 a)
+{
+   F32 mag = mag_V3F32(a);
+   return __fsqrt_rn(mag);
+}
+
+__device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b)
+{
+  Vec3F32 lerp_term1 = scale_V3F32(1.0f-s, a);
+  Vec3F32 lerp_term2 = scale_V3F32(s, b);
+  Vec3F32 lerp_result = add_V3F32(lerp_term1, lerp_term2);
+
+  return lerp_result;
+}
+
+
+
+
+
+function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32 image_height, U32 *idx_buffer) 
+{
+
+  const char *filename = "output.ppm";
+  FILE *file = fopen(filename, "w");
+  if(!file) 
+  {
+    LOG("Error opening file %s \n", filename);
+  }
+
+  // Write PPM header. First it has "P3" by itself to indicate ASCII colors,
+  fprintf(file, "P3\n");
+  // The row below will say the dimensions of the image: (width, height) <-> (num columns, num rows)
+  fprintf(file, "%i %i\n", image_width, image_height);
+  // Then we have a value for the maximum pixel color
+  fprintf(file, "255\n");
+  // Then we have all the lines with pixel data, it will be three values for each column j on a row i, corresponding
+  // to a pixel with index (i,j).
+  for(U32 i = 0; i < image_height; i += 1)
+  {
+    for(U32 j = 0; j < image_width; j +=1)
+    {
+      // We represent RGB values by floats internally and scale to integer values
+      U32 idx = i * image_width + j;
+      if(idx_buffer[idx] != 0) {
+        //LOG("idx %i, idxbuffer[idx] = %i \n", idx, idx_buffer[idx]);
+      }
+      F32 r = buffer[idx].x;
+      F32 g = buffer[idx].y;
+      F32 b = buffer[idx].z;
+
+      U32 ir = int(255.999f * r);
+      U32 ig = int(255.999f * g);
+      U32 ib = int(255.999f * b);
+
+      fprintf(file, "%i %i %i ", ir, ig, ib);
+    }
+    fprintf(file, "\n");
+  }
+
+  fclose(file);
+}
+
+
+
+__global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer) 
+{
+
+  U32 x = blockIdx.x * blockDim.x + threadIdx.x;
+  U32 y = blockIdx.y * blockDim.y + threadIdx.y;
+  U32 idx = y * image.width + x;
+
+  if(x >= image.width || y >= image.height) return;
+
+  {
+    Vec3F32 px_u = scale_V3F32((F32)x, viewport.pixel_delta_u);
+    Vec3F32 px_v = scale_V3F32((F32)y, viewport.pixel_delta_v);
+    Vec3F32 pixel_center = add_V3F32(viewport.pixel_origin, add_V3F32(px_u, px_v));
+    Vec3F32 ray_direction = sub_V3F32(pixel_center, camera.center);
+    RayF32 r = {0};
+    r.origin = camera.center;
+    r.direction = ray_direction;
+
+    F32 norm = norm_V3F32(r.direction);
+    Vec3F32 unit_dir = scale_V3F32(1.0f/norm, r.direction);
+    Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f);
+    Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f);
+
+    // Lerp between white and light blue depending on y position
+    F32 blend = 0.5f*(unit_dir.y + 1.0f);
+    pixelbuffer[idx] = lerp_V3F32(blend, white, light_blue);
+
+    //pixelbuffer[idx].x = (F32)x/(F32)image.width;
+    //pixelbuffer[idx].y = (F32)y/(F32)image.height;
+    //pixelbuffer[idx].z = 0.0f;
+
+    idxbuffer[idx] = idx;
+  }
+
+}
+
+
+int main() 
+{
+  cudaError_t cuErr;
+
+  // Define image, camera and viewport on the CPU 
+  // and then copy to constant globals on device
+  // -------------
+  ImageF32 h_image = {0};
+  h_image.width = IMAGE_WIDTH;
+  h_image.aspect_ratio = ASPECT_RATIO;
+  U32 height = U32((F32)h_image.width/h_image.aspect_ratio) + 1;
+  h_image.height = height < 1 ? 1 : height;
+  h_image.total_num_pixels = h_image.width * h_image.height;
+  cuErr = cudaMemcpyToSymbol(image, &h_image, sizeof(ImageF32), 0, cudaMemcpyHostToDevice);
+  CUDA_CHECK(cuErr);
+  LOG("Image size %i x %i, aspect ratio: %.4f \n", 
+      h_image.width, h_image.height, h_image.aspect_ratio);
+
+  // -------------
+  CameraF32 h_camera = {0};
+  h_camera.focal_length = 1.0f;
+  cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0, 
+                             cudaMemcpyHostToDevice);
+  CUDA_CHECK(cuErr);
+  
+  // -------------
+  ViewportF32 h_viewport = {0};
+  h_viewport.height = 2.0f;
+  h_viewport.width = h_viewport.height * ((F32)h_image.width/(F32)h_image.height);
+  h_viewport.aspect_ratio = h_viewport.width/h_viewport.height;
+  h_viewport.u = vec3F32(h_viewport.width, 0.0f, 0.0f);
+  h_viewport.v = vec3F32(0.0f, -h_viewport.height, 0.0f);
+  
+  F32 width_inverse = 1.0f/(F32)h_image.width;
+  F32 height_inverse = 1.0f/(F32)h_image.height;
+  h_viewport.pixel_delta_u = scale_V3F32(width_inverse, h_viewport.u);
+  h_viewport.pixel_delta_v = scale_V3F32(height_inverse, h_viewport.v);
+  
+  // upper_left = camera - vec3(0,0,focal_length) - viewport_u/2 - viewport_v/2
+  Vec3F32 viewport_upper_left = sub_V3F32(h_camera.center, 
+                                          vec3F32(0.0f, 0.0f, h_camera.focal_length));
+  viewport_upper_left = sub_V3F32(viewport_upper_left, scale_V3F32(0.5f, h_viewport.u));
+  viewport_upper_left = sub_V3F32(viewport_upper_left, scale_V3F32(0.5f, h_viewport.v));
+  h_viewport.upper_left = viewport_upper_left;
+  
+  // pixel_origin = upper_left + 0.5 * (delta u + delta v)
+  Vec3F32 pixel_delta_sum = add_V3F32(h_viewport.pixel_delta_u, h_viewport.pixel_delta_v);
+  h_viewport.pixel_origin = add_V3F32(viewport_upper_left, 
+                                      scale_V3F32(0.5f, pixel_delta_sum));
+
+  cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0, 
+                             cudaMemcpyHostToDevice);
+  CUDA_CHECK(cuErr);
+  LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n", 
+      h_viewport.width, h_viewport.height, h_viewport.aspect_ratio);
+
+  // Define grid, blocks, threads and pixel buffers
+  U32 num_pixels = h_image.total_num_pixels;
+  U64 pixel_buffer_size = num_pixels*sizeof(Vec3F32);
+
+  dim3 threads_per_block(16, 8);
+  dim3 blocks_per_grid(
+      (h_image.width + threads_per_block.x - 1) / threads_per_block.x,
+      (h_image.height + threads_per_block.y - 1) / threads_per_block.y
+      );
+
+  Vec3F32 *pixel_buffer = 0;
+  cuErr = cudaMalloc(&pixel_buffer, pixel_buffer_size);
+  CUDA_CHECK(cuErr);
+
+  U32 *idxbuffer = 0;
+  cuErr = cudaMalloc(&idxbuffer, sizeof(U32)*num_pixels);
+
+  
+  LOG("Launching main kernel with \n blocks per grid: (%i, %i, %i) \n",
+      blocks_per_grid.x, blocks_per_grid.y, blocks_per_grid.z);
+  LOG("threads per block: (%i, %i %i) \n", 
+      threads_per_block.x, threads_per_block.y, threads_per_block.z);
+
+  cuda_main<<<blocks_per_grid, threads_per_block>>>(pixel_buffer, idxbuffer);
+  cudaDeviceSynchronize();
+
+
+
+  Vec3F32 *h_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size);
+  cuErr = cudaMemcpy(h_pixel_buffer, pixel_buffer, pixel_buffer_size, cudaMemcpyDeviceToHost);
+  CUDA_CHECK(cuErr);
+
+  U32 *h_idxbuffer = (U32 *)malloc(num_pixels*sizeof(U32));
+  cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32), cudaMemcpyDeviceToHost);
+
+  write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, h_idxbuffer);
+ 
+  cuErr = cudaFree(pixel_buffer);
+  CUDA_CHECK(cuErr);
+  
+  return 0;
+}
+
+/**
+
+
+
+function void write_test_ppm(U32 image_width, U32 image_height) 
+{
+
+  const char *filename = "test_output.ppm";
+  FILE *file = fopen(filename, "w");
+  if(!file) 
+  {
+    LOG("Error opening file %s \n", filename);
+  }
+
+  // Write PPM header. First it has "P3" by itself to indicate ASCII colors,
+  fprintf(file, "P3\n");
+  // The row below will say the dimensions of the image: (width, height) <-> (num columns, num rows)
+  fprintf(file, "%i %i\n", image_width, image_height);
+  // Then we have a value for the maximum pixel color
+  fprintf(file, "255\n");
+  // Then we have all the lines with pixel data, it will be three values for each column j on a row i, corresponding
+  // to a pixel with index (i,j).
+  for(U32 i = 0; i < image_height; i += 1)
+  {
+    for(U32 j = 0; j < image_width; j +=1)
+    {
+      // We represent RGB values by floats internally and scale to integer values
+      F32 r = float(j) / (float(image_width)-1.0f);
+      F32 g = float(i) / (float(image_height)-1.0f);
+      F32 b = 0.0f;// - (float(j)/(float(image_width)-1.0f);
+
+      U32 ir = int(255.999f * r);
+      U32 ig = int(255.999f * g);
+      U32 ib = int(255.999f * b);
+
+      fprintf(file, "%i %i %i ", ir, ig, ib);
+    }
+    fprintf(file, "\n");
+  }
+
+  fclose(file);
+}

 __device__ void modify_array(U32 *arr, U32 N) 
 {
@ -50,8 +421,7 @@ __global__ void hello_from_device(U32 *arr, U32 N)



-
-int main() 
+int hello_cuda() 
 {

  LOG("Hello from cpu\n");
@ -102,3 +472,4 @@ int main()
  return 0;
 }

+*/
--- a/timeBuild.ctm
+++ b/timeBuild.ctm