From fbf9421843afc69750ae4b2534026bc962e0d7d1 Mon Sep 17 00:00:00 2001 From: Anton Ljungdahl Date: Wed, 23 Apr 2025 22:57:31 +0200 Subject: [PATCH] random numbers --- src/main.cu | 236 +++++++++++++++++++------------------------------- timeBuild.ctm | Bin 1124 -> 1252 bytes 2 files changed, 89 insertions(+), 147 deletions(-) diff --git a/src/main.cu b/src/main.cu index 0e9c1bc..3de8a4a 100644 --- a/src/main.cu +++ b/src/main.cu @@ -1,6 +1,9 @@ #include #include +#include + +//------------------------------------------------------------------------------------------ //~ base defines #define global static @@ -12,9 +15,6 @@ typedef uint32_t U32; typedef uint64_t U64; typedef float F32; - - - //~ utility defines #define CUDA_CHECK(err) do { \ @@ -36,7 +36,9 @@ typedef float F32; #define IMAGE_WIDTH 1920 #define ASPECT_RATIO 1.7778f // 16/9 -//------------------------------------------------------------------------------ +#define CURAND_SEED 1984 + +//------------------------------------------------------------------------------------------ //~ structs typedef union Vec3F32 Vec3F32; @@ -98,7 +100,7 @@ struct ImageF32 U32 total_num_pixels; }; -//------------------------------------------------------------------------------ +//------------------------------------------------------------------------------------------ //~ host globals //~ device globals @@ -107,7 +109,7 @@ __constant__ CameraF32 camera; __constant__ ViewportF32 viewport; __constant__ ImageF32 image; -//------------------------------------------------------------------------------ +//------------------------------------------------------------------------------------------ //~ routines @@ -183,7 +185,10 @@ __device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b) } -__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32 image_height, U32 *idx_buffer) +__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, + U32 image_width, + U32 image_height, + U32 *idx_buffer) { const char *filename = "output.ppm"; @@ -195,12 +200,14 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32 // Write PPM header. First it has "P3" by itself to indicate ASCII colors, fprintf(file, "P3\n"); - // The row below will say the dimensions of the image: (width, height) <-> (num columns, num rows) + // The row below will say the dimensions of the image: + // (width, height) <-> (num columns, num rows) fprintf(file, "%i %i\n", image_width, image_height); // Then we have a value for the maximum pixel color fprintf(file, "255\n"); - // Then we have all the lines with pixel data, it will be three values for each column j on a row i, corresponding - // to a pixel with index (i,j). + // Then we have all the lines with pixel data, + // it will be three values for each column j on a row i, + // corresponding to a pixel with index (i,j). for(U32 i = 0; i < image_height; i += 1) { for(U32 j = 0; j < image_width; j +=1) @@ -228,22 +235,30 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32 __device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r) { - // We take the quadratic formula -b +- sqrt(b*b-4ac) / 2a, + // We take the quadratic formula -b/2a +- sqrt(b*b-4ac) / 2a, // and we calculate only the sqrt part. If there is a hit with the sphere we either - // have two solutions (positive sqrt), one solution (zero sqrt) or no solution (negative sqrt). - // If we have no solution we have no hit on the sphere centered at center, with the given radius. + // have two solutions (positive sqrt), one solution (zero sqrt) + // or no solution (negative sqrt). + // If we have no solution we have no hit on + // the sphere centered at center, with the given radius. + + // Note that we can simplify this, since we always get b = -2(D . (C-Q)), and if + // we say b = -2h in the quadradic formula, we get + // -(-2h)/2a +- sqrt((-2h)**2 - 4ac) / 2a which expands to + // 2h/2a +- 2sqrt(h*h - ac)/2a, simplifying to (h +- sqrt(h*h - ac))/a. + // So we use this simplification to optimise away some operations // Compare lines with RTIOW // (C-Q) Vec3F32 oc = sub_V3F32(center, r.origin); // a = D.D F32 a = dot_V3F32(r.direction, r.direction); - // b = -2D . (C-Q) - F32 b = dot_V3F32(scale_V3F32(-2.0f, r.direction), oc); + // h = D . (C-Q) + F32 h = dot_V3F32(r.direction, oc); // c = (C-Q) . (C-Q) - r*r F32 c = dot_V3F32(oc, oc) - radius*radius; - F32 discriminant = b*b - 4*a*c; + F32 discriminant = h*h - a*c; // We are actually solving for the parameter t in the expression of a point P(t) that // intersects the sphere. This is the quadratic problem we get by solving for t in @@ -256,9 +271,9 @@ __device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r) } else { - // t = (-b += sqrt(b*b-4ac))/2a, and here we take the smallest solution to get the point + // t = (h += sqrt(h*h-ac))/a, and here we take the smallest solution to get the point // on the sphere closest to the ray origin. - out = (-b - __fsqrt_rn(discriminant))/(2*a); + out = (h - __fsqrt_rn(discriminant))/a; } return out; @@ -273,8 +288,7 @@ __global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer) U32 y = blockIdx.y * blockDim.y + threadIdx.y; U32 idx = y * image.width + x; - if(x >= image.width || y >= image.height) return; - + if(x < image.width && y < image.height) { Vec3F32 px_u = scale_V3F32((F32)x, viewport.pixel_delta_u); Vec3F32 px_v = scale_V3F32((F32)y, viewport.pixel_delta_v); @@ -324,11 +338,27 @@ __global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer) } +__global__ function void cuda_init_state(curandState *rand_state) +{ + U32 x = threadIdx.x + blockIdx.x * blockDim.x; + U32 y = threadIdx.y + blockIdx.y * blockDim.y; + + if(x < image.width && y < image.height) + { + U32 idx = y * image.width + x; + curand_init(CURAND_SEED, idx, 0, &rand_state[idx]); + } + +} + +//------------------------------------------------------------------------------------------ +//~ Main int main() { cudaError_t cuErr; + ////////////////////////////////////////////////////////////////////////////////////////// // Define image, camera and viewport on the CPU // and then copy to constant globals on device // ------------- @@ -381,7 +411,9 @@ int main() LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n", h_viewport.width, h_viewport.height, h_viewport.aspect_ratio); - // Define grid, blocks, threads and pixel buffers + ////////////////////////////////////////////////////////////////////////////////////////// + // Define grid, blocks, threads and any buffers such as pixel data and random state + // ------------ U32 num_pixels = h_image.total_num_pixels; U64 pixel_buffer_size = num_pixels*sizeof(Vec3F32); @@ -395,26 +427,54 @@ int main() cuErr = cudaMalloc(&pixel_buffer, pixel_buffer_size); CUDA_CHECK(cuErr); + // This is just a debug buffer, TODO(anton): remove U32 *idxbuffer = 0; cuErr = cudaMalloc(&idxbuffer, sizeof(U32)*num_pixels); + CUDA_CHECK(cuErr); + + curandState *d_rand_state = 0; + cuErr = cudaMalloc(&d_rand_state, num_pixels*sizeof(curandState)); + CUDA_CHECK(cuErr); + + ////////////////////////////////////////////////////////////////////////////////////////// + // Initialise CUDA state such as random number states per thread. + // This is separate for performance measurements + // ------------ + cuda_init_state<<>>(d_rand_state); + cuErr = cudaGetLastError(); + CUDA_CHECK(cuErr); + cuErr = cudaDeviceSynchronize(); + CUDA_CHECK(cuErr); + + ////////////////////////////////////////////////////////////////////////////////////////// + // Launch the main CUDA kernel, each thread will color a pixel and store it + // in the pixel buffer. + // ------------ - LOG("Launching main kernel with \n blocks per grid: (%i, %i, %i) \n", blocks_per_grid.x, blocks_per_grid.y, blocks_per_grid.z); LOG("threads per block: (%i, %i %i) \n", threads_per_block.x, threads_per_block.y, threads_per_block.z); cuda_main<<>>(pixel_buffer, idxbuffer); - cudaDeviceSynchronize(); - - - - Vec3F32 *h_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size); - cuErr = cudaMemcpy(h_pixel_buffer, pixel_buffer, pixel_buffer_size, cudaMemcpyDeviceToHost); + cuErr = cudaGetLastError(); + CUDA_CHECK(cuErr); + cuErr = cudaDeviceSynchronize(); CUDA_CHECK(cuErr); + + ////////////////////////////////////////////////////////////////////////////////////////// + // Copy the pixel buffer back from the device and write it to an image file. + // ------------ + Vec3F32 *h_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size); + cuErr = cudaMemcpy(h_pixel_buffer, pixel_buffer, pixel_buffer_size, + cudaMemcpyDeviceToHost); + CUDA_CHECK(cuErr); + + // TODO(anton): remove debug buffer U32 *h_idxbuffer = (U32 *)malloc(num_pixels*sizeof(U32)); - cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32), cudaMemcpyDeviceToHost); + cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32), + cudaMemcpyDeviceToHost); write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, h_idxbuffer); @@ -424,121 +484,3 @@ int main() return 0; } -/** - - - -function void write_test_ppm(U32 image_width, U32 image_height) -{ - - const char *filename = "test_output.ppm"; - FILE *file = fopen(filename, "w"); - if(!file) - { - LOG("Error opening file %s \n", filename); - } - - // Write PPM header. First it has "P3" by itself to indicate ASCII colors, - fprintf(file, "P3\n"); - // The row below will say the dimensions of the image: (width, height) <-> (num columns, num rows) - fprintf(file, "%i %i\n", image_width, image_height); - // Then we have a value for the maximum pixel color - fprintf(file, "255\n"); - // Then we have all the lines with pixel data, it will be three values for each column j on a row i, corresponding - // to a pixel with index (i,j). - for(U32 i = 0; i < image_height; i += 1) - { - for(U32 j = 0; j < image_width; j +=1) - { - // We represent RGB values by floats internally and scale to integer values - F32 r = float(j) / (float(image_width)-1.0f); - F32 g = float(i) / (float(image_height)-1.0f); - F32 b = 0.0f;// - (float(j)/(float(image_width)-1.0f); - - U32 ir = int(255.999f * r); - U32 ig = int(255.999f * g); - U32 ib = int(255.999f * b); - - fprintf(file, "%i %i %i ", ir, ig, ib); - } - fprintf(file, "\n"); - } - - fclose(file); -} - -__device__ void modify_array(U32 *arr, U32 N) -{ - U32 i = threadIdx.x; - if(i < N) - { - arr[i] = i; - } - - return; -} - - -__global__ void hello_from_device(U32 *arr, U32 N) -{ - - modify_array(arr, N); - - - return; -} - - - -int hello_cuda() -{ - - LOG("Hello from cpu\n"); - - - U32 N = NUM_THREADS; - U32 arr_size = N * sizeof(U32); - - - U32 *arr = (U32 *)malloc(arr_size); - memset(arr, 0, arr_size); - - cudaError_t cuErr; - U32 *d_arr = 0; - - - - cuErr = cudaMalloc(&d_arr, arr_size); - CUDA_CHECK(cuErr); - - cuErr = cudaMemcpy(d_arr, arr, arr_size, cudaMemcpyHostToDevice); - CUDA_CHECK(cuErr); - - LOG("Array before CUDA \n"); - for(U32 i = 0; i < N; i += 1) - { - LOG("%i: %i \n", i, arr[i]); - } - LOG("\n"); - - - hello_from_device<<>>(d_arr, N); - - cuErr = cudaMemcpy(arr, d_arr, arr_size, cudaMemcpyDeviceToHost); - CUDA_CHECK(cuErr); - - cuErr = cudaFree(d_arr); - CUDA_CHECK(cuErr); - - LOG("Array after CUDA \n"); - for(U32 i = 0; i < N; i += 1) - { - LOG("%i \n", arr[i]); - } - LOG("\n"); - - - return 0; -} - -*/ diff --git a/timeBuild.ctm b/timeBuild.ctm index 20697c4b1c19a07a1849d6bd59a92dc4b798b5ed..13b36a67cc0b1ad03bad977818f8f67166b21f2b 100644 GIT binary patch delta 137 zcmaFD@q}|j3QIl1{j;~rw%lf9W?*2r&(6T`ZS%eZ)lmKgAm8LYvqT+`&j^&C!ot9? w^`)yKjK3Mkcli)&4C903jpwTsBlFKT$qZq