From fe726e9f493f09733db06d377484423dc54d9722 Mon Sep 17 00:00:00 2001 From: Anton Ljungdahl Date: Fri, 25 Apr 2025 22:15:17 +0200 Subject: [PATCH] antialising working! --- src/main.cu | 257 +++++++++++++++++++++++++++++++------------------- timeBuild.ctm | Bin 1268 -> 2228 bytes 2 files changed, 159 insertions(+), 98 deletions(-) diff --git a/src/main.cu b/src/main.cu index 88c8c0b..ce3bccc 100644 --- a/src/main.cu +++ b/src/main.cu @@ -31,18 +31,19 @@ typedef float F32; #define F32_MAX FLT_MAX #define F32_MIN FLT_MIN -//~ test defines + +//------------------------------------------------------------------------------------------ +//~ Program parameter defines #define NUM_BLOCKS 1 #define NUM_THREADS 32 - #define IMAGE_WIDTH 1920 #define ASPECT_RATIO 1.7778f // 16/9 #define CURAND_SEED 1984 - #define MAX_NUM_ENTITIES 64 +#define SAMPLES_PER_PIXEL 32 //------------------------------------------------------------------------------------------ //~ structs @@ -101,6 +102,7 @@ struct CameraF32 Vec3F32 center; Vec3F32 up; F32 focal_length; + F32 pixel_sample_scale; }; typedef struct ImageF32 ImageF32; @@ -227,21 +229,22 @@ __device__ function F32 surrounds_RngF32(RngF32 rng, F32 val) return out; } -__device__ function F32 contains_RngF32(RngF32 rng, F32 val) -{ - F32 out = (rng.min <= val) && (val <= rng.max); - return out; -} - -__device__ function F32 size_RngF32(RngF32 rng) -{ - return rng.max-rng.min; -} +// +//__device__ function F32 contains_RngF32(RngF32 rng, F32 val) +//{ +// F32 out = (rng.min <= val) && (val <= rng.max); +// return out; +//} +// +//__device__ function F32 size_RngF32(RngF32 rng) +//{ +// return rng.max-rng.min; +//} +// __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, - U32 image_height, - U32 *idx_buffer) + U32 image_height) { const char *filename = "output.ppm"; @@ -267,9 +270,7 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, { // We represent RGB values by floats internally and scale to integer values U32 idx = i * image_width + j; - if(idx_buffer[idx] != 0) { - //LOG("idx %i, idxbuffer[idx] = %i \n", idx, idx_buffer[idx]); - } + F32 r = buffer[idx].r; F32 g = buffer[idx].g; F32 b = buffer[idx].b; @@ -286,9 +287,26 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, fclose(file); } +__device__ function F32 +clamp_F32(RngF32 rng, F32 val) +{ + F32 out = fmaxf(rng.min, val); + out = fminf(val, rng.max); + return out; +} -__device__ function HitRecord hit_sphere(Vec3F32 center, F32 radius, - RayF32 ray, RngF32 range) +__device__ function Vec3F32 +clamp_V3F32(RngF32 rng, Vec3F32 v) +{ + Vec3F32 out = {0}; + out.x = clamp_F32(rng, v.x); + out.y = clamp_F32(rng, v.y); + out.z = clamp_F32(rng, v.z); + return out; +} + +__device__ function HitRecord +hit_sphere(Vec3F32 center, F32 radius, RayF32 ray, RngF32 range) { HitRecord out = {0}; // We take the quadratic formula -b/2a +- sqrt(b*b-4ac) / 2a, @@ -369,9 +387,98 @@ __device__ function HitRecord hit_sphere(Vec3F32 center, F32 radius, return out; } +__device__ function RayF32 +ray_get_F32(F32 x, F32 y, Vec3F32 cam_center, curandState local_rand_state) +{ + + RayF32 out = {0}; + + // We have unit vectors delta_u and delta_v in the horizontal and vertical viewport directions. + Vec3F32 px_u = scale_V3F32(x, viewport.pixel_delta_u); + Vec3F32 px_v = scale_V3F32(y, viewport.pixel_delta_v); + Vec3F32 pixel_center = add_V3F32(viewport.pixel_origin, add_V3F32(px_u, px_v)); + // To get anti-aliasing we make a random offset from the pixel center + F32 rand_u = curand_uniform(&local_rand_state) - 0.5f; + F32 rand_v = curand_uniform(&local_rand_state) - 0.5f; + // the rand u and rand v are offsets from a pixel in the [-0.5, 0.5] square. + // We need to put that into the world space of our viewport + Vec3F32 offset_u = scale_V3F32(rand_u, viewport.pixel_delta_u); + Vec3F32 offset_v = scale_V3F32(rand_v, viewport.pixel_delta_v); -__global__ void cuda_main(Entity *entities, Vec3F32 *pixelbuffer, U32 *idxbuffer) + // Then we shift the pixel center with the offsets in both directions + Vec3F32 pixel_sample = add_V3F32(pixel_center, add_V3F32(offset_u, offset_v)); + // With a randomised point around the pixel center we can define the ray direction + // as the vector from the camera center to the point on the viewport. + Vec3F32 ray_direction = sub_V3F32(pixel_sample, camera.center); + + out.origin = camera.center; + out.direction = ray_direction; + return out; +} + +// Trace a ray and get a pixel color sample +__device__ function Vec3F32 +get_sample_color(RayF32 ray, Entity *entities) +{ + Vec3F32 out = {0}; + + RngF32 hit_range = {F32_MIN, F32_MAX}; + HitRecord hit_rec = {0}; + for(U32 entity_idx = 0; entity_idx < MAX_NUM_ENTITIES; entity_idx += 1) + { + Entity *entity = &entities[entity_idx]; + switch(entity->kind) + { + case EntityKind_Nil: + { + // no op + } break; + + case EntityKind_Sphere: + { + HitRecord temp_hit_rec = hit_sphere(entity->center, entity->radius, + ray, hit_range); + if(temp_hit_rec.hit) + { + hit_rec = temp_hit_rec; + hit_range.max = hit_rec.t; + } + + } break; + } // end switch entity kind + + } + + Vec3F32 sample_pixel_color = vec3F32(0.0f, 0.0f, 0.0f); + if(hit_rec.hit) + { + // Paint entity + sample_pixel_color = add_V3F32(hit_rec.normal, vec3F32(1.0f, 1.0f, 1.0f)); + sample_pixel_color = scale_V3F32(0.5f, sample_pixel_color); + // debug + //sample_pixel_color = vec3F32(1.0f, 0.0f, 0.0f); + } + else + { + // Paint background gradient + F32 norm = norm_V3F32(ray.direction); + Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction); + Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f); + Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f); + + // Lerp between white and light blue depending on y position + F32 blend = 0.5f*(unit_dir.y + 1.0f); + + sample_pixel_color = lerp_V3F32(blend, white, light_blue); + } + + out = sample_pixel_color; + return out; +} + + __global__ void +cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state) { U32 x = blockIdx.x * blockDim.x + threadIdx.x; @@ -380,72 +487,30 @@ __global__ void cuda_main(Entity *entities, Vec3F32 *pixelbuffer, U32 *idxbuffer if(x < image.width && y < image.height) { - Vec3F32 px_u = scale_V3F32((F32)x, viewport.pixel_delta_u); - Vec3F32 px_v = scale_V3F32((F32)y, viewport.pixel_delta_v); - Vec3F32 pixel_center = add_V3F32(viewport.pixel_origin, add_V3F32(px_u, px_v)); - // TODO(anton): Maybe we dont need some ray structure here.. - Vec3F32 ray_direction = sub_V3F32(pixel_center, camera.center); - RayF32 ray = {0}; - ray.origin = camera.center; - ray.direction = ray_direction; - - - RngF32 hit_range = {F32_MIN, F32_MAX}; - HitRecord hit_rec = {0}; - for(U32 entity_idx = 0; entity_idx < MAX_NUM_ENTITIES; entity_idx += 1) + // We are adding all samples and then dividing by num samples to get the mean, so + // we initialise the color for this pixel to black. + // Loop over all pixel samples + Vec3F32 pixel_color = vec3F32(0.0f, 0.0f, 0.0f); + for(U32 sample_idx = 0; sample_idx < SAMPLES_PER_PIXEL; sample_idx += 1) { - Entity *entity = &entities[entity_idx]; - switch(entity->kind) - { - case EntityKind_Nil: - { - // no op - } break; - case EntityKind_Sphere: - { - HitRecord temp_hit_rec = hit_sphere(entity->center, entity->radius, - ray, hit_range); - if(temp_hit_rec.hit) - { - hit_rec = temp_hit_rec; - hit_range.max = hit_rec.t; - } + // TODO(anton): Maybe we can randomise things directly here as the + // nvidia accelerated version, where we just put the x, y indices with a + // randomised shift and normalise to viewport space by dividing by max x, max y + RayF32 ray = ray_get_F32((F32)x, (F32)y, camera.center, rand_state[idx]); + + Vec3F32 sample_pixel_color = get_sample_color(ray, entities); - } break; - } // end switch entity kind - - } - - Vec3F32 pixel_color = {0.0f, 0.0f, 0.0f}; - if(hit_rec.hit) - { - // Paint entity - pixel_color = add_V3F32(hit_rec.normal, vec3F32(1.0f, 1.0f, 1.0f)); - pixel_color = scale_V3F32(0.5f, pixel_color); - } - else - { - // Paint background gradient - F32 norm = norm_V3F32(ray.direction); - Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction); - Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f); - Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f); - - // Lerp between white and light blue depending on y position - F32 blend = 0.5f*(unit_dir.y + 1.0f); - - pixel_color = lerp_V3F32(blend, white, light_blue); + F32 debug_sample = curand_uniform(&rand_state[idx]); + Vec3F32 debug = vec3F32(debug_sample, debug_sample, debug_sample); + //pixel_color = add_V3F32(pixel_color, debug); + pixel_color = add_V3F32(pixel_color, sample_pixel_color); } - pixelbuffer[idx] = pixel_color; - - //pixelbuffer[idx].x = (F32)x/(F32)image.width; - //pixelbuffer[idx].y = (F32)y/(F32)image.height; - //pixelbuffer[idx].z = 0.0f; - - idxbuffer[idx] = idx; + pixel_color = scale_V3F32(1.0f/(F32)SAMPLES_PER_PIXEL, pixel_color); + RngF32 clamp_range = {0.0f, 1.0f}; + pixelbuffer[idx] = clamp_V3F32(clamp_range, pixel_color); } } @@ -488,6 +553,9 @@ int main() // ------------- CameraF32 h_camera = {0}; h_camera.focal_length = 1.0f; + F32 samples_per_pixel = (F32)SAMPLES_PER_PIXEL; + h_camera.pixel_sample_scale = 1.0f/samples_per_pixel; + cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0, cudaMemcpyHostToDevice); CUDA_CHECK(cuErr); @@ -515,7 +583,7 @@ int main() // pixel_origin = upper_left + 0.5 * (delta u + delta v) Vec3F32 pixel_delta_sum = add_V3F32(h_viewport.pixel_delta_u, h_viewport.pixel_delta_v); h_viewport.pixel_origin = add_V3F32(viewport_upper_left, - scale_V3F32(0.5f, pixel_delta_sum)); + scale_V3F32(0.5f, pixel_delta_sum)); cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0, cudaMemcpyHostToDevice); @@ -571,22 +639,16 @@ int main() cuErr = cudaMalloc(&pixel_buffer, pixel_buffer_size); CUDA_CHECK(cuErr); - // This is just a debug buffer, TODO(anton): remove - U32 *idxbuffer = 0; - cuErr = cudaMalloc(&idxbuffer, sizeof(U32)*num_pixels); + curandState *rand_state = 0; + cuErr = cudaMalloc(&rand_state, num_pixels*sizeof(curandState)); CUDA_CHECK(cuErr); - curandState *d_rand_state = 0; - cuErr = cudaMalloc(&d_rand_state, num_pixels*sizeof(curandState)); - CUDA_CHECK(cuErr); - - ////////////////////////////////////////////////////////////////////////////////////////// // Initialise CUDA state such as random number states per thread. // This is separate for performance measurements // ------------ - cuda_init_state<<>>(d_rand_state); + cuda_init_state<<>>(rand_state); cuErr = cudaGetLastError(); CUDA_CHECK(cuErr); cuErr = cudaDeviceSynchronize(); @@ -602,7 +664,7 @@ int main() LOG("threads per block: (%i, %i %i) \n", threads_per_block.x, threads_per_block.y, threads_per_block.z); - cuda_main<<>>(entities, pixel_buffer, idxbuffer); + cuda_main<<>>(entities, pixel_buffer, rand_state); cuErr = cudaGetLastError(); CUDA_CHECK(cuErr); cuErr = cudaDeviceSynchronize(); @@ -617,15 +679,14 @@ int main() cudaMemcpyDeviceToHost); CUDA_CHECK(cuErr); - // TODO(anton): remove debug buffer - U32 *h_idxbuffer = (U32 *)malloc(num_pixels*sizeof(U32)); - cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32), - cudaMemcpyDeviceToHost); - - write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, h_idxbuffer); + write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height); cuErr = cudaFree(pixel_buffer); CUDA_CHECK(cuErr); + cuErr = cudaFree(entities); + CUDA_CHECK(cuErr); + cuErr = cudaFree(rand_state); + CUDA_CHECK(cuErr); return 0; } diff --git a/timeBuild.ctm b/timeBuild.ctm index a58225bb5bbf00a5a1f80e30d98c73d82bd01449..521cfba5c6f63d073b537242589e73b04e1dc9ae 100644 GIT binary patch delta 975 zcmYk)Ur1AN6bJBQGxJ`})K>W?(}|ct)@T+H!jf=%DN7h=wAvK+5GZOQH5lHaC<*Ek z+ruDm21R23_8>|SPKlsPU|_4jy_6v%VG?t#6+=5`{SNu~Ip5#;{m$?HZbF(U3MR4@ zYjUzjnv7&+0e~+D;PR6{r)6Q|W&rQJ)K1o69+Lrdhb=*NEBc%F+*@9M9M|`H{0p4- zVeWiYQ|dxr0~m0FxNXzf0?`o22Z+|r46^5NgSGW=jQt#~ zjSY`%6YEvP}=6rM(#9)*vwX`pUbZyQ^c6@c|%<>LPWPP}D&ph#>$2iT;VDEkFE z{q8`h7;_rffdkiHa=&5EX9K(pc{KI}df0sGJ^MBKny);9LoT$i9khcS@r>M6RGK`){4t$)>`| zap@u#sG-qiOOfe@^be@HN7I~FV}3UluihncGv>)YLmin0Sd{?kx_q5%d}An>x*lND zKX5R0dq!BvOgv(^_lXzK0Qv`pB{qGL{P@iuYq<(Ef6<0S$D2I;64#=-Ka%r))YPz0jnw z=>UKC=#AZ?-<=0=VfjjuO}TxgxUhn9t9bL}k88D=0-6r=mehaVXoq&lyhm(EPi7pn IG_!T+zaIbuJOBUy delta 7 OcmdlY_=R)B7Zv~xJp*e1