diff --git a/src/main.cu b/src/main.cu index 3de8a4a..88c8c0b 100644 --- a/src/main.cu +++ b/src/main.cu @@ -1,12 +1,13 @@ #include #include +#include #include //------------------------------------------------------------------------------------------ //~ base defines -#define global static +#define host_global static #define function static //~ typedefs @@ -27,6 +28,8 @@ typedef float F32; #define LOG printf +#define F32_MAX FLT_MAX +#define F32_MIN FLT_MIN //~ test defines #define NUM_BLOCKS 1 @@ -38,6 +41,9 @@ typedef float F32; #define CURAND_SEED 1984 + +#define MAX_NUM_ENTITIES 64 + //------------------------------------------------------------------------------------------ //~ structs @@ -59,6 +65,12 @@ union Vec3F32 F32 v[3]; }; +typedef struct RngF32 RngF32; +struct RngF32 +{ + F32 min; + F32 max; +}; typedef struct RayF32 RayF32; struct RayF32 @@ -100,9 +112,34 @@ struct ImageF32 U32 total_num_pixels; }; +enum EntityKind +{ + EntityKind_Nil, + EntityKind_Sphere, + Num_EntityKinds +}; + +typedef struct HitRecord HitRecord; +struct HitRecord +{ + Vec3F32 point; + Vec3F32 normal; + F32 t; // Root parameter for hit sphere + F32 hit; // Hit true or false + F32 front_face; +}; + +typedef struct Entity Entity; +struct Entity +{ + EntityKind kind; + Vec3F32 center; + F32 radius; +}; + //------------------------------------------------------------------------------------------ //~ host globals - +host_global Entity nil_entity = {EntityKind_Nil, {0.0f, 0.0f, 0.0f}, 0.0f}; //~ device globals __constant__ CameraF32 camera; @@ -184,6 +221,22 @@ __device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b) return lerp_result; } +__device__ function F32 surrounds_RngF32(RngF32 rng, F32 val) +{ + F32 out = (rng.min < val) && (val < rng.max); + return out; +} + +__device__ function F32 contains_RngF32(RngF32 rng, F32 val) +{ + F32 out = (rng.min <= val) && (val <= rng.max); + return out; +} + +__device__ function F32 size_RngF32(RngF32 rng) +{ + return rng.max-rng.min; +} __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, @@ -233,8 +286,11 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, fclose(file); } -__device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r) + +__device__ function HitRecord hit_sphere(Vec3F32 center, F32 radius, + RayF32 ray, RngF32 range) { + HitRecord out = {0}; // We take the quadratic formula -b/2a +- sqrt(b*b-4ac) / 2a, // and we calculate only the sqrt part. If there is a hit with the sphere we either // have two solutions (positive sqrt), one solution (zero sqrt) @@ -250,11 +306,11 @@ __device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r) // Compare lines with RTIOW // (C-Q) - Vec3F32 oc = sub_V3F32(center, r.origin); + Vec3F32 oc = sub_V3F32(center, ray.origin); // a = D.D - F32 a = dot_V3F32(r.direction, r.direction); + F32 a = dot_V3F32(ray.direction, ray.direction); // h = D . (C-Q) - F32 h = dot_V3F32(r.direction, oc); + F32 h = dot_V3F32(ray.direction, oc); // c = (C-Q) . (C-Q) - r*r F32 c = dot_V3F32(oc, oc) - radius*radius; @@ -264,24 +320,58 @@ __device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r) // intersects the sphere. This is the quadratic problem we get by solving for t in // (C - P(t)) . (C - P(t)) = r*r, r being the radius and P(t) = tD+Q, // where D is the direction of the ray and Q the origin of the ray. - F32 out = 0.0f; + F32 hit_true = 0.0f; + + // Branching version + // TODO(anton): Maybe try to make a branchless version + F32 root = 0.0f; if(discriminant < 0.0f) { - out = -1.0f; - } + hit_true = 0.0f; + } else { // t = (h += sqrt(h*h-ac))/a, and here we take the smallest solution to get the point // on the sphere closest to the ray origin. - out = (h - __fsqrt_rn(discriminant))/a; + F32 sqrtd = __fsqrt_rn(discriminant); + root = (h - sqrtd)/a; + if(!surrounds_RngF32(range, root)) + { + root = (h + sqrtd)/a; + if(!surrounds_RngF32(range, root)) + { + hit_true = 0.0f; + } + else + { + hit_true = 1.0f; + } + } + else + { + hit_true = 1.0f; + } } + out.hit = hit_true; + out.t = root; + + // t is the parameter of the (closest) sphere-ray intersection point P(t) = tD+Q, + // where Q is the ray origin and D the ray direction. + out.point = ray_point_F32(out.t, ray); // intersection point + Vec3F32 N = sub_V3F32(out.point, center); + N = scale_V3F32(1.0f/radius, N); + + F32 front_face = dot_V3F32(ray.direction, N) < 0.0f; + out.normal = front_face ? N : scale_V3F32(-1.0f, N); + out.front_face = front_face; + return out; } -__global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer) +__global__ void cuda_main(Entity *entities, Vec3F32 *pixelbuffer, U32 *idxbuffer) { U32 x = blockIdx.x * blockDim.x + threadIdx.x; @@ -293,37 +383,59 @@ __global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer) Vec3F32 px_u = scale_V3F32((F32)x, viewport.pixel_delta_u); Vec3F32 px_v = scale_V3F32((F32)y, viewport.pixel_delta_v); Vec3F32 pixel_center = add_V3F32(viewport.pixel_origin, add_V3F32(px_u, px_v)); - + // TODO(anton): Maybe we dont need some ray structure here.. Vec3F32 ray_direction = sub_V3F32(pixel_center, camera.center); - RayF32 r = {0}; - r.origin = camera.center; - r.direction = ray_direction; + RayF32 ray = {0}; + ray.origin = camera.center; + ray.direction = ray_direction; - F32 norm = norm_V3F32(r.direction); - Vec3F32 unit_dir = scale_V3F32(1.0f/norm, r.direction); - Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f); - Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f); - // Lerp between white and light blue depending on y position - F32 blend = 0.5f*(unit_dir.y + 1.0f); - Vec3F32 pixel_color = {0}; - - Vec3F32 sphere_center = vec3F32(0.0f, 0.0f, -1.0f); - F32 sphere_radius = 0.5f; - - // t is the parameter of the (closest) sphere-ray intersection point P(t) = tD+Q, - // where Q is the ray origin and D the ray direction. - F32 t = hit_sphere(sphere_center, sphere_radius, r); - if(t > 0.0f) + RngF32 hit_range = {F32_MIN, F32_MAX}; + HitRecord hit_rec = {0}; + for(U32 entity_idx = 0; entity_idx < MAX_NUM_ENTITIES; entity_idx += 1) { - Vec3F32 intersection_point = ray_point_F32(t, r); - Vec3F32 N = sub_V3F32(intersection_point, sphere_center); - N = scale_V3F32(1.0f/sphere_radius, N); - pixel_color = scale_V3F32(0.5f, add_V3F32(N, vec3F32(1.0f, 1.0f, 1.0f))); + Entity *entity = &entities[entity_idx]; + switch(entity->kind) + { + case EntityKind_Nil: + { + // no op + } break; + + case EntityKind_Sphere: + { + HitRecord temp_hit_rec = hit_sphere(entity->center, entity->radius, + ray, hit_range); + if(temp_hit_rec.hit) + { + hit_rec = temp_hit_rec; + hit_range.max = hit_rec.t; + } + + } break; + } // end switch entity kind + } - else + + Vec3F32 pixel_color = {0.0f, 0.0f, 0.0f}; + if(hit_rec.hit) { + // Paint entity + pixel_color = add_V3F32(hit_rec.normal, vec3F32(1.0f, 1.0f, 1.0f)); + pixel_color = scale_V3F32(0.5f, pixel_color); + } + else + { + // Paint background gradient + F32 norm = norm_V3F32(ray.direction); + Vec3F32 unit_dir = scale_V3F32(1.0f/norm, ray.direction); + Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f); + Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f); + + // Lerp between white and light blue depending on y position + F32 blend = 0.5f*(unit_dir.y + 1.0f); + pixel_color = lerp_V3F32(blend, white, light_blue); } @@ -338,7 +450,7 @@ __global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer) } -__global__ function void cuda_init_state(curandState *rand_state) +__global__ void cuda_init_state(curandState *rand_state) { U32 x = threadIdx.x + blockIdx.x * blockDim.x; @@ -349,7 +461,7 @@ __global__ function void cuda_init_state(curandState *rand_state) U32 idx = y * image.width + x; curand_init(CURAND_SEED, idx, 0, &rand_state[idx]); } - + } //------------------------------------------------------------------------------------------ @@ -377,9 +489,9 @@ int main() CameraF32 h_camera = {0}; h_camera.focal_length = 1.0f; cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0, - cudaMemcpyHostToDevice); + cudaMemcpyHostToDevice); CUDA_CHECK(cuErr); - + // ------------- ViewportF32 h_viewport = {0}; h_viewport.height = 2.0f; @@ -387,30 +499,62 @@ int main() h_viewport.aspect_ratio = h_viewport.width/h_viewport.height; h_viewport.u = vec3F32(h_viewport.width, 0.0f, 0.0f); h_viewport.v = vec3F32(0.0f, -h_viewport.height, 0.0f); - + F32 width_inverse = 1.0f/(F32)h_image.width; F32 height_inverse = 1.0f/(F32)h_image.height; h_viewport.pixel_delta_u = scale_V3F32(width_inverse, h_viewport.u); h_viewport.pixel_delta_v = scale_V3F32(height_inverse, h_viewport.v); - + // upper_left = camera - vec3(0,0,focal_length) - viewport_u/2 - viewport_v/2 Vec3F32 viewport_upper_left = sub_V3F32(h_camera.center, - vec3F32(0.0f, 0.0f, h_camera.focal_length)); + vec3F32(0.0f, 0.0f, h_camera.focal_length)); viewport_upper_left = sub_V3F32(viewport_upper_left, scale_V3F32(0.5f, h_viewport.u)); viewport_upper_left = sub_V3F32(viewport_upper_left, scale_V3F32(0.5f, h_viewport.v)); h_viewport.upper_left = viewport_upper_left; - + // pixel_origin = upper_left + 0.5 * (delta u + delta v) Vec3F32 pixel_delta_sum = add_V3F32(h_viewport.pixel_delta_u, h_viewport.pixel_delta_v); h_viewport.pixel_origin = add_V3F32(viewport_upper_left, - scale_V3F32(0.5f, pixel_delta_sum)); + scale_V3F32(0.5f, pixel_delta_sum)); cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0, - cudaMemcpyHostToDevice); + cudaMemcpyHostToDevice); CUDA_CHECK(cuErr); LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n", h_viewport.width, h_viewport.height, h_viewport.aspect_ratio); + + ////////////////////////////////////////////////////////////////////////////////////////// + // Setup entities and copy to device + U64 entity_list_size = sizeof(Entity)*MAX_NUM_ENTITIES; + Entity *h_entities = (Entity *)malloc(entity_list_size); + for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1) + { + // Init all entities to nil + //h_entities[i] = {0}; + //h_entities[i].kind = EntityKind_Nil; + h_entities[i] = nil_entity; + } + + // Manual spheres + { + h_entities[0].kind = EntityKind_Sphere; + h_entities[0].center = vec3F32(0.0f, 0.0f, -1.0f); + h_entities[0].radius = 0.5f; + + h_entities[1].kind = EntityKind_Sphere; + h_entities[1].center = vec3F32(0.0f, -100.5f, -1.0f); + h_entities[1].radius = 100.0f; + } + + // Copy to device + Entity *entities = 0; + cuErr = cudaMalloc(&entities, entity_list_size); + CUDA_CHECK(cuErr); + cuErr = cudaMemcpy(entities, h_entities, entity_list_size, cudaMemcpyHostToDevice); + CUDA_CHECK(cuErr); + + ////////////////////////////////////////////////////////////////////////////////////////// // Define grid, blocks, threads and any buffers such as pixel data and random state // ------------ @@ -436,6 +580,8 @@ int main() cuErr = cudaMalloc(&d_rand_state, num_pixels*sizeof(curandState)); CUDA_CHECK(cuErr); + + ////////////////////////////////////////////////////////////////////////////////////////// // Initialise CUDA state such as random number states per thread. // This is separate for performance measurements @@ -456,7 +602,7 @@ int main() LOG("threads per block: (%i, %i %i) \n", threads_per_block.x, threads_per_block.y, threads_per_block.z); - cuda_main<<>>(pixel_buffer, idxbuffer); + cuda_main<<>>(entities, pixel_buffer, idxbuffer); cuErr = cudaGetLastError(); CUDA_CHECK(cuErr); cuErr = cudaDeviceSynchronize(); @@ -468,19 +614,19 @@ int main() // ------------ Vec3F32 *h_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size); cuErr = cudaMemcpy(h_pixel_buffer, pixel_buffer, pixel_buffer_size, - cudaMemcpyDeviceToHost); + cudaMemcpyDeviceToHost); CUDA_CHECK(cuErr); // TODO(anton): remove debug buffer U32 *h_idxbuffer = (U32 *)malloc(num_pixels*sizeof(U32)); cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32), - cudaMemcpyDeviceToHost); + cudaMemcpyDeviceToHost); write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, h_idxbuffer); - + cuErr = cudaFree(pixel_buffer); CUDA_CHECK(cuErr); - + return 0; } diff --git a/timeBuild.ctm b/timeBuild.ctm index 13b36a6..a58225b 100644 Binary files a/timeBuild.ctm and b/timeBuild.ctm differ