random numbers
This commit is contained in:
parent
315c1fee82
commit
fbf9421843
236
src/main.cu
236
src/main.cu
@ -1,6 +1,9 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include <curand_kernel.h>
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------------------
|
||||||
//~ base defines
|
//~ base defines
|
||||||
|
|
||||||
#define global static
|
#define global static
|
||||||
@ -12,9 +15,6 @@ typedef uint32_t U32;
|
|||||||
typedef uint64_t U64;
|
typedef uint64_t U64;
|
||||||
typedef float F32;
|
typedef float F32;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//~ utility defines
|
//~ utility defines
|
||||||
|
|
||||||
#define CUDA_CHECK(err) do { \
|
#define CUDA_CHECK(err) do { \
|
||||||
@ -36,7 +36,9 @@ typedef float F32;
|
|||||||
#define IMAGE_WIDTH 1920
|
#define IMAGE_WIDTH 1920
|
||||||
#define ASPECT_RATIO 1.7778f // 16/9
|
#define ASPECT_RATIO 1.7778f // 16/9
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
#define CURAND_SEED 1984
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------------------
|
||||||
//~ structs
|
//~ structs
|
||||||
|
|
||||||
typedef union Vec3F32 Vec3F32;
|
typedef union Vec3F32 Vec3F32;
|
||||||
@ -98,7 +100,7 @@ struct ImageF32
|
|||||||
U32 total_num_pixels;
|
U32 total_num_pixels;
|
||||||
};
|
};
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------------------
|
||||||
//~ host globals
|
//~ host globals
|
||||||
|
|
||||||
//~ device globals
|
//~ device globals
|
||||||
@ -107,7 +109,7 @@ __constant__ CameraF32 camera;
|
|||||||
__constant__ ViewportF32 viewport;
|
__constant__ ViewportF32 viewport;
|
||||||
__constant__ ImageF32 image;
|
__constant__ ImageF32 image;
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------------------
|
||||||
//~ routines
|
//~ routines
|
||||||
|
|
||||||
|
|
||||||
@ -183,7 +185,10 @@ __device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32 image_height, U32 *idx_buffer)
|
__host__ function void write_buffer_to_ppm(Vec3F32 *buffer,
|
||||||
|
U32 image_width,
|
||||||
|
U32 image_height,
|
||||||
|
U32 *idx_buffer)
|
||||||
{
|
{
|
||||||
|
|
||||||
const char *filename = "output.ppm";
|
const char *filename = "output.ppm";
|
||||||
@ -195,12 +200,14 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32
|
|||||||
|
|
||||||
// Write PPM header. First it has "P3" by itself to indicate ASCII colors,
|
// Write PPM header. First it has "P3" by itself to indicate ASCII colors,
|
||||||
fprintf(file, "P3\n");
|
fprintf(file, "P3\n");
|
||||||
// The row below will say the dimensions of the image: (width, height) <-> (num columns, num rows)
|
// The row below will say the dimensions of the image:
|
||||||
|
// (width, height) <-> (num columns, num rows)
|
||||||
fprintf(file, "%i %i\n", image_width, image_height);
|
fprintf(file, "%i %i\n", image_width, image_height);
|
||||||
// Then we have a value for the maximum pixel color
|
// Then we have a value for the maximum pixel color
|
||||||
fprintf(file, "255\n");
|
fprintf(file, "255\n");
|
||||||
// Then we have all the lines with pixel data, it will be three values for each column j on a row i, corresponding
|
// Then we have all the lines with pixel data,
|
||||||
// to a pixel with index (i,j).
|
// it will be three values for each column j on a row i,
|
||||||
|
// corresponding to a pixel with index (i,j).
|
||||||
for(U32 i = 0; i < image_height; i += 1)
|
for(U32 i = 0; i < image_height; i += 1)
|
||||||
{
|
{
|
||||||
for(U32 j = 0; j < image_width; j +=1)
|
for(U32 j = 0; j < image_width; j +=1)
|
||||||
@ -228,22 +235,30 @@ __host__ function void write_buffer_to_ppm(Vec3F32 *buffer, U32 image_width, U32
|
|||||||
|
|
||||||
__device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r)
|
__device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r)
|
||||||
{
|
{
|
||||||
// We take the quadratic formula -b +- sqrt(b*b-4ac) / 2a,
|
// We take the quadratic formula -b/2a +- sqrt(b*b-4ac) / 2a,
|
||||||
// and we calculate only the sqrt part. If there is a hit with the sphere we either
|
// and we calculate only the sqrt part. If there is a hit with the sphere we either
|
||||||
// have two solutions (positive sqrt), one solution (zero sqrt) or no solution (negative sqrt).
|
// have two solutions (positive sqrt), one solution (zero sqrt)
|
||||||
// If we have no solution we have no hit on the sphere centered at center, with the given radius.
|
// or no solution (negative sqrt).
|
||||||
|
// If we have no solution we have no hit on
|
||||||
|
// the sphere centered at center, with the given radius.
|
||||||
|
|
||||||
|
// Note that we can simplify this, since we always get b = -2(D . (C-Q)), and if
|
||||||
|
// we say b = -2h in the quadradic formula, we get
|
||||||
|
// -(-2h)/2a +- sqrt((-2h)**2 - 4ac) / 2a which expands to
|
||||||
|
// 2h/2a +- 2sqrt(h*h - ac)/2a, simplifying to (h +- sqrt(h*h - ac))/a.
|
||||||
|
// So we use this simplification to optimise away some operations
|
||||||
|
|
||||||
// Compare lines with RTIOW
|
// Compare lines with RTIOW
|
||||||
// (C-Q)
|
// (C-Q)
|
||||||
Vec3F32 oc = sub_V3F32(center, r.origin);
|
Vec3F32 oc = sub_V3F32(center, r.origin);
|
||||||
// a = D.D
|
// a = D.D
|
||||||
F32 a = dot_V3F32(r.direction, r.direction);
|
F32 a = dot_V3F32(r.direction, r.direction);
|
||||||
// b = -2D . (C-Q)
|
// h = D . (C-Q)
|
||||||
F32 b = dot_V3F32(scale_V3F32(-2.0f, r.direction), oc);
|
F32 h = dot_V3F32(r.direction, oc);
|
||||||
// c = (C-Q) . (C-Q) - r*r
|
// c = (C-Q) . (C-Q) - r*r
|
||||||
F32 c = dot_V3F32(oc, oc) - radius*radius;
|
F32 c = dot_V3F32(oc, oc) - radius*radius;
|
||||||
|
|
||||||
F32 discriminant = b*b - 4*a*c;
|
F32 discriminant = h*h - a*c;
|
||||||
|
|
||||||
// We are actually solving for the parameter t in the expression of a point P(t) that
|
// We are actually solving for the parameter t in the expression of a point P(t) that
|
||||||
// intersects the sphere. This is the quadratic problem we get by solving for t in
|
// intersects the sphere. This is the quadratic problem we get by solving for t in
|
||||||
@ -256,9 +271,9 @@ __device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// t = (-b += sqrt(b*b-4ac))/2a, and here we take the smallest solution to get the point
|
// t = (h += sqrt(h*h-ac))/a, and here we take the smallest solution to get the point
|
||||||
// on the sphere closest to the ray origin.
|
// on the sphere closest to the ray origin.
|
||||||
out = (-b - __fsqrt_rn(discriminant))/(2*a);
|
out = (h - __fsqrt_rn(discriminant))/a;
|
||||||
}
|
}
|
||||||
|
|
||||||
return out;
|
return out;
|
||||||
@ -273,8 +288,7 @@ __global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer)
|
|||||||
U32 y = blockIdx.y * blockDim.y + threadIdx.y;
|
U32 y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
U32 idx = y * image.width + x;
|
U32 idx = y * image.width + x;
|
||||||
|
|
||||||
if(x >= image.width || y >= image.height) return;
|
if(x < image.width && y < image.height)
|
||||||
|
|
||||||
{
|
{
|
||||||
Vec3F32 px_u = scale_V3F32((F32)x, viewport.pixel_delta_u);
|
Vec3F32 px_u = scale_V3F32((F32)x, viewport.pixel_delta_u);
|
||||||
Vec3F32 px_v = scale_V3F32((F32)y, viewport.pixel_delta_v);
|
Vec3F32 px_v = scale_V3F32((F32)y, viewport.pixel_delta_v);
|
||||||
@ -324,11 +338,27 @@ __global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer)
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__ function void cuda_init_state(curandState *rand_state)
|
||||||
|
{
|
||||||
|
|
||||||
|
U32 x = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
|
U32 y = threadIdx.y + blockIdx.y * blockDim.y;
|
||||||
|
|
||||||
|
if(x < image.width && y < image.height)
|
||||||
|
{
|
||||||
|
U32 idx = y * image.width + x;
|
||||||
|
curand_init(CURAND_SEED, idx, 0, &rand_state[idx]);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------------------
|
||||||
|
//~ Main
|
||||||
int main()
|
int main()
|
||||||
{
|
{
|
||||||
cudaError_t cuErr;
|
cudaError_t cuErr;
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Define image, camera and viewport on the CPU
|
// Define image, camera and viewport on the CPU
|
||||||
// and then copy to constant globals on device
|
// and then copy to constant globals on device
|
||||||
// -------------
|
// -------------
|
||||||
@ -381,7 +411,9 @@ int main()
|
|||||||
LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n",
|
LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n",
|
||||||
h_viewport.width, h_viewport.height, h_viewport.aspect_ratio);
|
h_viewport.width, h_viewport.height, h_viewport.aspect_ratio);
|
||||||
|
|
||||||
// Define grid, blocks, threads and pixel buffers
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Define grid, blocks, threads and any buffers such as pixel data and random state
|
||||||
|
// ------------
|
||||||
U32 num_pixels = h_image.total_num_pixels;
|
U32 num_pixels = h_image.total_num_pixels;
|
||||||
U64 pixel_buffer_size = num_pixels*sizeof(Vec3F32);
|
U64 pixel_buffer_size = num_pixels*sizeof(Vec3F32);
|
||||||
|
|
||||||
@ -395,26 +427,54 @@ int main()
|
|||||||
cuErr = cudaMalloc(&pixel_buffer, pixel_buffer_size);
|
cuErr = cudaMalloc(&pixel_buffer, pixel_buffer_size);
|
||||||
CUDA_CHECK(cuErr);
|
CUDA_CHECK(cuErr);
|
||||||
|
|
||||||
|
// This is just a debug buffer, TODO(anton): remove
|
||||||
U32 *idxbuffer = 0;
|
U32 *idxbuffer = 0;
|
||||||
cuErr = cudaMalloc(&idxbuffer, sizeof(U32)*num_pixels);
|
cuErr = cudaMalloc(&idxbuffer, sizeof(U32)*num_pixels);
|
||||||
|
CUDA_CHECK(cuErr);
|
||||||
|
|
||||||
|
curandState *d_rand_state = 0;
|
||||||
|
cuErr = cudaMalloc(&d_rand_state, num_pixels*sizeof(curandState));
|
||||||
|
CUDA_CHECK(cuErr);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Initialise CUDA state such as random number states per thread.
|
||||||
|
// This is separate for performance measurements
|
||||||
|
// ------------
|
||||||
|
cuda_init_state<<<blocks_per_grid, threads_per_block>>>(d_rand_state);
|
||||||
|
cuErr = cudaGetLastError();
|
||||||
|
CUDA_CHECK(cuErr);
|
||||||
|
cuErr = cudaDeviceSynchronize();
|
||||||
|
CUDA_CHECK(cuErr);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Launch the main CUDA kernel, each thread will color a pixel and store it
|
||||||
|
// in the pixel buffer.
|
||||||
|
// ------------
|
||||||
|
|
||||||
|
|
||||||
LOG("Launching main kernel with \n blocks per grid: (%i, %i, %i) \n",
|
LOG("Launching main kernel with \n blocks per grid: (%i, %i, %i) \n",
|
||||||
blocks_per_grid.x, blocks_per_grid.y, blocks_per_grid.z);
|
blocks_per_grid.x, blocks_per_grid.y, blocks_per_grid.z);
|
||||||
LOG("threads per block: (%i, %i %i) \n",
|
LOG("threads per block: (%i, %i %i) \n",
|
||||||
threads_per_block.x, threads_per_block.y, threads_per_block.z);
|
threads_per_block.x, threads_per_block.y, threads_per_block.z);
|
||||||
|
|
||||||
cuda_main<<<blocks_per_grid, threads_per_block>>>(pixel_buffer, idxbuffer);
|
cuda_main<<<blocks_per_grid, threads_per_block>>>(pixel_buffer, idxbuffer);
|
||||||
cudaDeviceSynchronize();
|
cuErr = cudaGetLastError();
|
||||||
|
CUDA_CHECK(cuErr);
|
||||||
|
cuErr = cudaDeviceSynchronize();
|
||||||
|
|
||||||
Vec3F32 *h_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size);
|
|
||||||
cuErr = cudaMemcpy(h_pixel_buffer, pixel_buffer, pixel_buffer_size, cudaMemcpyDeviceToHost);
|
|
||||||
CUDA_CHECK(cuErr);
|
CUDA_CHECK(cuErr);
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Copy the pixel buffer back from the device and write it to an image file.
|
||||||
|
// ------------
|
||||||
|
Vec3F32 *h_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size);
|
||||||
|
cuErr = cudaMemcpy(h_pixel_buffer, pixel_buffer, pixel_buffer_size,
|
||||||
|
cudaMemcpyDeviceToHost);
|
||||||
|
CUDA_CHECK(cuErr);
|
||||||
|
|
||||||
|
// TODO(anton): remove debug buffer
|
||||||
U32 *h_idxbuffer = (U32 *)malloc(num_pixels*sizeof(U32));
|
U32 *h_idxbuffer = (U32 *)malloc(num_pixels*sizeof(U32));
|
||||||
cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32), cudaMemcpyDeviceToHost);
|
cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32),
|
||||||
|
cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, h_idxbuffer);
|
write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, h_idxbuffer);
|
||||||
|
|
||||||
@ -424,121 +484,3 @@ int main()
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
function void write_test_ppm(U32 image_width, U32 image_height)
|
|
||||||
{
|
|
||||||
|
|
||||||
const char *filename = "test_output.ppm";
|
|
||||||
FILE *file = fopen(filename, "w");
|
|
||||||
if(!file)
|
|
||||||
{
|
|
||||||
LOG("Error opening file %s \n", filename);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write PPM header. First it has "P3" by itself to indicate ASCII colors,
|
|
||||||
fprintf(file, "P3\n");
|
|
||||||
// The row below will say the dimensions of the image: (width, height) <-> (num columns, num rows)
|
|
||||||
fprintf(file, "%i %i\n", image_width, image_height);
|
|
||||||
// Then we have a value for the maximum pixel color
|
|
||||||
fprintf(file, "255\n");
|
|
||||||
// Then we have all the lines with pixel data, it will be three values for each column j on a row i, corresponding
|
|
||||||
// to a pixel with index (i,j).
|
|
||||||
for(U32 i = 0; i < image_height; i += 1)
|
|
||||||
{
|
|
||||||
for(U32 j = 0; j < image_width; j +=1)
|
|
||||||
{
|
|
||||||
// We represent RGB values by floats internally and scale to integer values
|
|
||||||
F32 r = float(j) / (float(image_width)-1.0f);
|
|
||||||
F32 g = float(i) / (float(image_height)-1.0f);
|
|
||||||
F32 b = 0.0f;// - (float(j)/(float(image_width)-1.0f);
|
|
||||||
|
|
||||||
U32 ir = int(255.999f * r);
|
|
||||||
U32 ig = int(255.999f * g);
|
|
||||||
U32 ib = int(255.999f * b);
|
|
||||||
|
|
||||||
fprintf(file, "%i %i %i ", ir, ig, ib);
|
|
||||||
}
|
|
||||||
fprintf(file, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
fclose(file);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ void modify_array(U32 *arr, U32 N)
|
|
||||||
{
|
|
||||||
U32 i = threadIdx.x;
|
|
||||||
if(i < N)
|
|
||||||
{
|
|
||||||
arr[i] = i;
|
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
__global__ void hello_from_device(U32 *arr, U32 N)
|
|
||||||
{
|
|
||||||
|
|
||||||
modify_array(arr, N);
|
|
||||||
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int hello_cuda()
|
|
||||||
{
|
|
||||||
|
|
||||||
LOG("Hello from cpu\n");
|
|
||||||
|
|
||||||
|
|
||||||
U32 N = NUM_THREADS;
|
|
||||||
U32 arr_size = N * sizeof(U32);
|
|
||||||
|
|
||||||
|
|
||||||
U32 *arr = (U32 *)malloc(arr_size);
|
|
||||||
memset(arr, 0, arr_size);
|
|
||||||
|
|
||||||
cudaError_t cuErr;
|
|
||||||
U32 *d_arr = 0;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cuErr = cudaMalloc(&d_arr, arr_size);
|
|
||||||
CUDA_CHECK(cuErr);
|
|
||||||
|
|
||||||
cuErr = cudaMemcpy(d_arr, arr, arr_size, cudaMemcpyHostToDevice);
|
|
||||||
CUDA_CHECK(cuErr);
|
|
||||||
|
|
||||||
LOG("Array before CUDA \n");
|
|
||||||
for(U32 i = 0; i < N; i += 1)
|
|
||||||
{
|
|
||||||
LOG("%i: %i \n", i, arr[i]);
|
|
||||||
}
|
|
||||||
LOG("\n");
|
|
||||||
|
|
||||||
|
|
||||||
hello_from_device<<<NUM_BLOCKS, NUM_THREADS>>>(d_arr, N);
|
|
||||||
|
|
||||||
cuErr = cudaMemcpy(arr, d_arr, arr_size, cudaMemcpyDeviceToHost);
|
|
||||||
CUDA_CHECK(cuErr);
|
|
||||||
|
|
||||||
cuErr = cudaFree(d_arr);
|
|
||||||
CUDA_CHECK(cuErr);
|
|
||||||
|
|
||||||
LOG("Array after CUDA \n");
|
|
||||||
for(U32 i = 0; i < N; i += 1)
|
|
||||||
{
|
|
||||||
LOG("%i \n", arr[i]);
|
|
||||||
}
|
|
||||||
LOG("\n");
|
|
||||||
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|||||||
BIN
timeBuild.ctm
BIN
timeBuild.ctm
Binary file not shown.
Loading…
Reference in New Issue
Block a user