#include "base_core.h"
#include "base_math.h"
#include "rayt_core.h"

//------------------------------------------------------------------------------------------
//~ Program parameter defines

#define NUM_BLOCKS 1
#define NUM_THREADS 32
#define IMAGE_WIDTH 1920
#define ASPECT_RATIO 1.7778f // 16/9
#define CURAND_SEED 1984
#define MAX_RANDOM_UNIT_VECTOR_ITERATIONS 64
#define MAX_NUM_ENTITIES 64
#define SAMPLES_PER_PIXEL 64
#define MAX_DIFFUSE_DEPTH 8

#include "base_math.c"
#include "rayt_core.c"

//------------------------------------------------------------------------------------------
//~ host globals

host_global CameraF32 h_camera;
host_global ViewportF32 h_viewport;
host_global ImageF32 h_image;

//------------------------------------------------------------------------------------------
//~ Main render kernel
//
// One thread per pixel on a 2D launch. Preconditions:
//  - the __constant__ symbols `image`, `camera`, `viewport` (declared in
//    rayt_core.* — TODO confirm) have been populated via cudaMemcpyToSymbol,
//  - `rand_state` holds one initialised curandState per pixel (see
//    cuda_init_state),
//  - `pixelbuffer` has image.width * image.height elements.
__global__ void cuda_main(Entity *entities, Vec3F32 *pixelbuffer, curandState *rand_state)
{
    U32 x = blockIdx.x * blockDim.x + threadIdx.x;
    U32 y = blockIdx.y * blockDim.y + threadIdx.y;
    U32 idx = y * image.width + x;
    // Bounds guard: the grid is rounded up to block granularity, so edge
    // blocks contain threads with no pixel to shade.
    if(x < image.width && y < image.height)
    {
        // NOTE! We need to pass this as a pointer to subsequent usage functions, in order
        // to update the random state on this thread, after each call to a distribution function.
        curandState local_rand_state = rand_state[idx];

        // We are adding all samples and then dividing by num samples to get the mean, so
        // we initialise the color for this pixel to black.
        Vec3F32 pixel_color = vec3F32(0.0f, 0.0f, 0.0f);

        // TODO(review): the per-pixel sample loop is not implemented yet —
        // loop SAMPLES_PER_PIXEL times, trace rays against `entities`,
        // accumulate into pixel_color scaled by camera.pixel_sample_scale,
        // and write local_rand_state back to rand_state[idx]. Until then
        // every pixel is written out as black.
        rand_state[idx] = local_rand_state;
        pixelbuffer[idx] = pixel_color;
    }
}

//------------------------------------------------------------------------------------------
//~ Main

int main()
{
    cudaError_t cuErr;

    //////////////////////////////////////////////////////////////////////////////////////////
    // Define image, camera and viewport on the CPU
    // and then copy to constant globals on device
    // -------------
    h_image = {0};
    h_image.width = IMAGE_WIDTH;
    h_image.aspect_ratio = ASPECT_RATIO;
    // +1 compensates for the truncation of the float division (e.g.
    // 1920/1.7778 = 1079.98 -> 1079 -> 1080); clamp to at least one row.
    U32 height = U32((F32)h_image.width/h_image.aspect_ratio) + 1;
    h_image.height = height < 1 ? 1 : height;
    h_image.total_num_pixels = h_image.width * h_image.height;
    cuErr = cudaMemcpyToSymbol(image, &h_image, sizeof(ImageF32), 0, cudaMemcpyHostToDevice);
    CUDA_CHECK(cuErr);
    LOG("Image size %i x %i, aspect ratio: %.4f \n", h_image.width, h_image.height, h_image.aspect_ratio);

    // -------------
    h_camera = {0}; // camera center defaults to the origin
    h_camera.focal_length = 1.0f;
    F32 samples_per_pixel = (F32)SAMPLES_PER_PIXEL;
    h_camera.pixel_sample_scale = 1.0f/samples_per_pixel;
    cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0, cudaMemcpyHostToDevice);
    CUDA_CHECK(cuErr);

    // -------------
    // Viewport spans the image plane one focal length in front of the camera.
    // u runs left->right, v runs top->bottom (hence the negative y).
    h_viewport = {0};
    h_viewport.height = 2.0f;
    h_viewport.width = h_viewport.height * ((F32)h_image.width/(F32)h_image.height);
    h_viewport.aspect_ratio = h_viewport.width/h_viewport.height;
    h_viewport.u = vec3F32(h_viewport.width, 0.0f, 0.0f);
    h_viewport.v = vec3F32(0.0f, -h_viewport.height, 0.0f);
    F32 width_inverse = 1.0f/(F32)h_image.width;
    F32 height_inverse = 1.0f/(F32)h_image.height;
    h_viewport.pixel_delta_u = scale_V3F32(width_inverse, h_viewport.u);
    h_viewport.pixel_delta_v = scale_V3F32(height_inverse, h_viewport.v);

    // upper_left = camera - vec3(0,0,focal_length) - viewport_u/2 - viewport_v/2
    Vec3F32 viewport_upper_left = sub_V3F32(h_camera.center, vec3F32(0.0f, 0.0f, h_camera.focal_length));
    viewport_upper_left = sub_V3F32(viewport_upper_left, scale_V3F32(0.5f, h_viewport.u));
    viewport_upper_left = sub_V3F32(viewport_upper_left, scale_V3F32(0.5f, h_viewport.v));
    h_viewport.upper_left = viewport_upper_left;

    // pixel_origin = upper_left + 0.5 * (delta u + delta v)
    Vec3F32 pixel_delta_sum = add_V3F32(h_viewport.pixel_delta_u, h_viewport.pixel_delta_v);
    h_viewport.pixel_origin = add_V3F32(viewport_upper_left, scale_V3F32(0.5f, pixel_delta_sum));
    cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0, cudaMemcpyHostToDevice);
    CUDA_CHECK(cuErr);
    LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n", h_viewport.width, h_viewport.height, h_viewport.aspect_ratio);

    //////////////////////////////////////////////////////////////////////////////////////////
    // Setup entities and copy to device
    U64 entity_list_byte_size = sizeof(Entity)*MAX_NUM_ENTITIES;
    Entity *h_entities = (Entity *)malloc(entity_list_byte_size);
    memset(h_entities, 0, entity_list_byte_size);
    for(U32 i = 0; i < MAX_NUM_ENTITIES; i += 1)
    {
        // Init all entities to nil
        h_entities[i].kind = EntityKind_Nil;
    }

    // Manual spheres: one small sphere in front of the camera, one huge
    // sphere acting as the ground plane.
    {
        h_entities[0].kind = EntityKind_Sphere;
        h_entities[0].center = vec3F32(0.0f, 0.0f, -1.0f);
        h_entities[0].radius = 0.5f;

        h_entities[1].kind = EntityKind_Sphere;
        h_entities[1].center = vec3F32(0.0f, -100.5f, -1.0f);
        h_entities[1].radius = 100.0f;
    }

    // Copy to device
    Entity *entities = 0;
    cuErr = cudaMalloc(&entities, entity_list_byte_size);
    CUDA_CHECK(cuErr);
    cuErr = cudaMemcpy(entities, h_entities, entity_list_byte_size, cudaMemcpyHostToDevice);
    CUDA_CHECK(cuErr);
    // Host copy is no longer needed once it lives on the device.
    free(h_entities);

    //////////////////////////////////////////////////////////////////////////////////////////
    // Define grid, blocks, threads and any buffers such as pixel data and random state
    // ------------
    U32 num_pixels = h_image.total_num_pixels;
    U64 pixel_buffer_size = num_pixels*sizeof(Vec3F32);

    // 2D launch, one thread per pixel; ceil-divide so partial edge tiles are
    // still covered (the kernel bounds-checks the overhang).
    dim3 threads_per_block(16, 8);
    dim3 blocks_per_grid(
        (h_image.width + threads_per_block.x - 1) / threads_per_block.x,
        (h_image.height + threads_per_block.y - 1) / threads_per_block.y
    );

    Vec3F32 *pixel_buffer = 0;
    cuErr = cudaMalloc(&pixel_buffer, pixel_buffer_size);
    CUDA_CHECK(cuErr);

    curandState *rand_state = 0;
    cuErr = cudaMalloc(&rand_state, num_pixels*sizeof(curandState));
    CUDA_CHECK(cuErr);

    //////////////////////////////////////////////////////////////////////////////////////////
    // Initialise CUDA state such as random number states per thread.
    // This is separate for performance measurements
    // ------------
    // NOTE(review): the original launch configuration was garbled ("<<>>");
    // rand_state is sized one state per pixel, so the same pixel grid as the
    // main kernel is used — confirm against cuda_init_state's indexing.
    cuda_init_state<<<blocks_per_grid, threads_per_block>>>(rand_state);
    cuErr = cudaGetLastError();
    CUDA_CHECK(cuErr);
    cuErr = cudaDeviceSynchronize();
    CUDA_CHECK(cuErr);

    //////////////////////////////////////////////////////////////////////////////////////////
    // Launch the main CUDA kernel, each thread will color a pixel and store it
    // in the pixel buffer.
    // ------------
    LOG("Launching main kernel with \n blocks per grid: (%i, %i, %i) \n", blocks_per_grid.x, blocks_per_grid.y, blocks_per_grid.z);
    LOG("threads per block: (%i, %i %i) \n", threads_per_block.x, threads_per_block.y, threads_per_block.z);
    cuda_main<<<blocks_per_grid, threads_per_block>>>(entities, pixel_buffer, rand_state);
    cuErr = cudaGetLastError();       // catches launch-configuration errors
    CUDA_CHECK(cuErr);
    cuErr = cudaDeviceSynchronize();  // catches asynchronous execution errors
    CUDA_CHECK(cuErr);

    //////////////////////////////////////////////////////////////////////////////////////////
    // Copy the pixel buffer back from the device and write it to an image file.
    // ------------
    Vec3F32 *h_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size);
    cuErr = cudaMemcpy(h_pixel_buffer, pixel_buffer, pixel_buffer_size, cudaMemcpyDeviceToHost);
    CUDA_CHECK(cuErr);
    write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height);
    free(h_pixel_buffer);

    cuErr = cudaFree(pixel_buffer);
    CUDA_CHECK(cuErr);
    cuErr = cudaFree(entities);
    CUDA_CHECK(cuErr);
    cuErr = cudaFree(rand_state);
    CUDA_CHECK(cuErr);

    return 0;
}