#include <stdio.h>
#include <stdint.h>

#include <curand_kernel.h>

//------------------------------------------------------------------------------------------
//~ base defines

#define global static
#define function static

//~ typedefs 
typedef int32_t S32;
typedef uint32_t U32;
typedef uint64_t U64;
typedef float F32;

//~ utility defines

#define CUDA_CHECK(err) do { \
  if (err != cudaSuccess) { \
    fprintf(stderr, "CUDA ERROR: %s at %s:%d\n", \
        cudaGetErrorString(err), __FILE__, __LINE__); \
    exit(EXIT_FAILURE); \
  } \
} while (0)

#define LOG printf


//~ test defines
#define NUM_BLOCKS   1
#define NUM_THREADS  32


#define IMAGE_WIDTH 1920
#define ASPECT_RATIO 1.7778f // 16/9

#define CURAND_SEED 1984

//------------------------------------------------------------------------------------------
//~ structs

typedef union Vec3F32 Vec3F32;
union Vec3F32
{
  struct 
  {
    F32 x;
    F32 y;
    F32 z;
  };
  struct 
  {
    F32 r;
    F32 g;
    F32 b;
  };
  F32 v[3];
};


typedef struct RayF32 RayF32;
struct RayF32
{
  Vec3F32 origin;
  Vec3F32 direction;
};


typedef struct ViewportF32 ViewportF32;

struct ViewportF32 
{
  F32 width;
  F32 height;
  F32 aspect_ratio;
  Vec3F32 u; // along horizontal edge, right from top left corner
  Vec3F32 v; // along vertical edge, down from top left corner
  Vec3F32 upper_left;
  Vec3F32 pixel_origin;
  Vec3F32 pixel_delta_u;
  Vec3F32 pixel_delta_v;
};

typedef struct CameraF32 CameraF32;
struct CameraF32 
{
  Vec3F32 center;
  Vec3F32 up;
  F32 focal_length;
};

typedef struct ImageF32 ImageF32;
struct ImageF32 
{
  U32 width;
  U32 height;
  F32 aspect_ratio;
  U32 total_num_pixels;
};

//------------------------------------------------------------------------------------------
//~ host globals

//~ device globals

__constant__ CameraF32 camera;
__constant__ ViewportF32 viewport;
__constant__ ImageF32 image;

//------------------------------------------------------------------------------------------
//~ routines


__host__ __device__ function Vec3F32 vec3F32(F32 x, F32 y, F32 z)
{
  Vec3F32 out = {0};
  out.x = x;
  out.y = y;
  out.z = z;
  return out;
}


__host__ __device__ function Vec3F32 add_V3F32(Vec3F32 a, Vec3F32 b)
{
  Vec3F32 out = {0};
  out.x = a.x + b.x;
  out.y = a.y + b.y;
  out.z = a.z + b.z;
  return out;
}

__host__ __device__ function Vec3F32 sub_V3F32(Vec3F32 a, Vec3F32 b)
{
  Vec3F32 out = {0};
  out.x = a.x-b.x;
  out.y = a.y-b.y;
  out.z = a.z-b.z;
  return out;
}


__host__ __device__ function Vec3F32 scale_V3F32(F32 s, Vec3F32 v)
{
  Vec3F32 out = {0};
  out.x = s*v.x;
  out.y = s*v.y;
  out.z = s*v.z;
  return out;
}

__device__ function F32 dot_V3F32(Vec3F32 a, Vec3F32 b)
{
  return a.x*b.x + a.y*b.y + a.z*b.z;
}


__device__ function Vec3F32 ray_point_F32(F32 t, RayF32 ray)
{
  Vec3F32 out = add_V3F32(ray.origin, scale_V3F32(t, ray.direction));
  return out;
}

__device__ function F32 mag_V3F32(Vec3F32 a)
{
  return dot_V3F32(a, a);
}

__device__ function F32 norm_V3F32(Vec3F32 a)
{
   F32 mag = mag_V3F32(a);
   return __fsqrt_rn(mag);
}

__device__ function Vec3F32 lerp_V3F32(F32 s, Vec3F32 a, Vec3F32 b)
{
  Vec3F32 lerp_term1 = scale_V3F32(1.0f-s, a);
  Vec3F32 lerp_term2 = scale_V3F32(s, b);
  Vec3F32 lerp_result = add_V3F32(lerp_term1, lerp_term2);

  return lerp_result;
}


__host__ function void write_buffer_to_ppm(Vec3F32 *buffer, 
                                           U32 image_width, 
                                           U32 image_height, 
                                           U32 *idx_buffer) 
{

  const char *filename = "output.ppm";
  FILE *file = fopen(filename, "w");
  if(!file) 
  {
    LOG("Error opening file %s \n", filename);
  }

  // Write PPM header. First it has "P3" by itself to indicate ASCII colors,
  fprintf(file, "P3\n");
  // The row below will say the dimensions of the image:
  // (width, height) <-> (num columns, num rows)
  fprintf(file, "%i %i\n", image_width, image_height);
  // Then we have a value for the maximum pixel color
  fprintf(file, "255\n");
  // Then we have all the lines with pixel data, 
  // it will be three values for each column j on a row i,
  // corresponding to a pixel with index (i,j).
  for(U32 i = 0; i < image_height; i += 1)
  {
    for(U32 j = 0; j < image_width; j +=1)
    {
      // We represent RGB values by floats internally and scale to integer values
      U32 idx = i * image_width + j;
      if(idx_buffer[idx] != 0) {
        //LOG("idx %i, idxbuffer[idx] = %i \n", idx, idx_buffer[idx]);
      }
      F32 r = buffer[idx].r;
      F32 g = buffer[idx].g;
      F32 b = buffer[idx].b;

      U32 ir = int(255.999f * r);
      U32 ig = int(255.999f * g);
      U32 ib = int(255.999f * b);

      fprintf(file, "%i %i %i ", ir, ig, ib);
    }
    fprintf(file, "\n");
  }

  fclose(file);
}

__device__ function F32 hit_sphere(Vec3F32 center, F32 radius, RayF32 r)
{
  // We take the quadratic formula -b/2a +- sqrt(b*b-4ac) / 2a,
  // and we calculate only the sqrt part. If there is a hit with the sphere we either
  // have two solutions (positive sqrt), one solution (zero sqrt) 
  // or no solution (negative sqrt).
  // If we have no solution we have no hit on
  // the sphere centered at center, with the given radius.

  // Note that we can simplify this, since we always get b = -2(D . (C-Q)), and if
  // we say b = -2h in the quadradic formula, we get
  // -(-2h)/2a +- sqrt((-2h)**2 - 4ac) / 2a which expands to
  // 2h/2a +- 2sqrt(h*h - ac)/2a, simplifying to (h +- sqrt(h*h - ac))/a.
  // So we use this simplification to optimise away some operations

  // Compare lines with RTIOW
  // (C-Q)
  Vec3F32 oc = sub_V3F32(center, r.origin);
  // a = D.D
  F32 a = dot_V3F32(r.direction, r.direction);
  // h = D . (C-Q)
  F32 h = dot_V3F32(r.direction, oc);
  // c = (C-Q) . (C-Q) - r*r
  F32 c = dot_V3F32(oc, oc) - radius*radius;

  F32 discriminant = h*h - a*c;

  // We are actually solving for the parameter t in the expression of a point P(t) that 
  // intersects the sphere. This is the quadratic problem we get by solving for t in 
  // (C - P(t)) . (C - P(t)) = r*r, r being the radius and P(t) = tD+Q,
  // where D is the direction of the ray and Q the origin of the ray.
  F32 out = 0.0f;
  if(discriminant < 0.0f)
  {
    out = -1.0f;
  }
  else
  {
    // t = (h += sqrt(h*h-ac))/a, and here we take the smallest solution to get the point 
    // on the sphere closest to the ray origin.
    out = (h - __fsqrt_rn(discriminant))/a; 
  }

  return out;
}


__global__ function void cuda_main(Vec3F32 *pixelbuffer, U32 *idxbuffer) 
{

  U32 x = blockIdx.x * blockDim.x + threadIdx.x;
  U32 y = blockIdx.y * blockDim.y + threadIdx.y;
  U32 idx = y * image.width + x;

  if(x < image.width && y < image.height)
  {
    Vec3F32 px_u = scale_V3F32((F32)x, viewport.pixel_delta_u);
    Vec3F32 px_v = scale_V3F32((F32)y, viewport.pixel_delta_v);
    Vec3F32 pixel_center = add_V3F32(viewport.pixel_origin, add_V3F32(px_u, px_v));
    
    // TODO(anton): Maybe we dont need some ray structure here..
    Vec3F32 ray_direction = sub_V3F32(pixel_center, camera.center);
    RayF32 r = {0};
    r.origin = camera.center;
    r.direction = ray_direction;

    F32 norm = norm_V3F32(r.direction);
    Vec3F32 unit_dir = scale_V3F32(1.0f/norm, r.direction);
    Vec3F32 white = vec3F32(1.0f, 1.0f, 1.0f);
    Vec3F32 light_blue = vec3F32(0.5f, 0.7f, 1.0f);

    // Lerp between white and light blue depending on y position
    F32 blend = 0.5f*(unit_dir.y + 1.0f);
    Vec3F32 pixel_color = {0};

    Vec3F32 sphere_center = vec3F32(0.0f, 0.0f, -1.0f);
    F32 sphere_radius = 0.5f;

    // t is the parameter of the (closest) sphere-ray intersection point P(t) = tD+Q,
    // where Q is the ray origin and D the ray direction.
    F32 t = hit_sphere(sphere_center, sphere_radius, r);
    if(t > 0.0f)
    {
      Vec3F32 intersection_point = ray_point_F32(t, r);
      Vec3F32 N = sub_V3F32(intersection_point, sphere_center);
      N = scale_V3F32(1.0f/sphere_radius, N);
      pixel_color = scale_V3F32(0.5f, add_V3F32(N, vec3F32(1.0f, 1.0f, 1.0f)));
    } 
    else
    {
      pixel_color = lerp_V3F32(blend, white, light_blue);
    }

    pixelbuffer[idx] = pixel_color;

    //pixelbuffer[idx].x = (F32)x/(F32)image.width;
    //pixelbuffer[idx].y = (F32)y/(F32)image.height;
    //pixelbuffer[idx].z = 0.0f;

    idxbuffer[idx] = idx;
  }

}

__global__ function void cuda_init_state(curandState *rand_state) 
{

  U32 x = threadIdx.x + blockIdx.x * blockDim.x;
  U32 y = threadIdx.y + blockIdx.y * blockDim.y;

  if(x < image.width && y < image.height)
  {
    U32 idx = y * image.width + x;
    curand_init(CURAND_SEED, idx, 0, &rand_state[idx]);
  }
  
}

//------------------------------------------------------------------------------------------
//~ Main
int main() 
{
  cudaError_t cuErr;

  //////////////////////////////////////////////////////////////////////////////////////////
  // Define image, camera and viewport on the CPU 
  // and then copy to constant globals on device
  // -------------
  ImageF32 h_image = {0};
  h_image.width = IMAGE_WIDTH;
  h_image.aspect_ratio = ASPECT_RATIO;
  U32 height = U32((F32)h_image.width/h_image.aspect_ratio) + 1;
  h_image.height = height < 1 ? 1 : height;
  h_image.total_num_pixels = h_image.width * h_image.height;
  cuErr = cudaMemcpyToSymbol(image, &h_image, sizeof(ImageF32), 0, cudaMemcpyHostToDevice);
  CUDA_CHECK(cuErr);
  LOG("Image size %i x %i, aspect ratio: %.4f \n", 
      h_image.width, h_image.height, h_image.aspect_ratio);

  // -------------
  CameraF32 h_camera = {0};
  h_camera.focal_length = 1.0f;
  cuErr = cudaMemcpyToSymbol(camera, &h_camera, sizeof(CameraF32), 0, 
                             cudaMemcpyHostToDevice);
  CUDA_CHECK(cuErr);
  
  // -------------
  ViewportF32 h_viewport = {0};
  h_viewport.height = 2.0f;
  h_viewport.width = h_viewport.height * ((F32)h_image.width/(F32)h_image.height);
  h_viewport.aspect_ratio = h_viewport.width/h_viewport.height;
  h_viewport.u = vec3F32(h_viewport.width, 0.0f, 0.0f);
  h_viewport.v = vec3F32(0.0f, -h_viewport.height, 0.0f);
  
  F32 width_inverse = 1.0f/(F32)h_image.width;
  F32 height_inverse = 1.0f/(F32)h_image.height;
  h_viewport.pixel_delta_u = scale_V3F32(width_inverse, h_viewport.u);
  h_viewport.pixel_delta_v = scale_V3F32(height_inverse, h_viewport.v);
  
  // upper_left = camera - vec3(0,0,focal_length) - viewport_u/2 - viewport_v/2
  Vec3F32 viewport_upper_left = sub_V3F32(h_camera.center, 
                                          vec3F32(0.0f, 0.0f, h_camera.focal_length));
  viewport_upper_left = sub_V3F32(viewport_upper_left, scale_V3F32(0.5f, h_viewport.u));
  viewport_upper_left = sub_V3F32(viewport_upper_left, scale_V3F32(0.5f, h_viewport.v));
  h_viewport.upper_left = viewport_upper_left;
  
  // pixel_origin = upper_left + 0.5 * (delta u + delta v)
  Vec3F32 pixel_delta_sum = add_V3F32(h_viewport.pixel_delta_u, h_viewport.pixel_delta_v);
  h_viewport.pixel_origin = add_V3F32(viewport_upper_left, 
                                      scale_V3F32(0.5f, pixel_delta_sum));

  cuErr = cudaMemcpyToSymbol(viewport, &h_viewport, sizeof(ViewportF32), 0, 
                             cudaMemcpyHostToDevice);
  CUDA_CHECK(cuErr);
  LOG("Viewport size %.2f x %.2f, aspect ratio: %.4f \n", 
      h_viewport.width, h_viewport.height, h_viewport.aspect_ratio);

  //////////////////////////////////////////////////////////////////////////////////////////
  // Define grid, blocks, threads and any buffers such as pixel data and random state
  // ------------
  U32 num_pixels = h_image.total_num_pixels;
  U64 pixel_buffer_size = num_pixels*sizeof(Vec3F32);

  dim3 threads_per_block(16, 8);
  dim3 blocks_per_grid(
      (h_image.width + threads_per_block.x - 1) / threads_per_block.x,
      (h_image.height + threads_per_block.y - 1) / threads_per_block.y
      );

  Vec3F32 *pixel_buffer = 0;
  cuErr = cudaMalloc(&pixel_buffer, pixel_buffer_size);
  CUDA_CHECK(cuErr);

  // This is just a debug buffer, TODO(anton): remove
  U32 *idxbuffer = 0;
  cuErr = cudaMalloc(&idxbuffer, sizeof(U32)*num_pixels);
  CUDA_CHECK(cuErr);

  curandState *d_rand_state = 0;
  cuErr = cudaMalloc(&d_rand_state, num_pixels*sizeof(curandState));
  CUDA_CHECK(cuErr);

  //////////////////////////////////////////////////////////////////////////////////////////
  // Initialise CUDA state such as random number states per thread.
  // This is separate for performance measurements
  // ------------
  cuda_init_state<<<blocks_per_grid, threads_per_block>>>(d_rand_state);
  cuErr = cudaGetLastError();
  CUDA_CHECK(cuErr);
  cuErr = cudaDeviceSynchronize();
  CUDA_CHECK(cuErr);

  //////////////////////////////////////////////////////////////////////////////////////////
  // Launch the main CUDA kernel, each thread will color a pixel and store it 
  // in the pixel buffer.
  // ------------

  LOG("Launching main kernel with \n blocks per grid: (%i, %i, %i) \n",
      blocks_per_grid.x, blocks_per_grid.y, blocks_per_grid.z);
  LOG("threads per block: (%i, %i %i) \n", 
      threads_per_block.x, threads_per_block.y, threads_per_block.z);

  cuda_main<<<blocks_per_grid, threads_per_block>>>(pixel_buffer, idxbuffer);
  cuErr = cudaGetLastError();
  CUDA_CHECK(cuErr);
  cuErr = cudaDeviceSynchronize();
  CUDA_CHECK(cuErr);


  //////////////////////////////////////////////////////////////////////////////////////////
  // Copy the pixel buffer back from the device and write it to an image file.
  // ------------
  Vec3F32 *h_pixel_buffer = (Vec3F32 *)malloc(pixel_buffer_size);
  cuErr = cudaMemcpy(h_pixel_buffer, pixel_buffer, pixel_buffer_size,
                     cudaMemcpyDeviceToHost);
  CUDA_CHECK(cuErr);

  // TODO(anton): remove debug buffer
  U32 *h_idxbuffer = (U32 *)malloc(num_pixels*sizeof(U32));
  cuErr = cudaMemcpy(h_idxbuffer, idxbuffer, num_pixels*sizeof(U32), 
                     cudaMemcpyDeviceToHost);

  write_buffer_to_ppm(h_pixel_buffer, h_image.width, h_image.height, h_idxbuffer);
 
  cuErr = cudaFree(pixel_buffer);
  CUDA_CHECK(cuErr);
  
  return 0;
}