#include <stdio.h>
#include <stdlib.h>
#include "header.h"             // Definitions

// Terminates the program with a diagnostic when a CUDA runtime call fails.
//   status: value returned by the CUDA runtime call being checked.
//   what:   short label naming the operation, printed in the error message.
static void CheckCudaOrDie(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Returns the arithmetic mean of values[0 .. count-1], or 0 when count <= 0.
// The sum is accumulated in double so that long arrays do not lose precision
// once the running total exceeds what a float can represent exactly.
static float MeanOf(const float *values, int count) {
  if (count <= 0) return 0.0f;
  double sum = 0.0;
  for (int i = 0; i < count; ++i) sum += values[i];
  return (float)(sum / count);
}

// Runs the same computation on the host (Sample1Host) and on the device
// (Sample1Kernel) over LENGTH-element arrays, and prints the mean of the
// output array together with the elapsed time measured with CUDA events.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA runtime call fails.
int main(int argc, char **argv) {
  float *h_A, *h_B, *h_C;       // host variables
  float *d_A, *d_B, *d_C;       // device variables
  float result;                 // results
  dim3 dim_grid(LENGTH/BLOCK_SIZE, 1, 1), dim_block(BLOCK_SIZE, 1, 1); // Used in Step 4
  cudaEvent_t start, stop;
  float elapsed_time;
  const size_t bytes = sizeof(float) * LENGTH;  // size of each array in bytes

  // The grid size above is a truncating division and the kernel receives no
  // element count, so a LENGTH that is not a multiple of BLOCK_SIZE launches
  // fewer threads than there are elements.
  if ((LENGTH) % (BLOCK_SIZE) != 0) {
    fprintf(stderr,
            "warning: LENGTH is not a multiple of BLOCK_SIZE; "
            "the trailing elements may not be processed by the kernel\n");
  }

  // Step 0. Create the timing events
  CheckCudaOrDie(cudaEventCreate(&start), "cudaEventCreate(start)");
  CheckCudaOrDie(cudaEventCreate(&stop), "cudaEventCreate(stop)");

  // Step 1. Generation of data
  h_A = (float *)malloc(bytes);
  h_B = (float *)malloc(bytes);
  h_C = (float *)malloc(bytes);
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    fprintf(stderr, "host allocation failed\n");
    free(h_A); free(h_B); free(h_C);  // free(NULL) is a no-op
    exit(EXIT_FAILURE);
  }
  for (int i = 0; i < LENGTH; ++i) {
    h_A[i] = 1.0f; h_B[i] = 2.0f; h_C[i] = 0.0f;
  }

  // Step 1.1. Computation in host
  // The host routine is bracketed by CUDA events on the default stream,
  // mirroring how the kernel is timed in Step 4.
  CheckCudaOrDie(cudaEventRecord(start, 0), "cudaEventRecord(start)");
  Sample1Host(h_A, h_B, h_C, LENGTH);
  CheckCudaOrDie(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
  CheckCudaOrDie(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
  CheckCudaOrDie(cudaEventElapsedTime(&elapsed_time, start, stop),
                 "cudaEventElapsedTime");

  result = MeanOf(h_C, LENGTH);
  printf("CPU: result = %f, time = %f [msec]\n", result, elapsed_time);

  // Step 2. Memory allocation for the device
  CheckCudaOrDie(cudaMalloc((void **)&d_A, bytes), "cudaMalloc(d_A)");
  CheckCudaOrDie(cudaMalloc((void **)&d_B, bytes), "cudaMalloc(d_B)");
  CheckCudaOrDie(cudaMalloc((void **)&d_C, bytes), "cudaMalloc(d_C)");

  // Step 3. Copy data from the host to the device
  CheckCudaOrDie(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice),
                 "cudaMemcpy(d_A <- h_A)");
  CheckCudaOrDie(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice),
                 "cudaMemcpy(d_B <- h_B)");
  // Zero the output so that any element the kernel does not write shows up as
  // a deterministic 0 rather than uninitialized device memory.
  CheckCudaOrDie(cudaMemset(d_C, 0, bytes), "cudaMemset(d_C)");

  // Step 4. the host calls the kernel
  CheckCudaOrDie(cudaEventRecord(start, 0), "cudaEventRecord(start)");
  Sample1Kernel<<<dim_grid, dim_block>>>(d_A, d_B, d_C);
  // A launch does not return a status; an invalid configuration is reported
  // through cudaGetLastError().
  CheckCudaOrDie(cudaGetLastError(), "Sample1Kernel launch");
  CheckCudaOrDie(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
  // Waiting on the stop event also surfaces errors raised while the kernel ran.
  CheckCudaOrDie(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
  CheckCudaOrDie(cudaEventElapsedTime(&elapsed_time, start, stop),
                 "cudaEventElapsedTime");

  // Step 5. Copy back
  // h_C still holds the host result from Step 1.1, so this copy must be
  // checked: if it failed silently the CPU values would be reported as GPU
  // output.
  CheckCudaOrDie(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost),
                 "cudaMemcpy(h_C <- d_C)");

  // Step 6. Release memory
  CheckCudaOrDie(cudaFree(d_A), "cudaFree(d_A)");
  CheckCudaOrDie(cudaFree(d_B), "cudaFree(d_B)");
  CheckCudaOrDie(cudaFree(d_C), "cudaFree(d_C)");

  // Step 7. Host prints
  result = MeanOf(h_C, LENGTH);
  printf("GPU: result = %f, time = %f [msec]\n", result, elapsed_time);

  // Step 8. Host finalizes
  free(h_A); free(h_B); free(h_C);
  CheckCudaOrDie(cudaEventDestroy(start), "cudaEventDestroy(start)");
  CheckCudaOrDie(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

  return 0;
}
