#include <stdio.h>
#include <stdlib.h>
#include "header.h"             // Definitions

// Terminates the program with a diagnostic when a CUDA runtime call fails.
//   status: value returned by the CUDA runtime call being checked.
//   what:   short description of the operation, used in the error message.
static void check_cuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host driver: fills two LENGTH-element input arrays (A with 1.0f, B with
// 2.0f), runs Sample1Kernel on the device copies, copies the output array
// back and prints the mean of its elements.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA runtime call fails.
int main(int argc, char **argv) {
  (void)argc; (void)argv;       // command-line arguments are not used

  float *h_A, *h_B, *h_C;       // host variables
  float *d_A, *d_B, *d_C;       // device variables
  float result = 0.0f;          // result
  const size_t bytes = sizeof(float) * LENGTH;  // size of each array in bytes

  // Used in Step 4.
  // NOTE(review): LENGTH/BLOCK_SIZE truncates; this covers every element only
  // if LENGTH is a multiple of BLOCK_SIZE (or the kernel handles the tail
  // itself) -- confirm against header.h and the kernel definition.
  dim3 dim_grid(LENGTH/BLOCK_SIZE, 1, 1), dim_block(BLOCK_SIZE, 1, 1);

  // Step 1. Host memory allocation
  h_A = (float *)malloc(bytes);
  h_B = (float *)malloc(bytes);
  h_C = (float *)malloc(bytes);
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    fprintf(stderr, "host memory allocation failed\n");
    free(h_A); free(h_B); free(h_C);  // free(NULL) is a no-op
    return EXIT_FAILURE;
  }
  for (int i = 0; i < LENGTH; ++i) {
    h_A[i] = 1.0f; h_B[i] = 2.0f; h_C[i] = 0.0f;
  }

  // Step 2. Device memory allocation
  check_cuda(cudaMalloc((void **)&d_A, bytes), "cudaMalloc(d_A)");
  check_cuda(cudaMalloc((void **)&d_B, bytes), "cudaMalloc(d_B)");
  check_cuda(cudaMalloc((void **)&d_C, bytes), "cudaMalloc(d_C)");

  // Step 3. Copy data from the host to the device
  check_cuda(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice), "copy of A to device");
  check_cuda(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice), "copy of B to device");
  // Zero the output buffer so any element the kernel does not write reads
  // back as 0.0f instead of uninitialized device memory.
  check_cuda(cudaMemset(d_C, 0, bytes), "cudaMemset(d_C)");

  // Step 4. The host calls the kernel
  Sample1Kernel<<<dim_grid, dim_block>>>(d_A, d_B, d_C);
  // A launch does not return a status: launch-configuration errors are
  // reported by cudaGetLastError(), execution errors by the next sync.
  check_cuda(cudaGetLastError(), "Sample1Kernel launch");
  check_cuda(cudaDeviceSynchronize(), "Sample1Kernel execution");

  // Step 5. Write the result back to the host
  check_cuda(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost), "copy of C to host");

  // Step 6. Release device memory
  check_cuda(cudaFree(d_A), "cudaFree(d_A)");
  check_cuda(cudaFree(d_B), "cudaFree(d_B)");
  check_cuda(cudaFree(d_C), "cudaFree(d_C)");

  // Step 7. The host finalizes: mean of the output elements.
  // Accumulate in double so the sum does not lose precision for large LENGTH.
  double sum = 0.0;
  for (int i = 0; i < LENGTH; ++i) sum += h_C[i];
  result = (float)(sum / (double)LENGTH);
  printf("result = %f\n", result);

  // Step 8. Free host memory
  free(h_A); free(h_B); free(h_C);

  return 0;
}
