#include "RCUDA.h"

// Thread count used for the cos_main kernel launch in this file.
#define COS_THREAD_CNT 512
// Number of float elements in each of the host and device buffers.
#define N 10000000
// Approximation of 2*pi truncated to six decimal places. This is a double
// literal; it is cast to float at its point of use.
#define TWO_PI 6.283185


// Argument bundle passed by value to the cos_main kernel.
struct cosParams {
	// Input values; the kernel reads arg[i] for each i in [0, n).
	float *arg;
	// Output values; the kernel writes res[i] for each i in [0, n).
	float *res;
	// Number of elements to process in arg and res.
	int n;
};

// Element-wise cosine kernel: params.res[i] = __cosf(params.arg[i]) for
// every i in [0, params.n).
//
// Launch layout: any 1D grid of 1D blocks. The kernel uses a grid-stride
// loop, so each thread starts at its global index and advances by the total
// number of launched threads. With a single block of T threads, thread t
// therefore handles elements t, t+T, t+2T, ... and with a single thread the
// whole array is processed sequentially.
//
// params.arg and params.res must be device-accessible buffers holding at
// least params.n floats. No shared memory is used.
//
// Note: __cosf is the fast, reduced-accuracy hardware intrinsic; its error
// grows with the magnitude of the argument.
__global__ void cos_main(struct cosParams params)
{
	// 64-bit index arithmetic so that neither the start index nor the
	// "i += stride" step can overflow when params.n is close to INT_MAX.
	const long long stride = (long long)gridDim.x * blockDim.x;
	const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;

	for (long long i = first; i < params.n; i += stride) {
		params.res[i] = __cosf(params.arg[i]);
	}
}


// Demo entry point callable from R: fills a host vector with N evenly spaced
// angles i*TWO_PI/N, copies it to the GPU, runs cos_main over it, and copies
// the cosines back into a host buffer.
//
// Progress and error messages are written with printf. The computed values
// are not handed back to the caller; the function always returns R_NilValue.
// On any allocation or CUDA failure the remaining steps are skipped, and all
// host and device buffers are released before returning.
extern "C"
SEXP
R_simpleTest()
{
	const size_t nbytes = (size_t)N * sizeof(float);
	cudaError_t cudaStat;
	float* gpu_res = 0;
	float* gpu_arg = 0;
	struct cosParams funcParams;
	float* arg = (float *) malloc(nbytes);
	// Allocate vector of length N to store the result:
	float* res = (float *) malloc(nbytes);
	int i;

	// Single-pass block: "break" jumps to the common cleanup code below.
	do {
		if( arg == 0 || res == 0 ){
			printf(" Memory Allocation on host failed\n");
			break;
		}

		// Input angles: N evenly spaced values i*TWO_PI/N.
		for(i=0; i<N; i++){
			arg[i] = (float)i*(float)TWO_PI/(float)N;
		}

		// Allocate N floats on the GPU to hold the input:
		cudaStat = cudaMalloc((void **)&gpu_arg, nbytes);
		if( cudaStat != cudaSuccess ){
			printf(" value = %d : Memory Allocation on GPU Device failed\n", (int)cudaStat);
			break;
		}
		printf("done. Allocating more memory on the GPU... %p\n", (void *)gpu_arg);

		// Allocate N floats on the GPU to store the result, and make gpu_res a pointer to that memory:
		cudaStat = cudaMalloc((void **)&gpu_res, nbytes);
		if( cudaStat != cudaSuccess ){
			printf(" value = %d : Memory Allocation on GPU Device failed\n", (int)cudaStat);
			break;
		}
		printf("done again. Copying stuff from host (CPU) to device (GPU)... %p\n", (void *)gpu_res);

		// Key function:
		// cudaError_t cudaMemcpy(void * dst, const void * src, size_t count, enum cudaMemcpyKind kind);

		// Copy the input angles from arg (on the host) to gpu_arg (on the device)
		cudaStat = cudaMemcpy(gpu_arg, arg, nbytes, cudaMemcpyHostToDevice);
		if( cudaStat != cudaSuccess ){
			printf(" Memory Copy from Host to Device failed. %d\n", (int)cudaStat);
			break;
		}
		printf("successful.\n");

		// Set up the parameters for the GPU kernel:
		funcParams.res = gpu_res;
		funcParams.arg = gpu_arg;
		funcParams.n = N;

		printf("Launching kernel on GPU...\n");

		// Launch the GPU kernel...

		// Key code:
		// KernelFunction<<<dimGrid, dimBlock>>>(args);

		cos_main<<<1,COS_THREAD_CNT>>>(funcParams);

		// A kernel launch returns no status itself; configuration errors
		// are reported through cudaGetLastError().
		cudaStat = cudaGetLastError();
		if( cudaStat != cudaSuccess ){
			printf(" Kernel launch failed. %d : %s\n", (int)cudaStat, cudaGetErrorString(cudaStat));
			break;
		}

		// The launch is asynchronous: wait for the kernel so that execution
		// errors surface here and the message below is actually true.
		cudaStat = cudaDeviceSynchronize();
		if( cudaStat != cudaSuccess ){
			printf(" Kernel execution failed. %d : %s\n", (int)cudaStat, cudaGetErrorString(cudaStat));
			break;
		}

		printf("GPU computations finished. Copying result back to CPU...\n");

		// Copy the computed cosines from gpu_res (on the device) to res (on the host)
		cudaStat = cudaMemcpy(res, gpu_res, nbytes, cudaMemcpyDeviceToHost);
		if( cudaStat != cudaSuccess ){
			printf(" Memory Copy from Device to Host failed. %d\n", (int)cudaStat);
			break;
		}
	} while (0);

	// Common cleanup. cudaFree(0) and free(0) are no-ops, so this is safe
	// regardless of how far the sequence above got. Return codes are
	// deliberately ignored: this is best-effort release on the way out.
	cudaFree(gpu_res);
	cudaFree(gpu_arg);
	free(res);
	free(arg);

	return(R_NilValue);
}
