# The difference here is that we repeat the sampling more than once in an
# R loop (e.g. a parametric bootstrap or a simple simulation), and that we
# leave both the RNG states and the space for the results allocated on the
# GPU across calls.
library(RCUDA)

cuGetContext(TRUE)

m = loadModule(system.file("sampleKernels", "random.ptx", package = "RCUDA"))
kernel = m$rnorm_kernel

B = 20        # number of repetitions
N = 250000    # 1e6L fails on my mac... :/
mu <- -0.3
sigma <- 1.5

# Launch geometry: a 2-D grid of 512-thread blocks, large enough to cover N.
threads_per_block <- 512L
block_dims <- c(threads_per_block, 1L, 1L)
grid_d1 <- floor(sqrt(N / threads_per_block))
grid_d2 <- ceiling(N / (grid_d1 * threads_per_block))
grid_dims <- c(grid_d1, grid_d2, 1L)

# Allocate the RNG states on the GPU once, up front; they are reused by
# every kernel launch below.
rng_states <- cudaMalloc(elType = "curandState", numEls = N, sizeof = 48L)
ans = numeric(N)

# Call the kernel once to force the compilation before we do timings.
cu_ans = .cuda(kernel, rng_states, ans, N, mu, sigma, inplace = TRUE,
               gridDim = grid_dims, blockDim = block_dims, outputs = 2L)

cu_rnorm_total_time = system.time(
    replicate(B, {
        .cuda(kernel, rng_states, ans, N, mu, sigma, inplace = TRUE,
              gridDim = grid_dims, blockDim = block_dims, outputs = 2L)
    }))

rnorm_total_time = system.time(
    replicate(B, r_ans <- rnorm(N, mu, sigma)))

cu_rnorm_total_time / rnorm_total_time
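
# A quick sanity check on the GPU draws (a sketch, not part of the timed
# comparison): assuming the first .cuda() call above returns the `ans`
# vector filled with the kernel's samples (that is what outputs = 2L is
# meant to give back), its sample mean and standard deviation should land
# near mu and sigma. Large deviations would point at a launch-geometry or
# RNG-seeding problem rather than a timing issue.
c(sample_mean = mean(cu_ans), sample_sd = sd(cu_ans))
# For N = 250000 draws we expect values close to -0.3 and 1.5.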