library(RCUDA)

# This overwrites the input values, so it is intended to process
# the data once, with no second pass.

# Load the compiled PTX module and take a handle to its dnorm kernel.
m <- loadModule("inst/sampleKernels/dnorm.ptx")
k <- m$dnorm_kernel

# Problem setup: N random normal draws as input; mu and sigma are handed
# to the kernel alongside the data.
N <- 1e6L
x <- rnorm(N)
mu <- 0
sigma <- 1

# One complete pass: copy x to the device, launch the kernel on the device
# copy, then read the contents of the device vector back into i.
# The launch configuration multiplies out to 64 * 32 * 512 = 1048576,
# which is at least N.
cx <- copyToDevice(x)
.cuda(
  k, cx, N, mu, sigma,
  gridDim = c(64L, 32L),
  blockDim = 512L
)
i <- cx[]

# GPU timing: ten rounds of copying x to the device, launching the kernel,
# and reading the result back.
# replicate() evaluates its expression inside a function, so the cx and i
# assigned here are local and leave the top-level cx and i untouched.
cu.tm <- system.time(
  replicate(10, {
    cx <- copyToDevice(x)
    .cuda(
      k, cx, N, mu, sigma,
      gridDim = c(64L, 32L),
      blockDim = 512L
    )
    i <- cx[]
  })
)

# CPU timing: ten calls to R's own dnorm() on the same data.
r.tm <- system.time(replicate(10, dnorm(x, mu, sigma)))

# Ratio of GPU time to CPU time; entries below 1 mean the GPU runs were
# faster.
cu.tm / r.tm


# Peek at the first few values read back from the device.
head(i)

# Do not expect these to match what R computes itself: the results differ
# because of the numerical precision of floats and of the GPUs themselves.
# Summarise the differences over the first 100 elements.
summary(
  dnorm(x[seq_len(100)], mu, sigma) - i[seq_len(100)]
)