library(RCUDA)
#
# This version doesn't write the result over the inputs.
# Instead it writes to a separate vector.
#
# Load the module from the PTX file (path is relative to the working
# directory) and pick out the dnorm kernel routine by name.
m <- loadModule("inst/sampleKernels/dnormOutput.ptx")
k <- m$dnorm_kernel

# Problem size and the parameters of the normal density to evaluate.
N <- 1e6L
mu <- 0.3
sigma <- 1.5

# Host-side input: N draws from a standard normal.
x <- rnorm(N)

# Put the inputs on the device and allocate a second device vector of
# N elements that the kernel writes its results into.
cx <- copyToDevice(x)
out <- cudaMalloc(N, elType = "numeric")

# Launch the kernel. 64 * 32 blocks of 512 threads is 1,048,576 threads
# in total, which is at least N for N = 1e6.
.cuda(
  k,
  cx, N, mu, sigma, out,
  gridDim = c(64L, 32L),
  blockDim = 512L
)

# Fetch the results from the device using the empty-subscript shorthand ...
vals <- out[]
# ... or spell out the copy explicitly.
vals <- copyFromDevice(out, N, "float")
head(vals)

# Compare the first 100 results with R's own dnorm().
# The differences will not be exactly zero: the values will not be the
# same as if we had computed them in R, because of the difference in
# numerical precision with floats and with the GPUs themselves.
summary(dnorm(x[seq_len(100L)], mu, sigma) - vals[seq_len(100L)])


# Confirm the inputs on the device were left alone by the kernel:
# read back the first 10 and compare them with the host copy.
summary(abs(copyFromDevice(cx, 10, "float") - x[seq_len(10L)]))