# When working, also examine
# using cudaMallocPitch()
# and passing the two matrices separately to the euclidean_kernel routine 

library(RCUDA)
kfile = system.file("sampleKernels", "distance_gputools.ptx", package = "RCUDA")
if(!file.exists(kfile))
  kfile = nvcc(system.file("sampleKernels", "distance_gputools.cu", package = "RCUDA"), "distance_gputools.ptx")
mod = loadModule(kfile)

N = c(A = 1e4L, B = 4999L)
p = 200L
A = matrix(rnorm(N["A"]*p), N["A"], p)
B = matrix(rnorm(N["B"]*p), N["B"], p)

AB = rbind(A, B)

gdist.same = 
function(AB, mod, blockSize = 32L, .async = FALSE, ...)
{
   out = .gpu(mod$euclidean_kernel_same,
              t(AB), ncol(AB), nrow(AB),
              NULL, 0L, 0L, 
              ncol(AB), ans = numeric(nrow(AB)^2), nrow(AB), 2.0,
              outputs = 8L, gridDim = c(nrow(AB), nrow(AB)), blockDim = blockSize, .async = .async, ...)
    if(!.async)
       matrix(out, nrow(AB), nrow(AB))
    else
       out
}

invisible(gdist.same(matrix(rnorm(100), 20, 5), mod))

B = 100
tm.parallel = system.time({
  o = gdist.same(AB, mod, .async = TRUE)
        # do something else here
 replicate(B, prod(rnorm(1e6)))
 cudaDeviceSynchronize()
 distances = matrix(o$ans[], nrow(AB), nrow(AB))
})

tm.serial = system.time({ replicate(B, prod(rnorm(1e6))); gdist.same(AB, mod)})

print(tm.parallel)
print(tm.serial)