library(RCUDA)

#
# We'll illustrate synchronizing on an event
#
#
# Locate the PTX file shipped in the RCUDA package's sampleKernels directory
# and load it as a module.
# mustWork = TRUE turns a missing file into an immediate, descriptive error;
# without it system.file() returns "" and loadModule() is handed an empty path.
f = system.file("sampleKernels", "distance_gputools.ptx", package = "RCUDA",
                mustWork = TRUE)
mod = loadModule(f)
# Launch the `euclidean_kernel_same` kernel from `mod` via .gpu().
#
# Arguments:
#   ptr       - passed to the kernel as its first argument (the caller in this
#               file passes a device pointer to the transposed data).
#   dim       - two-element vector; dim[1] sizes the square grid and the
#               result matrix, dim[2] is passed alongside it to the kernel.
#               (presumably c(rows, columns) of the input matrix - confirm
#               against the kernel's signature)
#   ans       - passed as the 8th kernel argument, the one named by `outputs`.
#   mod       - module object providing `euclidean_kernel_same`.
#   blockSize - forwarded as `blockDim`.
#   .async    - forwarded to .gpu(); also selects the return form (see below).
#   ...       - additional arguments forwarded to .gpu() (e.g. `stream`).
#
# Value: when `.async` is true, whatever .gpu() returned, untouched;
# otherwise that result reshaped into a dim[1] x dim[1] matrix.
gdist.same = 
function(ptr, dim, ans, mod, blockSize = 32L, .async = FALSE,  ...)
{
    n.first = dim[1]
    n.second = dim[2]

    # NOTE(review): the literal 2.0 looks like the power of the distance
    # metric (2 = Euclidean) - confirm against the kernel source.
    result = .gpu(mod$euclidean_kernel_same,
                  ptr, n.second, n.first,
                  NULL, 0L, 0L,
                  n.second, ans = ans, n.first, 2.0,
                  outputs = 8L,
                  gridDim = c(n.first, n.first),
                  blockDim = blockSize,
                  .async = .async,
                  ...)

    if(.async) {
        result
    } else {
        matrix(result, n.first, n.first)
    }
}

# Problem size: an N x p input matrix.
N = 25000
p = 200
# Fill the matrix with random values. matrix(N*p, N, p) would instead
# replicate the single number N*p into every cell.
m = matrix(rnorm(N*p), N, p)
# Copy the transposed data to the device and reserve room for N*N results.
ptr = copyToDevice(as.numeric(t(m)))
out = cudaMalloc(N*N)

stream = cuStreamCreate(0)
ev1 = cuEventCreate(0)
ev2 = cuEventCreate(0)

# Bracket the asynchronous kernel launch with the two events on the same
# stream: ev1 is recorded before the launch and ev2 after it, so ev2 only
# completes once the kernel has finished and the elapsed time between the
# events covers the kernel's execution.
cuEventRecord(ev1, stream)

o = gdist.same(ptr, dim(m), out, mod, .async = TRUE, stream = stream)

cuEventRecord(ev2, stream)

# Is the stream finished?
print(cuStreamQuery(stream))

#No need to synchronize on this first event
#  cuEventSynchronize(ev1)

# Block until ev2 (and therefore the kernel queued ahead of it) has completed.
cuEventSynchronize(ev2)

# Printed explicitly so the timing is shown even when the script is source()d.
print(cuEventElapsedTime(ev1, ev2))