library(RCUDA) m = loadModule("inst/sampleKernels/set.ptx") k = m$setValue_kernel N = 1e7L i = integer(N) ci = copyToDevice(i) # To get over N threads, we use 512 within a block for the maximum amount # and then 256 x 128 grid. # Would we be better off with a different break down of the grid or the block? system.time(replicate(100, .cuda(k, ci, N, gridDim = c(256L, 128L), blockDim = c(512L)))) system.time(replicate(100, .cuda(k, ci, N, gridDim = c(32768L), blockDim = c(512L)))) system.time(replicate(100, .cuda(k, ci, N, gridDim = c(32768L), blockDim = c(32, 16)))) i = ci[] head(i) done = i[i != 0] length(done) + 1L table(diff(done))