I am trying to run a cuda kernel in numbapro python, but I keep getting an out of resources error. I then tried to execute the kernel into a loop and send smaller arrays, but that still gave me the same error.
Here is my error message:
Traceback (most recent call last):
File "./predict.py", line 418, in <module>
predict[griddim, blockdim, stream](callResult_d, catCount, numWords, counts_d, indptr_d, indices_d, probtcArray_d, priorC_d)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/compiler.py", line 228, in __call__
sharedmem=self.sharedmem)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/compiler.py", line 268, in _kernel_call
cu_func(*args)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 1044, in __call__
self.sharedmem, streamhandle, args)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 1088, in launch_kernel
None)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 215, in safe_cuda_api_call
self._check_error(fname, retcode)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 245, in _check_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: Call to cuLaunchKernel results in CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
Here is my source code:
from numbapro.cudalib import cusparse
from numba import *
from numbapro import cuda
@cuda.jit(argtypes=(double[:], int64, int64, double[:], int64[:], int64[:], double[:,:], double[:] ))
def predict( callResult, catCount, wordCount, counts, indptr, indices, probtcArray, priorC ):
    i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    correct = 0
    wrong = 0
    lastDocIndex = -1
    maxProb = -1e6
    picked = -1
    for cat in range(catCount):
        probSum = 0.0
        for j in range(indptr[i],indptr[i+1]):
            wordIndex = indices[j]
            probSum += (counts[j]*math.log(probtcArray[cat,wordIndex]))
        probSum += math.log(priorC[cat])
        if probSum > maxProb:
            maxProb = probSum
            picked = cat
    callResult[i] = picked
predictions = []
counter = 1000
for i in range(int(math.ceil(numDocs/(counter*1.0)))):
    docTestSliceList = docTestList[i*counter:(i+1)*counter]
    numDocsSlice = len(docTestSliceList)
    docTestArray = np.zeros((numDocsSlice,numWords))
    for j,doc in enumerate(docTestSliceList):
        for ind in doc:
            docTestArray[j,ind['term']] = ind['count']
    docTestArraySparse = cusparse.ss.csr_matrix(docTestArray)
    start = time.time()
    OPT_N = numDocsSlice
    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N)/blockdim[0])), 1 
    catCount = len(music_categories)
    callResult = np.zeros(numDocsSlice)
    stream = cuda.stream()
    with stream.auto_synchronize():
        probtcArray_d = cuda.to_device(numpy.asarray(probtcArray),stream)
        priorC_d = cuda.to_device(numpy.asarray(priorC),stream)
        callResult_d = cuda.to_device(callResult, stream)
        counts_d = cuda.to_device(docTestArraySparse.data, stream)
        indptr_d = cuda.to_device(docTestArraySparse.indptr, stream)
        indices_d = cuda.to_device(docTestArraySparse.indices, stream)
        predict[griddim, blockdim, stream](callResult_d, catCount, numWords, counts_d, indptr_d, indices_d, probtcArray_d, priorC_d)
        callResult_d.to_host(stream)
    #stream.synchronize()
    predictions += list(callResult)
    print "prediction %d: %f" % (i,time.time()-start)
 
                        
I found out this was in the cuda procedure.
When you call predict the blockdim is set to 1024. predict[griddim, blockdim, stream](callResult_d, catCount, numWords, counts_d, indptr_d, indices_d, probtcArray_d, priorC_d)
But the procedure is called iteratively with slice sizes of 1000 elements not 1024. So, in the procedure it will attempt to write 24 elements that are out of bounds in the return array.
Sending a number of elements parameter (n_el) and placing an error checking call in the cuda procedure solves it.