The function nppiDotProd_8u64f_C1R causes a cudaErrorUnknown. I'm able to compile and run the boxFilterNPP and histEqualizationNPP samples properly, so I assume my system is healthy. I'm running on a GTX 470 (compute capability 2.0) with CUDA 5.5 and VS2012 x64 on Windows 7. I've also run many variations of this code on two systems and I get the same problem every time. Here is the code:
NppGpuComputeCapability capability = nppGetGpuComputeCapability();
NppiSize sizeROI;
sizeROI.width = 640;
sizeROI.height = 480;
int nBufferSize = 0;
NppStatus status = nppiDotProdGetBufferHostSize_8u64f_C1R(sizeROI,&nBufferSize);
if(status != NPP_SUCCESS) return status;
unsigned char *pDeviceBuffer;
cudaError_t err = cudaMalloc((void**)&pDeviceBuffer,nBufferSize);
if(err != cudaSuccess) return err;
int stepByte1 = 0;
Npp8u * buf1 = nppiMalloc_8u_C1(sizeROI.width, sizeROI.height, &stepByte1);
status = nppiSet_8u_C1R(1,buf1,stepByte1,sizeROI);
if(status != NPP_SUCCESS) return status;
int stepByte2 = 0;
Npp8u * buf2 = nppiMalloc_8u_C1(sizeROI.width, sizeROI.height, &stepByte2);
status = nppiSet_8u_C1R(1,buf2,stepByte2,sizeROI);
if(status != NPP_SUCCESS) return status;
err = cudaDeviceSynchronize();
if(err != cudaSuccess) return err;
double dp = 0;
status = nppiDotProd_8u64f_C1R(buf1,stepByte1,buf2,stepByte2,sizeROI,&dp,pDeviceBuffer);
if(status != NPP_SUCCESS) return status;
err = cudaDeviceSynchronize(); // return cudaErrorUnknown
// CUDA memchecker gives me "OutOfRangeStore" exception
if(err != cudaSuccess) return err;
printf("result: %f\n", dp);
nppiFree(buf1);
nppiFree(buf2);
cudaFree(pDeviceBuffer);
Any idea about my problem?
Thank you very much!!
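The result argument in that `nppiDotProd` call must be a device pointer, not a host pointer. You can fix it by allocating memory for `dp` on the device, something like:

    Npp64f *dp;
    cudaMalloc((void **)&dp, sizeof(Npp64f));
    status = nppiDotProd_8u64f_C1R(buf1, stepByte1, buf2, stepByte2, sizeROI, dp, pDeviceBuffer);

[Disclaimer: written in browser, not compiled or tested, use at your own risk.]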
You will obviously need to copy the result of the dot product back to the host if you need it.