Nvidia nppi function corrupting destination pointer

101 views Asked by At

I am trying to run this 2d convolution filter from the nppi library, but for some reason it seems my destination pointer is getting corrupted.

#define KERNEL_WIDTH 3
#define KERNEL_HEIGHT 3
#define KERNEL_SIZE (KERNEL_WIDTH * KERNEL_HEIGHT)
#define KERNEL {0.0f, 0.25f, 0.0f, 0.25f, 0.0f, 0.25f, 0.0f, 0.25f, 0.0f}

float kernel[KERNEL_SIZE] = KERNEL;
NppiSize kernelSize = {.width=KERNEL_WIDTH, .height=KERNEL_HEIGHT};


int width;
int height;

float *hostIn;  // some data is put here
float *hostOut;
float *deviceIn;
float *deviceOut;

hostOut = malloc(width * height * sizeof(float));
cudaMalloc((void **)&deviceIn, width * height * sizeof(float));
cudaMalloc((void **)&deviceOut, width * height * sizeof(float));
cudaMemcpy(deviceIn, hostInputData, width * height * sizeof(float), cudaMemcpyHostToDevice);

// The next statement works fine
cudaMemcpy(hostOutputData, deviceOut, width * height * sizeof(float), cudaMemcpyDeviceToHost);

NppiSize inputSize = {.width=width, .height=height};
NppiSize oSizeROI = {.width=width, .height=height};
NppiPoint oAnchor = {.x=0, .y=0};
NppiPoint oSrcOffset = {.x=0, .y=0};
// The following statement returns NPP_SUCCESS.
NppStatus err = nppiFilterBorder_32f_C1R(deviceIn, width * sizeof(float), inputSize, oSrcOffset, deviceOut, width*sizeof(float), oSizeROI, kernel, kernelSize, oAnchor, NPP_BORDER_REPLICATE);
assert(err == NPP_SUCCESS);
// The next statement fails with illegal memory access error
cudaMemcpy(hostOutputData, deviceOut, width * height * sizeof(float), cudaMemcpyDeviceToHost);

What I do not understand is why the cudaMemcpy after the nppi call is resulting in an illegal access error. Any advice would be great!

1

There are 1 answers

0
Robert Crovella On BEST ANSWER

The main issue is that you cannot use a __constant__ global definition for the convolution kernel you pass to npp. That's not an option.

Here's a modification of what you have shown, converted to complete code, that converts that kernel to an ordinary device allocation, and runs without runtime error:

# cat t73.cu
#define KERNEL_WIDTH 3
#define KERNEL_HEIGHT 3
#define KERNEL_SIZE (KERNEL_WIDTH * KERNEL_HEIGHT)
#define KERNEL {0.0f, 0.25f, 0.0f, 0.25f, 0.0f, 0.25f, 0.0f, 0.25f, 0.0f}

#include <cassert>
#include <npp.h>

int main(){

  float kernel[KERNEL_SIZE] = KERNEL;
  NppiSize kernelSize = {.width=KERNEL_WIDTH, .height=KERNEL_HEIGHT};


  int width = 256;
  int height = 256;

  float *hostIn;  // some data is put here
  float *hostOut;
  float *deviceIn;
  float *deviceOut;
  float *dKernel;

  hostOut = (float *)malloc(width * height * sizeof(float));
  hostIn = (float *)malloc(width * height * sizeof(float));
  cudaMalloc((void **)&deviceIn, width * height * sizeof(float));
  cudaMalloc((void **)&deviceOut, (width+2) * (height+2) * sizeof(float));
  cudaMemcpy(deviceIn, hostIn, width * height * sizeof(float), cudaMemcpyHostToDevice);
  cudaMalloc(&dKernel, KERNEL_SIZE*sizeof(dKernel[0]));
  cudaMemcpy(dKernel, kernel, KERNEL_SIZE*sizeof(dKernel[0]), cudaMemcpyHostToDevice);
  NppiSize inputSize = {.width=width, .height=height};
  NppiSize oSizeROI = {.width=width, .height=height};
  NppiPoint oAnchor = {.x=0, .y=0};
  NppiPoint oSrcOffset = {.x=0, .y=0};
// The following statement returns NPP_SUCCESS.
  NppStatus err = nppiFilterBorder_32f_C1R(deviceIn, width * sizeof(float), inputSize, oSrcOffset, deviceOut, width*sizeof(float), oSizeROI, dKernel, kernelSize, oAnchor, NPP_BORDER_REPLICATE);
  assert(err == NPP_SUCCESS);
// The next statement fails with illegal memory access error
  cudaMemcpy(hostOut, deviceOut, width * height * sizeof(float), cudaMemcpyDeviceToHost);
}
# nvcc -o t73 t73.cu -lnppif
# compute-sanitizer ./t73
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
#