Convert NV12 to BGR by NVIDIA Performance Primitives

2.1k views Asked by At

I'm trying to convert NV12 image to BGR by npp, but in the final array i have zeroes.

int lumaStepBytes, chromaStepBytes;
int rgbStepBytes;
      
auto dpNV12LumaFrame = nppiMalloc_8u_C1(dec.GetWidth(), dec.GetHeight(), &lumaStepBytes);
auto dpNV12ChromaFrame = nppiMalloc_8u_C1(dec.GetWidth(), dec.GetChromaHeight(), &chromaStepBytes);
auto dpBGRFrame = nppiMalloc_8u_C3(dec.GetWidth(), dec.GetHeight(), &rgbStepBytes);
            
cudaMemcpy2D(dpNV12LumaFrame, lumaStepBytes, pFrame, dec.GetWidth(),
             dec.GetWidth(), dec.GetHeight(), cudaMemcpyKind::cudaMemcpyHostToDevice);
cudaMemcpy2D(dpNV12ChromaFrame, chromaStepBytes, pFrame + dec.GetLumaPlaneSize(), dec.GetWidth(),
             dec.GetWidth(), dec.GetChromaHeight(), cudaMemcpyKind::cudaMemcpyHostToDevice);

Npp8u *planesAddres[2];
planesAddres[0] = dpNV12LumaFrame;
planesAddres[1] = dpNV12ChromaFrame;

nppiNV12ToBGR_8u_P2C3R(planesAddres, lumaStepBytes,
                       dpBGRFrame, rgbStepBytes,
                       {dec.GetWidth(), dec.GetHeight()});

res.m_data.resize(dec.GetWidth() * dec.GetHeight() * 3);
cudaMemcpy2D(res.m_data.data(), dec.GetWidth(), dpBGRFrame, rgbStepBytes,
             dec.GetWidth(), dec.GetHeight(), cudaMemcpyKind::cudaMemcpyDeviceToHost);
nppiFree(dpBGRFrame);
nppiFree(dpNV12ChromaFrame);
nppiFree(dpNV12LumaFrame);

dec is a video decoder which gives pFrame in NV12 format and provide additional information about that, like offsets, dimensions, NV12 planes, etc. The same result I have if I use cu... and cuda... functions for allocating without alignment.

Do anybody have any ideas about the problem?

3

There are 3 answers

0
Павел Бодак On BEST ANSWER

the problem went away when I changed the video card. GeForce 740 to 1080

0
Robert Crovella On

For questions like this the SO expectation is that you provide a complete example, see item 1 here. So I haven't tried to determine exactly what is wrong with your code.

However I can show you a complete code that converts NV12 to RGB (and other things as well) which is working correctly for me:

// sample compile command line: nvcc -o rs rs.cu -lnppicc -lnppig -DUSE_DEBUG -DUNIT_TEST
#include <nppi.h>
#include <iostream>

template <typename T>
__global__ void pack_uv(T * __restrict__ u, T * __restrict__ v, T * __restrict__ uv, const int w, const int h, const int pitch_uv, const int pitch_u, const int pitch_v){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  int idy = threadIdx.y+blockDim.y*blockIdx.y;
  if ((idx < w) && (idy < h)){
    T *o  = (T *)(((char *)uv) + idy*pitch_uv);
    T *iu = (T *)(((char *)u)  + idy*pitch_u);
    T *iv = (T *)(((char *)v)  + idy*pitch_v);
    int idx2 = idx >> 1;
    o[idx] = (idx&1)?iv[idx2]:iu[idx2];}
}

int rs(const int ish, const int isw, const int ipitch, const int osh, const int osw, const int opitch, const unsigned char *iy, const unsigned char *iuv, unsigned char *oy, unsigned char *ouv, unsigned char *tempbuff, int method = 0,  int eInterpolation = NPPI_INTER_LANCZOS){

#ifdef USE_DEBUG
  if ((iy != NULL) && (tempbuff == NULL)) std::cout << "error: tempbuff is NULL" << std::endl;
  if ((iy != NULL) && (iuv == NULL)) std::cout << "error: iuv is NULL" << std::endl;
  if ((iy != NULL) && (oy == NULL)) std::cout << "error: oy is NULL" << std::endl;
  if ((iy != NULL) && (ouv == NULL)) std::cout << "error: ouv is NULL" << std::endl;
  if (isw < 2) std::cout << "error on input width: " << isw << std::endl;
  if (ish < 2) std::cout << "error on input height: " << ish << std::endl;
  if (ipitch < isw) std::cout << "error on input pitch: " << ipitch << std::endl;
  if (osw < 1) std::cout << "error on output width: " << osw << std::endl;
  if (osh < 1) std::cout << "error on output height: " << osh << std::endl;
  if (opitch < osw) std::cout << "error on output pitch: " << opitch << std::endl;
#endif
  cudaError_t err;
  NppStatus stat;

// convert NV12 input to RGB

  if (iy == NULL){ // temp buffer sizing
     // for method 1
     NppiSize oSrcROI;
     oSrcROI.width  = isw;
     oSrcROI.height = ish;
     NppiSize oDstROI;
     oDstROI.width  = osw;
     oDstROI.height = osh;
     int bufferSize;
     stat = nppiResizeAdvancedGetBufferHostSize_8u_C1R(oSrcROI, oDstROI, &bufferSize, NPPI_INTER_LANCZOS3_ADVANCED);
     return ((ish*isw + osh*osw)*3*sizeof(unsigned char))+bufferSize;  // temp buffer sizing
     }
  if (method == 0){

    const Npp8u *pSrc[2] = {iy, iuv};
    NppiSize oSizeROI;
    oSizeROI.width  = isw;
    oSizeROI.height = ish;
#ifdef USE_709
    stat = nppiNV12ToRGB_709HDTV_8u_P2C3R(pSrc, ipitch, tempbuff, isw*3*sizeof(Npp8u), oSizeROI);
#else
    stat = nppiNV12ToRGB_8u_P2C3R(pSrc, ipitch, tempbuff, isw*3*sizeof(Npp8u), oSizeROI);
#endif
#ifdef USE_DEBUG
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) std::cout << "NV12 to RGB CUDA error: " << cudaGetErrorString(err) << std::endl;
    if (stat != NPP_SUCCESS) std::cout << "NV12 to RGB NPP error: " << (int)stat << std::endl;
#endif
    if (stat != NPP_SUCCESS) return -1;

// perform resize

    NppiSize oSrcSize;
    oSrcSize.width = isw;
    oSrcSize.height = ish;
    NppiRect oSrcROI;
    oSrcROI.x = 0;
    oSrcROI.y = 0;
    oSrcROI.width = isw;
    oSrcROI.height = ish;
    NppiRect oDstROI;
    oDstROI.x = 0;
    oDstROI.y = 0;
    oDstROI.width = osw;
    oDstROI.height = osh;
    double nXFactor = osw/(double)isw;
    double nYFactor = osh/(double)ish;
    double nXShift = 0;
    double nYShift = 0;
    stat = nppiResizeSqrPixel_8u_C3R(tempbuff, oSrcSize, isw*3*sizeof(Npp8u), oSrcROI, tempbuff+ish*isw*3, osw*3*sizeof(Npp8u), oDstROI, nXFactor, nYFactor, nXShift, nYShift, eInterpolation);
#ifdef USE_DEBUG
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) std::cout << "RGB LANCZOS RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
    if (stat != NPP_SUCCESS) std::cout << "RGB LANCZOS RESIZE NPP error: " << (int)stat << std::endl;
#endif
    if (stat != NPP_SUCCESS) return -2;

// convert resized RGB to YUV420


    Npp8u *pDst[3] = { oy, ouv, ouv + osh*opitch/4 };
    int rDstStep[3] = { opitch, opitch/2, opitch/2 };
    oSizeROI.width = osw;
    oSizeROI.height = osh;
    stat = nppiRGBToYUV420_8u_C3P3R(tempbuff+ish*isw*3, osw*3*sizeof(Npp8u), pDst, rDstStep, oSizeROI);
#ifdef USE_DEBUG
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) std::cout << "RGB TO YUV420 CUDA error: " << cudaGetErrorString(err) << std::endl;
    if (stat != NPP_SUCCESS) std::cout << "RGB TO YUV420 NPP error: " << (int)stat << std::endl;
#endif
    if (stat != NPP_SUCCESS) return -3;

// pack uv

    dim3 block(32, 8);
    dim3 grid((osw+block.x-1)/block.x, (osh+block.y-1)/block.y);
    pack_uv<<< grid, block >>>(ouv, ouv + osh*opitch/4, tempbuff, osw, osh/2, osw, osw/2, osw/2);
    err = cudaGetLastError();
#ifdef USE_DEBUG
    if (err != cudaSuccess) std::cout << "PACK UV LAUNCH CUDA error: " << cudaGetErrorString(err) << std::endl;
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) std::cout << "PACK UV EXEC CUDA error: " << cudaGetErrorString(err) << std::endl;
#endif
    if (err != cudaSuccess) return -4;

// move packed uv to output

    err = cudaMemcpy2D(ouv, opitch,  tempbuff, osw*sizeof(Npp8u), osw*sizeof(Npp8u), osh/2, cudaMemcpyDeviceToDevice);
#ifdef USE_DEBUG
    if (err != cudaSuccess) std::cout << "PACK UV COPY CUDA error: " << cudaGetErrorString(err) << std::endl;
#endif
    if (err != cudaSuccess) return -5;
    }
  else{  // method 1

// NV12 to YUV420 planar
    const Npp8u *const pSrc[2] = {iy, iuv};
    Npp8u *pDst[3] = {tempbuff, tempbuff+isw*ish, tempbuff+isw*ish+(isw*ish)/4};
    int aDstStep[3] = {isw, isw/2, isw/2};
    NppiSize oSizeROI;
    oSizeROI.width  = isw;
    oSizeROI.height = ish;
    stat = nppiNV12ToYUV420_8u_P2P3R(pSrc, ipitch, pDst, aDstStep, oSizeROI);
#ifdef USE_DEBUG
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) std::cout << "NV12 TO YUV420 CUDA error: " << cudaGetErrorString(err) << std::endl;
    if (stat != NPP_SUCCESS) std::cout << "NV12 TO YUV420 NPP error: " << (int)stat << std::endl;
#endif
    if (stat != NPP_SUCCESS) return -6;
// resize each plane individually
    NppiSize oSrcSize = oSizeROI;
    NppiRect oSrcROI;
    oSrcROI.x = 0;
    oSrcROI.y = 0;
    oSrcROI.width = isw;
    oSrcROI.height = ish;
    NppiRect oDstROI;
    oDstROI.x = 0;
    oDstROI.y = 0;
    oDstROI.width = osw;
    oDstROI.height = osh;
    double nXFactor = osw/(double)isw;
    double nYFactor = osh/(double)ish;
// resize Y
    stat = nppiResizeSqrPixel_8u_C1R_Advanced(tempbuff, oSrcSize, isw, oSrcROI, oy, opitch, oDstROI, nXFactor, nYFactor, tempbuff+(ish*isw*3),NPPI_INTER_LANCZOS3_ADVANCED);
#ifdef USE_DEBUG
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) std::cout << "Y RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
    if (stat != NPP_SUCCESS) std::cout << "Y RESIZE NPP error: " << (int)stat << std::endl;
#endif
    if (stat != NPP_SUCCESS) return -7;
// resize U
    oSrcSize.width  /= 2;
    oSrcSize.height /= 2;
    oSrcROI.width   /= 2;
    oSrcROI.height  /= 2;
    oDstROI.width   /= 2;
    oDstROI.height  /= 2;
    stat = nppiResizeSqrPixel_8u_C1R_Advanced(tempbuff+ish*isw, oSrcSize, isw/2, oSrcROI, tempbuff+(ish*isw*3), osw/2, oDstROI, nXFactor, nYFactor, tempbuff+(ish*isw*3) + (osh*osw*3),NPPI_INTER_LANCZOS3_ADVANCED);
#ifdef USE_DEBUG
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) std::cout << "U RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
    if (stat != NPP_SUCCESS) std::cout << "U RESIZE NPP error: " << (int)stat << std::endl;
#endif
    if (stat != NPP_SUCCESS) return -8;
// resize V
    stat = nppiResizeSqrPixel_8u_C1R_Advanced(tempbuff+ish*isw+(ish*isw/4), oSrcSize, isw/2, oSrcROI, tempbuff+(ish*isw*3)+(osh*osw/4), osw/2, oDstROI, nXFactor, nYFactor, tempbuff+(ish*isw*3) + (osh*osw*3),NPPI_INTER_LANCZOS3_ADVANCED);
#ifdef USE_DEBUG
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) std::cout << "V RESIZE CUDA error: " << cudaGetErrorString(err) << std::endl;
    if (stat != NPP_SUCCESS) std::cout << "V RESIZE NPP error: " << (int)stat << std::endl;
#endif
    if (stat != NPP_SUCCESS) return -9;

// pack_uv
    dim3 block(32, 8);
    dim3 grid((osw+block.x-1)/block.x, (osh+block.y-1)/block.y);
    pack_uv<<< grid, block >>>(tempbuff+(ish*isw*3), tempbuff+(ish*isw*3)+(osh*osw/4), ouv, osw, osh/2, opitch, osw/2, osw/2);
    err = cudaGetLastError();
#ifdef USE_DEBUG
    if (err != cudaSuccess) std::cout << "PACK UV LAUNCH CUDA error: " << cudaGetErrorString(err) << std::endl;
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) std::cout << "PACK UV EXEC CUDA error: " << cudaGetErrorString(err) << std::endl;
#endif
    if (err != cudaSuccess) return -10;
    }

  return 0;
}

#ifdef UNIT_TEST
// timing
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

// bitmap file handling
struct Info{
    int width;
    int height;
    int offset;
    unsigned char * info;
    unsigned char * data;

    int size;
};
#include <fstream>
Info readBMP(const char* filename)
{
    int i;
    std::ifstream is(filename, std::ifstream::binary);
    is.seekg(0, is.end);
    i = is.tellg();
    is.seekg(0);
    unsigned char *info = new unsigned char[i];
    is.read((char *)info,i);

    int width = *(int*)&info[18];
    int height = *(int*)&info[22];
    int offset = *(int*)&info[10];
    Info dat;
    dat.width = width;
    dat.height = height;
    dat.offset = offset;
    dat.size = i;
    dat.info = new unsigned char[offset - 1];
    dat.data = new unsigned char[i - offset + 1];
    if ((i-offset+1) < (3*height*width)) std::cout << "size: " << i-offset+1 << " expected: " << height*width*3 << std::endl;
    std::copy(info,
              info + offset,
              dat.info);

    std::copy(info + offset,
              info + i,
              dat.data);
    delete[] info;
    return dat;

}
void writeBMP(const char *filename, Info dat){

    std::ofstream fout;
    fout.open(filename, std::ios::binary | std::ios::out);
    fout.write( reinterpret_cast<char *>(dat.info), dat.offset);

    fout.write( reinterpret_cast<char *>(dat.data), dat.size - dat.offset );
    fout.close();
}

int main(int argc, char *argv[]){
  int eInterpolation = NPPI_INTER_LANCZOS;
  if (argc > 1) eInterpolation = atoi(argv[1]);
  else{
    std::cout << "Must specify a valid interpolation mode:" << std::endl;
    std::cout << NPPI_INTER_NN << " :NPPI_INTER_NN" << std::endl;
    std::cout << NPPI_INTER_LINEAR << " :NPPI_INTER_LINEAR" << std::endl;
    std::cout << NPPI_INTER_CUBIC << " :NPPI_INTER_CUBIC" << std::endl;
    std::cout << NPPI_INTER_LANCZOS << " :NPPI_INTER_LANCZOS" << std::endl;
    return 0;}
  int method = 0;
  if (argc > 2) method = atoi(argv[2]);
  // input to NV12
  Info rfile = readBMP("input.bmp");
  const int H = rfile.height;
  const int W = rfile.width;
  std::cout << "Height = " << rfile.height << std::endl;
  std::cout << "Width  = " << rfile.width  << std::endl;
  Npp8u *rgbdata, *ty, *tu, *tv, *tuv;
  cudaMalloc(&rgbdata, H*W*3);
  cudaMalloc(&ty, H*W);
  cudaMalloc(&tu, H*W/4);
  cudaMalloc(&tv, H*W/4);
  cudaMalloc(&tuv, H*W/2);

  cudaMemcpy(rgbdata, rfile.data, H*W*3, cudaMemcpyHostToDevice);
  Npp8u *pDst[3] = { ty, tu, tv};
  int rDstStep[3] = { W, W/2, W/2 };
  NppiSize oSizeROI;
  oSizeROI.width = W;
  oSizeROI.height = H;
  NppStatus stat = nppiRGBToYUV420_8u_C3P3R(rgbdata, W*3*sizeof(Npp8u), pDst, rDstStep, oSizeROI);
  if (stat != NPP_SUCCESS) { std::cout << "Input NPP error"  << std::endl; return 0;}
  dim3 block(32, 8);
  dim3 grid((W+block.x-1)/block.x, (H+block.y-1)/block.y);
  pack_uv<<< grid, block >>>(tu, tv, tuv, W, H/2, W, W/2, W/2);

  // 1:1 test

  int buff_size = rs(H, W, W, H, W, W, NULL, NULL, NULL, NULL, NULL);
  unsigned char *tbuff;
  cudaError_t err = cudaMalloc(&tbuff, buff_size);
  if (err != cudaSuccess) {std::cout << "on temp buff allocation of size: " << buff_size << " error: " << (int)err << std::endl; return 0;}
  unsigned char *oy, *ouv;
  err = cudaMalloc(&oy, H*W*sizeof(unsigned char));
  if (err != cudaSuccess) {std::cout << "on oy allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}
  err = cudaMalloc(&ouv, H*W*sizeof(unsigned char)/2);
  if (err != cudaSuccess) {std::cout << "on ouv allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}


  int error = rs(H, W, W, H, W, W, ty, tuv, oy, ouv, tbuff, method, eInterpolation);
  if (error != 0) std::cout << "Function Failure: " << error << std::endl;
  // output to RGB

  const Npp8u *pSrc[2] = {ty, tuv};
  oSizeROI.width  = W;
  oSizeROI.height = H;
#ifdef USE_709
  stat = nppiNV12ToRGB_709HDTV_8u_P2C3R(pSrc, W, rgbdata, W*3*sizeof(Npp8u), oSizeROI);
#else
  stat = nppiNV12ToRGB_8u_P2C3R(pSrc, W, rgbdata, W*3*sizeof(Npp8u), oSizeROI);
#endif
  if (stat != NPP_SUCCESS) { std::cout << "Output NPP error"  << std::endl; return 0;}
  cudaMemcpy(rfile.data, rgbdata, H*W*3, cudaMemcpyDeviceToHost);

  writeBMP("output.bmp", rfile);
  // 2x upscale test

  cudaFree(tbuff);
  buff_size = rs(H, W, W, 2*H, 2*W, 2*W, NULL, NULL, NULL, NULL, NULL);
  err = cudaMalloc(&tbuff, buff_size);
  if (err != cudaSuccess) {std::cout << "on temp buff allocation of size: " << buff_size << " error: " << (int)err << std::endl; return 0;}
  cudaFree(oy);
  cudaFree(ouv);
  err = cudaMalloc(&oy, 4*H*W*sizeof(unsigned char));
  if (err != cudaSuccess) {std::cout << "on oy allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}
  err = cudaMalloc(&ouv, 2*H*W*sizeof(unsigned char));
  if (err != cudaSuccess) {std::cout << "on ouv allocation of size: " << H*W*sizeof(unsigned char) << " error: " << (int)err << std::endl; return 0;}

  unsigned long long dt = dtime_usec(0);
  error = rs(H, W, W, 2*H, 2*W, 2*W, ty, tuv, oy, ouv, tbuff, method, eInterpolation);
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  if (error != 0) std::cout << "Function Failure: " << error << std::endl;
  std::cout << "2x resize time: " << dt/(float)USECPSEC << "s" << std::endl;
  // output to RGB

  const Npp8u *pSrc2[2] = {oy, ouv};
  oSizeROI.width  = 2*W;
  oSizeROI.height = 2*H;
  cudaFree(rgbdata);
  cudaMalloc(&rgbdata, H*W*12);
#ifdef USE_709
  stat = nppiNV12ToRGB_709HDTV_8u_P2C3R(pSrc2, 2*W, rgbdata, W*6*sizeof(Npp8u), oSizeROI);
#else
  stat = nppiNV12ToRGB_8u_P2C3R(pSrc2, 2*W, rgbdata, W*6*sizeof(Npp8u), oSizeROI);
#endif
  if (stat != NPP_SUCCESS) { std::cout << "Output NPP error"  << std::endl; return 0;}
  delete[] rfile.data;
  rfile.data = new unsigned char[H*W*12];
  cudaMemcpy(rfile.data, rgbdata, H*W*12, cudaMemcpyDeviceToHost);
  int osize = rfile.size - rfile.offset;
  int nsizeinc = H*W*12 - osize;
  rfile.size += nsizeinc;
  *((int*)(rfile.info+18)) = 2*W;
  *((int*)(rfile.info+22)) = 2*H;
  writeBMP("output2.bmp", rfile);
  return 0;
}
#endif

The above code does the following steps:

  • read in a .bmp file from disk into RGB storage
  • convert to YUV420
  • convert to NV12
  • resize the NV12 image (there are multiple steps here)
  • convert the resized NV12 image to RGB
  • write RGB image as a .bmp file
0
cjl On
    NppiSize oSizeROI;
    oSizeROI.width = frame_dec->width;
    oSizeROI.height = frame_dec->height;
    DBUG_PRINT("width: %d, height: %d, stepBytes: %d\n", oSizeROI.width, oSizeROI.height, stepBytes);
    // NppStatus stat = nppiNV12ToBGR_8u_P2C3R(frame_dec->data, frame_dec->width, bgrData, frame_dec->width*3*sizeof(Npp8u), oSizeROI);
    NppStatus stat;
#ifdef USE_709
    stat = nppiNV12ToBGR_8u_P2C3R(frame_dec->data, frame_dec->linesize[0], bgrData, frame_dec->width*3, oSizeROI);
#else
    stat = nppiNV12ToBGR_709HDTV_8u_P2C3R(frame_dec->data, frame_dec->linesize[0], bgrData, frame_dec->width * 3, oSizeROI);
#endif
    unsigned char *data = (unsigned char *)malloc(frame_dec->width * frame_dec->height * 3);
    cudaMemcpy(data, bgrData, frame_dec->height * frame_dec->width * 3, cudaMemcpyDeviceToHost);
    cv::Mat mat_test(frame_dec->height, frame_dec->width, CV_8UC3, data);
    imwrite("test.jpg", mat_test);
    free(data);
    nppiFree(bgrData);
    exit(0);

frame_dec decoded by ffmpeg cuda decoder