Disadvantage of Constant Memory for Large Convolution Matrixes

Question

Disadvantage of Constant Memory for Large Convolution Matrixes

102 views Asked by logaritmabir At 04 March 2024 at 10:09

I have been trying to have benefit of constant memory in convolution operations. In 3x3 convolution I can gain little bit speed up but whenever i increase the size of convolution matrix then constant memory support becoming inefficient.

According to its intended use,each thread accesses the same element of convolution matrix at same time in all kernels (3x3, 5x5, 7x7). I have used OpenCV::Mat images(each pixel 8bit unsigned char for gray images).

Question : Why constant memory doesn't accelerate the 5x5 and 7x7 convolution while accelerating the 3x3 convolution ?

Hardware I use : Nvidia GTX 1660 TI (Mobile) with CC 7.5 Turing Arch

There are the kernels :

__global__ void k_1D_gf_3x3_default(unsigned char* input, int rows, int cols)
{
    int ty = blockIdx.x * blockDim.x + threadIdx.x;
    int tx = blockIdx.y * blockDim.y + threadIdx.y;
    int threadId = (tx * cols + ty);

    unsigned char conv_kernel[3][3] = { {1, 2, 1}, {2, 4, 2}, {1, 2, 1} };
    int new_val = 0;

    if ((tx > 0 && tx < rows - 1) && (ty > 0 && ty < cols - 1)) {
        new_val += conv_kernel[0][0] * input[(tx - 1) * cols + ty - 1];
        new_val += conv_kernel[0][1] * input[(tx - 1) * cols + ty];
        new_val += conv_kernel[0][2] * input[(tx - 1) * cols + ty + 1];
        new_val += conv_kernel[1][0] * input[tx * cols + ty - 1];
        new_val += conv_kernel[1][1] * input[tx * cols + ty];
        new_val += conv_kernel[1][2] * input[tx * cols + ty + 1];
        new_val += conv_kernel[2][0] * input[(tx + 1) * cols + ty - 1];
        new_val += conv_kernel[2][1] * input[(tx + 1) * cols + ty];
        new_val += conv_kernel[2][2] * input[(tx + 1) * cols + ty + 1];

        input[threadId] = new_val / 16;
    }
}

__global__ void k_1D_gf_5x5_default(unsigned char* input, int rows, int cols)
{
    int ty = blockIdx.x * blockDim.x + threadIdx.x;
    int tx = blockIdx.y * blockDim.y + threadIdx.y;
    int threadId = (tx * cols + ty);

    unsigned char conv_kernel5x5[5][5] = { {1,  4,  6,  4,  1},
                                    {4, 16, 24, 16,  4},
                                    {6, 24, 36, 24,  6},
                                    {4, 16, 24, 16,  4},
                                    {1,  4,  6,  4,  1} };
    int new_val = 0;

    if ((tx > 1 && tx < rows - 2) && (ty > 1 && ty < cols - 2)) {
        new_val = conv_kernel5x5[0][0] * input[(tx - 2) * cols + ty - 2];
        new_val += conv_kernel5x5[0][1] * input[(tx - 2) * cols + ty - 1];
        new_val += conv_kernel5x5[0][2] * input[(tx - 2) * cols + ty];
        new_val += conv_kernel5x5[0][3] * input[(tx - 2) * cols + ty + 1];
        new_val += conv_kernel5x5[0][4] * input[(tx - 2) * cols + ty + 2];

        new_val += conv_kernel5x5[1][0] * input[(tx - 1) * cols + ty - 2];
        new_val += conv_kernel5x5[1][1] * input[(tx - 1) * cols + ty - 1];
        new_val += conv_kernel5x5[1][2] * input[(tx - 1) * cols + ty];
        new_val += conv_kernel5x5[1][3] * input[(tx - 1) * cols + ty + 1];
        new_val += conv_kernel5x5[1][4] * input[(tx - 1) * cols + ty + 2];
        new_val += conv_kernel5x5[2][0] * input[(tx)*cols + ty - 2];
        new_val += conv_kernel5x5[2][1] * input[(tx)*cols + ty - 1];
        new_val += conv_kernel5x5[2][2] * input[(tx)*cols + ty];
        new_val += conv_kernel5x5[2][3] * input[(tx)*cols + ty + 1];
        new_val += conv_kernel5x5[2][4] * input[(tx)*cols + ty + 2];

        new_val += conv_kernel5x5[3][0] * input[(tx + 1) * cols + ty - 2];
        new_val += conv_kernel5x5[3][1] * input[(tx + 1) * cols + ty - 1];
        new_val += conv_kernel5x5[3][2] * input[(tx + 1) * cols + ty];
        new_val += conv_kernel5x5[3][3] * input[(tx + 1) * cols + ty + 1];
        new_val += conv_kernel5x5[3][4] * input[(tx + 1) * cols + ty + 2];

        new_val += conv_kernel5x5[4][0] * input[(tx + 2) * cols + ty - 2];
        new_val += conv_kernel5x5[4][1] * input[(tx + 2) * cols + ty - 1];
        new_val += conv_kernel5x5[4][2] * input[(tx + 2) * cols + ty];
        new_val += conv_kernel5x5[4][3] * input[(tx + 2) * cols + ty + 1];
        new_val += conv_kernel5x5[4][4] * input[(tx + 2) * cols + ty + 2];

        input[threadId] = new_val / 273;
    }
}

__global__ void k_1D_gf_7x7_default(unsigned char* input, int rows, int cols)
{
    int ty = blockIdx.x * blockDim.x + threadIdx.x;
    int tx = blockIdx.y * blockDim.y + threadIdx.y;
    int threadId = (tx * cols + ty);

    unsigned char conv_kernel7x7[7][7] = { {0, 0, 1, 2, 1, 0,  0},
                                        {0,  3, 13, 22, 13, 3, 0},
                                        {1, 13, 59, 97, 59, 13, 1},
                                        {2, 22, 97, 159, 97, 22, 2},
                                        {1, 13, 59, 97, 59, 13, 1},
                                        {0,  3, 13, 22, 13, 3, 0},
                                        {0,  0,  1, 2, 1, 0, 0} };
    int new_val = 0;

    if ((tx > 2 && tx < rows - 3) && (ty > 2 && ty < cols - 3)) {
        new_val += conv_kernel7x7[0][0] * input[(tx - 3) * cols + ty - 3];
        new_val += conv_kernel7x7[0][1] * input[(tx - 3) * cols + ty - 2];
        new_val += conv_kernel7x7[0][2] * input[(tx - 3) * cols + ty - 1];
        new_val += conv_kernel7x7[0][3] * input[(tx - 3) * cols + ty];
        new_val += conv_kernel7x7[0][4] * input[(tx - 3) * cols + ty + 1];
        new_val += conv_kernel7x7[0][5] * input[(tx - 3) * cols + ty + 2];
        new_val += conv_kernel7x7[0][6] * input[(tx - 3) * cols + ty + 3];

        new_val += conv_kernel7x7[1][0] * input[(tx - 2) * cols + ty - 3];
        new_val += conv_kernel7x7[1][1] * input[(tx - 2) * cols + ty - 2];
        new_val += conv_kernel7x7[1][2] * input[(tx - 2) * cols + ty - 1];
        new_val += conv_kernel7x7[1][3] * input[(tx - 2) * cols + ty];
        new_val += conv_kernel7x7[1][4] * input[(tx - 2) * cols + ty + 1];
        new_val += conv_kernel7x7[1][5] * input[(tx - 2) * cols + ty + 2];
        new_val += conv_kernel7x7[1][6] * input[(tx - 2) * cols + ty + 3];
                
        new_val += conv_kernel7x7[2][0] * input[(tx - 1) * cols + ty - 3];
        new_val += conv_kernel7x7[2][1] * input[(tx - 1) * cols + ty - 2];
        new_val += conv_kernel7x7[2][2] * input[(tx - 1) * cols + ty - 1];
        new_val += conv_kernel7x7[2][3] * input[(tx - 1) * cols + ty];
        new_val += conv_kernel7x7[2][4] * input[(tx - 1) * cols + ty + 1];
        new_val += conv_kernel7x7[2][5] * input[(tx - 1) * cols + ty + 2];
        new_val += conv_kernel7x7[2][6] * input[(tx - 1) * cols + ty + 3];
                
        new_val += conv_kernel7x7[3][0] * input[(tx) * cols + ty - 3];
        new_val += conv_kernel7x7[3][1] * input[(tx) * cols + ty - 2];
        new_val += conv_kernel7x7[3][2] * input[(tx) * cols + ty - 1];
        new_val += conv_kernel7x7[3][3] * input[(tx) * cols + ty];
        new_val += conv_kernel7x7[3][4] * input[(tx) * cols + ty + 1];
        new_val += conv_kernel7x7[3][5] * input[(tx) * cols + ty + 2];
        new_val += conv_kernel7x7[3][6] * input[(tx) * cols + ty + 3];
                
        new_val += conv_kernel7x7[4][0] * input[(tx + 1) * cols + ty - 3];
        new_val += conv_kernel7x7[4][1] * input[(tx + 1) * cols + ty - 2];
        new_val += conv_kernel7x7[4][2] * input[(tx + 1) * cols + ty - 1];
        new_val += conv_kernel7x7[4][3] * input[(tx + 1) * cols + ty];
        new_val += conv_kernel7x7[4][4] * input[(tx + 1) * cols + ty + 1];
        new_val += conv_kernel7x7[4][5] * input[(tx + 1) * cols + ty + 2];
        new_val += conv_kernel7x7[4][6] * input[(tx + 1) * cols + ty + 3];
                
        new_val += conv_kernel7x7[5][0] * input[(tx + 2) * cols + ty - 3];
        new_val += conv_kernel7x7[5][1] * input[(tx + 2) * cols + ty - 2];
        new_val += conv_kernel7x7[5][2] * input[(tx + 2) * cols + ty - 1];
        new_val += conv_kernel7x7[5][3] * input[(tx + 2) * cols + ty];
        new_val += conv_kernel7x7[5][4] * input[(tx + 2) * cols + ty + 1];
        new_val += conv_kernel7x7[5][5] * input[(tx + 2) * cols + ty + 2];
        new_val += conv_kernel7x7[5][6] * input[(tx + 2) * cols + ty + 3];
                
        new_val += conv_kernel7x7[6][0] * input[(tx + 3) * cols + ty - 3];
        new_val += conv_kernel7x7[6][1] * input[(tx + 3) * cols + ty - 2];
        new_val += conv_kernel7x7[6][2] * input[(tx + 3) * cols + ty - 1];
        new_val += conv_kernel7x7[6][3] * input[(tx + 3) * cols + ty];
        new_val += conv_kernel7x7[6][4] * input[(tx + 3) * cols + ty + 1];
        new_val += conv_kernel7x7[6][5] * input[(tx + 3) * cols + ty + 2];
        new_val += conv_kernel7x7[6][6] * input[(tx + 3) * cols + ty + 3];

        input[threadId] = new_val / 1003;
    }
}

__constant__ unsigned char dev_const_conv_kernel3x3[3][3];
__constant__ unsigned char dev_const_conv_kernel5x5[5][5];
__constant__ unsigned char dev_const_conv_kernel7x7[7][7];

__global__ void k_1D_gf_3x3_constant(unsigned char* input, int rows, int cols)
{
    int ty = blockIdx.x * blockDim.x + threadIdx.x;
    int tx = blockIdx.y * blockDim.y + threadIdx.y;

    int new_val = 0;

    new_val += dev_const_conv_kernel3x3[0][0] * input[(tx - 1) * cols + ty - 1];
    new_val += dev_const_conv_kernel3x3[0][1] * input[(tx - 1) * cols + ty];
    new_val += dev_const_conv_kernel3x3[0][2] * input[(tx - 1) * cols + ty + 1];
    new_val += dev_const_conv_kernel3x3[1][0] * input[tx * cols + ty - 1];
    new_val += dev_const_conv_kernel3x3[1][1] * input[tx * cols + ty];
    new_val += dev_const_conv_kernel3x3[1][2] * input[tx * cols + ty + 1];
    new_val += dev_const_conv_kernel3x3[2][0] * input[(tx + 1) * cols + ty - 1];
    new_val += dev_const_conv_kernel3x3[2][1] * input[(tx + 1) * cols + ty];
    new_val += dev_const_conv_kernel3x3[2][2] * input[(tx + 1) * cols + ty + 1];

    input[tx * cols + ty] = new_val / 16;
}

__global__ void k_1D_gf_5x5_constant(unsigned char* input, int rows, int cols)
{
    int ty = blockIdx.x * blockDim.x + threadIdx.x;
    int tx = blockIdx.y * blockDim.y + threadIdx.y;
    int threadId = (tx * cols + ty);

    int new_val = 0;

    if ((tx > 1 && tx < rows - 2) && (ty > 1 && ty < cols - 2)) {
        new_val += dev_const_conv_kernel5x5[0][0] * input[(tx - 2) * cols + ty - 2];
        new_val += dev_const_conv_kernel5x5[0][1] * input[(tx - 2) * cols + ty - 1];
        new_val += dev_const_conv_kernel5x5[0][2] * input[(tx - 2) * cols + ty];
        new_val += dev_const_conv_kernel5x5[0][3] * input[(tx - 2) * cols + ty + 1];
        new_val += dev_const_conv_kernel5x5[0][4] * input[(tx - 2) * cols + ty + 2];
                   
        new_val += dev_const_conv_kernel5x5[1][0] * input[(tx - 1) * cols + ty - 2];
        new_val += dev_const_conv_kernel5x5[1][1] * input[(tx - 1) * cols + ty - 1];
        new_val += dev_const_conv_kernel5x5[1][2] * input[(tx - 1) * cols + ty];
        new_val += dev_const_conv_kernel5x5[1][3] * input[(tx - 1) * cols + ty + 1];
        new_val += dev_const_conv_kernel5x5[1][4] * input[(tx - 1) * cols + ty + 2];

        new_val += dev_const_conv_kernel5x5[2][0] * input[(tx)*cols + ty - 2];
        new_val += dev_const_conv_kernel5x5[2][1] * input[(tx)*cols + ty - 1];
        new_val += dev_const_conv_kernel5x5[2][2] * input[(tx)*cols + ty];
        new_val += dev_const_conv_kernel5x5[2][3] * input[(tx)*cols + ty + 1];
        new_val += dev_const_conv_kernel5x5[2][4] * input[(tx)*cols + ty + 2];
                  
        new_val += dev_const_conv_kernel5x5[3][0] * input[(tx + 1) * cols + ty - 2];
        new_val += dev_const_conv_kernel5x5[3][1] * input[(tx + 1) * cols + ty - 1];
        new_val += dev_const_conv_kernel5x5[3][2] * input[(tx + 1) * cols + ty];
        new_val += dev_const_conv_kernel5x5[3][3] * input[(tx + 1) * cols + ty + 1];
        new_val += dev_const_conv_kernel5x5[3][4] * input[(tx + 1) * cols + ty + 2];
                   
        new_val += dev_const_conv_kernel5x5[4][0] * input[(tx + 2) * cols + ty - 2];
        new_val += dev_const_conv_kernel5x5[4][1] * input[(tx + 2) * cols + ty - 1];
        new_val += dev_const_conv_kernel5x5[4][2] * input[(tx + 2) * cols + ty];
        new_val += dev_const_conv_kernel5x5[4][3] * input[(tx + 2) * cols + ty + 1];
        new_val += dev_const_conv_kernel5x5[4][4] * input[(tx + 2) * cols + ty + 2];

        input[threadId] = new_val / 273;
    }
}

__global__ void k_1D_gf_7x7_constant(unsigned char* input, int rows, int cols)
{
    int ty = blockIdx.x * blockDim.x + threadIdx.x;
    int tx = blockIdx.y * blockDim.y + threadIdx.y;
    int threadId = (tx * cols + ty);

    int new_val = 0;

    if ((tx > 2 && tx < rows - 3) && (ty > 2 && ty < cols - 3)) {
        new_val += dev_const_conv_kernel7x7[0][0] * input[(tx - 3) * cols + ty - 3];
        new_val += dev_const_conv_kernel7x7[0][1] * input[(tx - 3) * cols + ty - 2];
        new_val += dev_const_conv_kernel7x7[0][2] * input[(tx - 3) * cols + ty - 1];
        new_val += dev_const_conv_kernel7x7[0][3] * input[(tx - 3) * cols + ty];
        new_val += dev_const_conv_kernel7x7[0][4] * input[(tx - 3) * cols + ty + 1];
        new_val += dev_const_conv_kernel7x7[0][5] * input[(tx - 3) * cols + ty + 2];
        new_val += dev_const_conv_kernel7x7[0][6] * input[(tx - 3) * cols + ty + 3];
                   
        new_val += dev_const_conv_kernel7x7[1][0] * input[(tx - 2) * cols + ty - 3];
        new_val += dev_const_conv_kernel7x7[1][1] * input[(tx - 2) * cols + ty - 2];
        new_val += dev_const_conv_kernel7x7[1][2] * input[(tx - 2) * cols + ty - 1];
        new_val += dev_const_conv_kernel7x7[1][3] * input[(tx - 2) * cols + ty];
        new_val += dev_const_conv_kernel7x7[1][4] * input[(tx - 2) * cols + ty + 1];
        new_val += dev_const_conv_kernel7x7[1][5] * input[(tx - 2) * cols + ty + 2];
        new_val += dev_const_conv_kernel7x7[1][6] * input[(tx - 2) * cols + ty + 3];
                   
        new_val += dev_const_conv_kernel7x7[2][0] * input[(tx - 1) * cols + ty - 3];
        new_val += dev_const_conv_kernel7x7[2][1] * input[(tx - 1) * cols + ty - 2];
        new_val += dev_const_conv_kernel7x7[2][2] * input[(tx - 1) * cols + ty - 1];
        new_val += dev_const_conv_kernel7x7[2][3] * input[(tx - 1) * cols + ty];
        new_val += dev_const_conv_kernel7x7[2][4] * input[(tx - 1) * cols + ty + 1];
        new_val += dev_const_conv_kernel7x7[2][5] * input[(tx - 1) * cols + ty + 2];
        new_val += dev_const_conv_kernel7x7[2][6] * input[(tx - 1) * cols + ty + 3];
                  
        new_val += dev_const_conv_kernel7x7[3][0] * input[(tx)*cols + ty - 3];
        new_val += dev_const_conv_kernel7x7[3][1] * input[(tx)*cols + ty - 2];
        new_val += dev_const_conv_kernel7x7[3][2] * input[(tx)*cols + ty - 1];
        new_val += dev_const_conv_kernel7x7[3][3] * input[(tx)*cols + ty];
        new_val += dev_const_conv_kernel7x7[3][4] * input[(tx)*cols + ty + 1];
        new_val += dev_const_conv_kernel7x7[3][5] * input[(tx)*cols + ty + 2];
        new_val += dev_const_conv_kernel7x7[3][6] * input[(tx)*cols + ty + 3];
                   
        new_val += dev_const_conv_kernel7x7[4][0] * input[(tx + 1) * cols + ty - 3];
        new_val += dev_const_conv_kernel7x7[4][1] * input[(tx + 1) * cols + ty - 2];
        new_val += dev_const_conv_kernel7x7[4][2] * input[(tx + 1) * cols + ty - 1];
        new_val += dev_const_conv_kernel7x7[4][3] * input[(tx + 1) * cols + ty];
        new_val += dev_const_conv_kernel7x7[4][4] * input[(tx + 1) * cols + ty + 1];
        new_val += dev_const_conv_kernel7x7[4][5] * input[(tx + 1) * cols + ty + 2];
        new_val += dev_const_conv_kernel7x7[4][6] * input[(tx + 1) * cols + ty + 3];
                   
        new_val += dev_const_conv_kernel7x7[5][0] * input[(tx + 2) * cols + ty - 3];
        new_val += dev_const_conv_kernel7x7[5][1] * input[(tx + 2) * cols + ty - 2];
        new_val += dev_const_conv_kernel7x7[5][2] * input[(tx + 2) * cols + ty - 1];
        new_val += dev_const_conv_kernel7x7[5][3] * input[(tx + 2) * cols + ty];
        new_val += dev_const_conv_kernel7x7[5][4] * input[(tx + 2) * cols + ty + 1];
        new_val += dev_const_conv_kernel7x7[5][5] * input[(tx + 2) * cols + ty + 2];
        new_val += dev_const_conv_kernel7x7[5][6] * input[(tx + 2) * cols + ty + 3];
                   
        new_val += dev_const_conv_kernel7x7[6][0] * input[(tx + 3) * cols + ty - 3];
        new_val += dev_const_conv_kernel7x7[6][1] * input[(tx + 3) * cols + ty - 2];
        new_val += dev_const_conv_kernel7x7[6][2] * input[(tx + 3) * cols + ty - 1];
        new_val += dev_const_conv_kernel7x7[6][3] * input[(tx + 3) * cols + ty];
        new_val += dev_const_conv_kernel7x7[6][4] * input[(tx + 3) * cols + ty + 1];
        new_val += dev_const_conv_kernel7x7[6][5] * input[(tx + 3) * cols + ty + 2];
        new_val += dev_const_conv_kernel7x7[6][6] * input[(tx + 3) * cols + ty + 3];

        input[threadId] = new_val / 1003;
    }
}

float gf_1d_gpu(cv::Mat* output_img, GAUSSIAN ver)
{
    unsigned char* gpu_input = nullptr;
    unsigned char* output = output_img->data;

    unsigned int cols = (*output_img).cols;
    unsigned int rows = (*output_img).rows;
    unsigned int size = cols * rows * sizeof(unsigned char);

    unsigned char conv_kernel3x3[3][3] = { {1, 2, 1}, {2, 4, 2}, {1, 2, 1} };
    unsigned char conv_kernel5x5[5][5] = {{1, 4,  7,  4,  1},
                                        {4, 16, 26, 16, 4},
                                        {7, 26, 41, 26, 7},
                                        {4, 16, 26, 16, 4},
                                        {1, 4,  7,  4,  1}};
    unsigned char conv_kernel7x7[7][7] = {{0, 0, 1, 2, 1, 0,  0},
                                        {0,  3, 13, 22, 13, 3, 0},
                                        {1, 13, 59, 97, 59, 13, 1},
                                        {2, 22, 97, 159, 97, 22, 2},
                                        {1, 13, 59, 97, 59, 13, 1},
                                        {0,  3, 13, 22, 13, 3, 0},
                                        {0,  0,  1, 2, 1, 0, 0}};


    dim3 block(32, 32);
    dim3 grid((cols + block.x - 1) / block.x, (rows + block.y - 1) / block.y);
    dim3 grid2(((cols / 2) + block.x - 1) / block.x, (rows + block.y - 1) / block.y);
    dim3 grid3(((cols / 3) + block.x - 1) / block.x, (rows + block.y - 1) / block.y);
    dim3 grid4(((cols / 4) + block.x - 1) / block.x, (rows + block.y - 1) / block.y);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);

    cudaHostRegister(output, size, cudaHostRegisterPortable);
    CHECK_CUDA_ERROR(cudaMalloc((unsigned char**)&gpu_input, size));
    CHECK_CUDA_ERROR(cudaMemcpy(gpu_input, output, size, cudaMemcpyHostToDevice));

    switch (ver)
    {
    default:
        break;
    case GAUSSIAN_3x3_default:
        k_1D_gf_3x3_default << <grid, block >> > (gpu_input, rows, cols);
        break;
    case GAUSSIAN_5x5_default:
        k_1D_gf_5x5_default << <grid, block >> > (gpu_input, rows, cols);
        break;
    case GAUSSIAN_7x7_default:
        k_1D_gf_7x7_default << <grid, block >> > (gpu_input, rows, cols);
        break;
    case GAUSSIAN_3x3_constant:
        CHECK_CUDA_ERROR(cudaMemcpyToSymbol(dev_const_conv_kernel3x3, conv_kernel3x3, sizeof(unsigned char) * 3 * 3));
        k_1D_gf_3x3_constant << <grid, block >> > (gpu_input, rows, cols);
        break;
    case GAUSSIAN_5x5_constant:
        CHECK_CUDA_ERROR(cudaMemcpyToSymbol(dev_const_conv_kernel5x5, conv_kernel5x5, sizeof(unsigned char) * 5 * 5));
        k_1D_gf_5x5_constant << <grid, block >> > (gpu_input, rows, cols);
        break;
    case GAUSSIAN_7x7_constant:
        CHECK_CUDA_ERROR(cudaMemcpyToSymbol(dev_const_conv_kernel7x7, conv_kernel7x7, sizeof(unsigned char) * 7 * 7));
        k_1D_gf_7x7_constant << <grid, block >> > (gpu_input, rows, cols);
        break;
    }
    CHECK_CUDA_ERROR(cudaMemcpy(output, gpu_input, size, cudaMemcpyDeviceToHost));

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float elapsed = 0.0f;
    cudaEventElapsedTime(&elapsed, start, stop);

    cudaHostUnregister(output);
    cudaFree(gpu_input);
    cudaDeviceReset();
    return elapsed;
}

int main(){
...
..
.

    gray_input_images.push_back(cv::imread(img_path256, cv::IMREAD_GRAYSCALE));
    gray_input_images.push_back(cv::imread(img_path512, cv::IMREAD_GRAYSCALE));
    gray_input_images.push_back(cv::imread(img_path1024, cv::IMREAD_GRAYSCALE));
    gray_input_images.push_back(cv::imread(img_path2048, cv::IMREAD_GRAYSCALE));
    gray_input_images.push_back(cv::imread(img_path4096, cv::IMREAD_GRAYSCALE));
    gray_input_images.push_back(cv::imread(img_path8192, cv::IMREAD_GRAYSCALE));

    float elapsed = 0.0f;

    std::vector <cv::Mat> output1;
    std::vector <cv::Mat> output2;
    std::vector <cv::Mat> output3;
    std::vector <cv::Mat> output4;
    std::vector <cv::Mat> output5;
    std::vector <cv::Mat> output6;
    std::vector <cv::Mat> output7;

    for (cv::Mat& e: gray_input_images) {
        output1.push_back(e.clone());
        output2.push_back(e.clone());
        output3.push_back(e.clone());
        output4.push_back(e.clone());
        output5.push_back(e.clone());
        output6.push_back(e.clone());
        output7.push_back(e.clone());
    }

    for (cv::Mat& e : output2) {
        elapsed += gf_1d_gpu(&e, GAUSSIAN_3x3_default);
    }

    for (cv::Mat& e : output1) {
        elapsed += gf_1d_gpu(&e, GAUSSIAN_5x5_default);
    }


    for (cv::Mat& e : output3) {
        elapsed += gf_1d_gpu(&e, GAUSSIAN_7x7_default);
    }
.
..
...

And this is the Nsight Compute output:

I have tried loop unrolling and converting images to float. Conversion didn't help me in both speed up or better constant memory usage. But as i expected loop unrolling increased the speed up i gain from constant memory.

Original Q&A

TechQA.

Disadvantage of Constant Memory for Large Convolution Matrixes

There are 0 answers

Related Questions in CUDA

Related Questions in CONVOLUTION

Related Questions in GPU-CONSTANT-MEMORY

Popular Questions

Trending Questions