We have a single, fairly large kernel (see below) that we fire off with a grid and block of 1,1:
kernel<<<1,1>>>
It then dynamically fires off lots of smaller kernels. In general, the data flows from kernel to kernel, with the input entering at the first kernel and flowing through to the end.
But we have identified an opportunity to overlap two streams of data, each running this identical kernel.
Question: Do we have to abandon dynamic parallelism and go to a host-based approach to get execution overlap between two mega-kernels? Or is the scheduler on the card smart enough to interleave execution of the two mega-kernels and process each one as a separate scheduled item?
We are talking about a Tesla K80. Linux host.
(Yes, we will gain some overlap from cudaMemcpyAsync() calls overlapping with execution, but we would like to see some kernel execution overlap as well.)
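To make that concrete, what we have in mind is something along the lines of the sketch below (illustrative only; the stream handles are made up, and in reality gPipeline would need to be parameterized so each launch works on its own channel's data):
/* Illustrative sketch only -- not our production code. The idea is to launch
   the same <<<1,1>>> mega-kernel once per data stream and let the two parent
   kernels (and their dynamically launched children) overlap on the device. */
cudaStream_t s0, s1;
cudaStreamCreate(&s0);
cudaStreamCreate(&s1);

gPipeline<<<1, 1, 0, s0>>>(gpu);  /* first data stream  */
gPipeline<<<1, 1, 0, s1>>>(gpu);  /* second data stream */

cudaDeviceSynchronize();
The current (single-stream) kernel code is below.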
#include <cuda.h>
#include <cuda_runtime.h>
#include "coss_types.h"
#include "image.h"
#include "centroid.h"
#include "gpu.h"
#define GPU_TILE_WIDTH 16
#define GPU_TILE_HEIGHT 16
#define GPU_TILE_WBIG 32
#define GPU_TILE_HBIG 32
#define K_IMG_MAX 1024
__constant__ unsigned short* pFrameStack[GPU_CHX];
__constant__ unsigned short* pBackground[GPU_CHX];
__constant__ short* pCleanground[GPU_CHX];
__constant__ unsigned char* pMask[GPU_CHX];
__constant__ float* pForeground[GPU_CHX];
__constant__ float* pLowground[GPU_CHX];
__constant__ float* pLowgroundRow[GPU_CHX];
__constant__ float* pHighground[GPU_CHX];
__constant__ float* pHighgroundRow[GPU_CHX];
__constant__ float* pMins[GPU_CHX];
__constant__ float* pMaxs[GPU_CHX];
__constant__ int gSlot;
__constant__ int* pPercentile[GPU_CHX];
__constant__ int* pLabels1[GPU_CHX];
__constant__ int* pLabels2[GPU_CHX];
__constant__ int* pRawLabels[GPU_CHX];
__constant__ int* pLabels[GPU_CHX];
__constant__ ImgInfoBlock_t* pInfo[GPU_CHX];
__constant__ unsigned short* pSums[GPU_CHX];
__constant__ unsigned short* pBlockSums[GPU_CHX];
__constant__ ImgCentroid_t* pCenters[GPU_CHX];
__constant__ float threshold_sigma = 9.0f;
/* INCLUDED GENERATED CUDA CODE BELOW */
#include "cuda.cu"
/* INCLUDED GENERATED CUDA CODE ABOVE */
extern "C" __device__ void Background(int ch)
{
dim3 block;
dim3 grid;
/* Background Estimation */
block = dim3(128);
grid = dim3((IMG_PIXELS)/256); /* Only half screens at a time */
gMedian<<<grid,block>>>(
pFrameStack[ch],GPU_STACKSIZE,IMG_PIXELS,
pBackground[ch],IMG_HEIGHT,IMG_WIDTH,gSlot);
cudaDeviceSynchronize();
/* Background Removal */
block = dim3(128);
grid = dim3((IMG_PIXELS)/128);
gScrub<<<grid,block>>>(
pFrameStack[ch],GPU_STACKSIZE,IMG_PIXELS,
pBackground[ch],IMG_HEIGHT,IMG_WIDTH,
pCleanground[ch],IMG_HEIGHT,IMG_WIDTH,gSlot);
cudaDeviceSynchronize();
}
extern "C" __device__ void Convolution(int ch)
{
dim3 block;
dim3 grid;
dim3 block_b;
dim3 grid_b;
/* Convolve Rows */
block = dim3(GPU_TILE_WIDTH,GPU_TILE_HEIGHT);
grid = dim3(IMG_WIDTH/GPU_TILE_WIDTH,IMG_HEIGHT/GPU_TILE_HEIGHT);
gConvolveRow<<<grid,block>>>(
pCleanground[ch], IMG_HEIGHT,IMG_WIDTH,
pLowgroundRow[ch], IMG_HEIGHT,IMG_WIDTH);
block_b = dim3(GPU_TILE_WBIG,GPU_TILE_HBIG);
grid_b = dim3(IMG_WIDTH/GPU_TILE_WBIG,IMG_HEIGHT/GPU_TILE_HBIG);
gConvolveBigRow<<<grid_b,block_b>>>(
pCleanground[ch], IMG_HEIGHT,IMG_WIDTH,
pHighgroundRow[ch], IMG_HEIGHT,IMG_WIDTH);
/* Convolve Cols */
cudaDeviceSynchronize();
gConvolveCol<<<grid,block>>>(
pLowgroundRow[ch], IMG_HEIGHT,IMG_WIDTH,
pLowground[ch], IMG_HEIGHT,IMG_WIDTH);
gConvolveBigCol<<<grid_b,block_b>>>(
pHighgroundRow[ch], IMG_HEIGHT,IMG_WIDTH,
pHighground[ch], IMG_HEIGHT,IMG_WIDTH);
/* Band pass */
cudaDeviceSynchronize();
block = dim3(256,4);
grid = dim3(IMG_WIDTH / 256, IMG_HEIGHT / 4);
gBpass<<<grid,block>>>(
pLowground[ch], IMG_HEIGHT,IMG_WIDTH,
pHighground[ch], IMG_HEIGHT,IMG_WIDTH,
pForeground[ch], IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Threshold(int ch)
{
dim3 block;
dim3 grid;
/* Set the calibration sigma in Info Block */
pInfo[ch]->sigma = threshold_sigma;
/* Min Max kernels */
block = dim3(512, 2);
grid = dim3(IMG_WIDTH / 512, IMG_HEIGHT / 2);
gMinMax<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pMins[ch], 5 * K_IMG_MAX,
pMaxs[ch], 5 * K_IMG_MAX);
cudaDeviceSynchronize();
block = dim3(K_IMG_MAX);
grid = dim3(1);
gMinMaxMinMax<<<grid,K_IMG_MAX>>>(
pMins[ch], 5 * K_IMG_MAX,
pMaxs[ch], 5 * K_IMG_MAX,
(struct PipeInfoBlock*)pInfo[ch],1);
/* Histogram */
cudaDeviceSynchronize();
block = dim3(GPU_TILE_WBIG,GPU_TILE_HBIG);
grid = dim3(IMG_WIDTH/GPU_TILE_WBIG,IMG_HEIGHT/GPU_TILE_HBIG);
gHistogram<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pPercentile[ch],K_IMG_MAX,
(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
block = dim3(K_IMG_MAX);
grid = dim3(1);
gSumHistogram<<<grid,block>>>(pPercentile[ch],K_IMG_MAX);
cudaDeviceSynchronize();
gIQR<<<grid,block>>>(pPercentile[ch],K_IMG_MAX,(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
block = dim3(256,4);
grid = dim3(IMG_WIDTH / 256, IMG_HEIGHT / 4);
gThreshold<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
}
extern "C" __device__ void Gluing(int ch)
{
dim3 block;
dim3 grid;
block = dim3(24, 24);
grid = dim3(IMG_WIDTH / 16, IMG_HEIGHT / 16);
gGlue<<<grid, block>>>(
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
pMask[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Labeling(int ch)
{
dim3 block;
dim3 grid;
/* CCL */
//block = dim3(1, 128);
//grid = dim3(1, IMG_HEIGHT / 128);
block = dim3(256,1);
grid = dim3(IMG_WIDTH/256,IMG_HEIGHT);
gCCL0<<<grid, block>>>(
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
block = dim3(24, 24);
grid = dim3(IMG_WIDTH / 16, IMG_HEIGHT / 16);
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Relabeling(int ch)
{
dim3 block;
dim3 grid;
/* Relabel */
block = dim3(160, 1);
grid = dim3(IMG_WIDTH / 160, IMG_HEIGHT / 1);
gScan<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pSums[ch],IMG_PIXELS);
cudaDeviceSynchronize();
grid = dim3(IMG_PIXELS / K_IMG_MAX);
gSum<<<grid, K_IMG_MAX>>>(pSums[ch],IMG_PIXELS, pBlockSums[ch], 5*K_IMG_MAX);
cudaDeviceSynchronize();
grid = dim3(1);
gSumBlocks<<<grid, K_IMG_MAX>>>(pBlockSums[ch], 5*K_IMG_MAX, (struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
grid = dim3(IMG_PIXELS / K_IMG_MAX);
gFixSums<<<grid, K_IMG_MAX>>>(pSums[ch],IMG_PIXELS, pBlockSums[ch], 5*K_IMG_MAX);
cudaDeviceSynchronize();
block = dim3(160, 1);
grid = dim3(IMG_WIDTH / 160, IMG_HEIGHT / 1);
gRelabeler<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pSums[ch],IMG_PIXELS,
pLabels[ch], IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Centroiding(int ch)
{
dim3 block;
dim3 grid;
int starcount = IMG_STARS_MAX;
if (pInfo[ch]->starCount > 0 && pInfo[ch]->starCount < IMG_STARS_MAX)
{
starcount = pInfo[ch]->starCount;
/* Centroid */
block = dim3(32, 32);
grid = dim3(IMG_WIDTH / 32, IMG_HEIGHT / 32);
gCentroid<<<grid, block>>>(
pLabels[ch], IMG_HEIGHT,IMG_WIDTH,
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
(PipeCentroid *)pCenters[ch],starcount);
cudaDeviceSynchronize();
block = dim3(starcount);
gCentroidFinal<<<1, block>>>((PipeCentroid *)pCenters[ch],starcount);
cudaDeviceSynchronize();
}
else
{
pInfo[ch]->starCount = 0;
}
}
extern "C" __global__ void gPipeline(int gpuId)
{ int ch;
for(ch=0; ch < GPU_CHX; ch++)
{
Background(ch);
Convolution(ch);
Threshold(ch);
Gluing(ch);
Labeling(ch);
Relabeling(ch);
Centroiding(ch);
}
}
extern "C" {
static void ImgKernel_ClearBuffers(int32_t gpu, int32_t ch)
{
/* Clear Work Buffers */
cudaMemset(gInfo[gpu][ch],0,(int)sizeof(ImgInfoBlock_t));
cudaMemset(gCenters[gpu][ch],0,(int)sizeof(ImgCentroid_t)*IMG_STARS_MAX);
cudaMemset(gPercentile[gpu][ch],0,(int)sizeof(int32_t)*K_IMG_MAX);
cudaMemset(gLabels1[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gLabels2[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gRawLabels[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gSums[gpu][ch],0,(int)IMG_BYTES);
cudaMemset(gBlockSums[gpu][ch],0,(int)sizeof(uint16_t)*5*K_IMG_MAX);
}
void ImgKernel_Pipeline(int gpu)
{
cudaSetDevice(gpu);
cudaDeviceSynchronize();
/* Start a new run by clearing the buffers */
ImgKernel_ClearBuffers(gpu,GPU_CH0);
ImgKernel_ClearBuffers(gpu,GPU_CH1);
/* Update Constants */
cudaMemcpyToSymbol(gSlot,(void*)&slot,sizeof(slot));
cudaMemcpyToSymbol(threshold_sigma,(void*)&sigmaThreshold,sizeof(sigmaThreshold));
/* Start the next pipeline kernel */
gPipeline<<<1,1>>>(gpu);
}
#define LFILTER_LEN 15
static float lFilter[LFILTER_LEN] = { .0009f, .01f,
.02f, .05f, .08f, .10f, .1325f, .1411f, .1325f, .10f, .08f, .05f, .02f, .01f, .0009f };
#define HFILTER_LEN 31
static float hFilter[HFILTER_LEN] = {0.0002f, 0.0006f,
0.0025f, 0.0037f, 0.0053f, 0.0074f, 0.0099f, 0.0130f, 0.0164f,
0.0201f, 0.0239f, 0.0275f, 0.0306f, 0.0331f, 0.0347f, 0.0353f,
0.0347f, 0.0331f, 0.0306f, 0.0275f, 0.0239f, 0.0201f, 0.0164f,
0.0130f, 0.0099f, 0.0074f, 0.0053f, 0.0037f, 0.0025f, 0.0006f, 0.0002f};
static float32_t kernel[LFILTER_LEN];
static float32_t kernelBig[HFILTER_LEN];
static inline float32_t ImgKernel_FilterSum(float* arr, int32_t len)
{
int32_t i;
float32_t sum = 0.0f;
for (i=0;i<len;i++) sum += arr[i];
return sum;
}
void ImgKernel_Setup(int gpu)
{
int32_t i,ch;
float32_t sum = 0;
sum = ImgKernel_FilterSum(lFilter,LFILTER_LEN);
for (i = 0; i < LFILTER_LEN; i++) kernel[i] = lFilter[i] / sum;
sum = ImgKernel_FilterSum(hFilter,HFILTER_LEN);
for (i = 0; i < HFILTER_LEN; i++) kernelBig[i] = hFilter[i] / sum;
/* One time copy of locations into GPU constant memory */
cudaMemcpyToSymbol(gkernel, (void*)&kernel, sizeof(float32_t)*LFILTER_LEN);
cudaMemcpyToSymbol(gkernelBig, (void*)&kernelBig, sizeof(float32_t)*HFILTER_LEN);
cudaMemcpyToSymbol(pFrameStack,(void*)&gFrameStack[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pBackground,(void*)&gBackground[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pCleanground,(void*)&gCleanground[gpu][0], sizeof(int16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLowground, (void*)&gLowground[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLowgroundRow,(void*)&gLowgroundRow[gpu][0],sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pHighground,(void*)&gHighground[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pHighgroundRow,(void*)&gHighgroundRow[gpu][0],sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pForeground,(void*)&gForeground[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pMask, (void*)&gMask[gpu][0], sizeof(uint8_t*)*GPU_CHX);
cudaMemcpyToSymbol(pPercentile,(void*)&gPercentile[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pMins, (void*)&gMins[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pMaxs, (void*)&gMaxs[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLabels1, (void*)&gLabels1[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLabels2, (void*)&gLabels2[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pRawLabels, (void*)&gRawLabels[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLabels, (void*)&gLabels[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pInfo, (void*)&gInfo[gpu][0], sizeof(ImgInfoBlock_t*)*GPU_CHX);
cudaMemcpyToSymbol(pSums, (void*)&gSums[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pBlockSums, (void*)&gBlockSums[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pCenters, (void*)&gCenters[gpu][0], sizeof(ImgCentroid_t*)*GPU_CHX);
for (ch = 0; ch < GPU_CHX; ch++)
{
/* Clear the working buffers */
ImgKernel_ClearBuffers(gpu,ch);
}
}
}
It should be possible for the parent and child kernels of two dynamic parallelism kernels launched into separate host streams to all be co-resident (i.e. executing concurrently).
How to get things to run concurrently is a common question. Even once all the requirements have been met, whether or not you actually witness concurrent kernel execution will be a matter of the resources consumed by each kernel: threads per block, total threadblocks, registers, and shared memory are a few examples of the types of resources that, if consumed by one kernel, may prevent the concurrent execution of another kernel.
The machine does not have infinite capacity. Once the capacity of the machine has been consumed, exposing additional parallelism (e.g. by attempting to launch independent kernels concurrently) might not yield any improvement.
GPU scheduling behavior may affect this as well, as pointed out by Greg. Depending on the specific GPU, the CUDA version, and perhaps other factors, two kernels with large numbers of threadblocks may not execute "concurrently" simply because the threadblocks of one kernel may all be scheduled before any of the threadblocks of the other kernel are scheduled. In my opinion, this behavior is simply another manifestation of a resource issue. (Also note that scheduling of the threadblocks of individual kernels may be affected by stream priorities, as sketched below.)
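(As an aside, if you want to experiment with stream priorities, the runtime calls look like the sketch below; the variable names are only illustrative. Numerically lower priority values mean higher priority.)
/* Query the allowed priority range, then create one high- and one
   low-priority stream. Names here are illustrative only. */
int leastPriority, greatestPriority;
cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);

cudaStream_t hiPrio, loPrio;
cudaStreamCreateWithPriority(&hiPrio, cudaStreamNonBlocking, greatestPriority); /* higher priority */
cudaStreamCreateWithPriority(&loPrio, cudaStreamNonBlocking, leastPriority);    /* lower priority  */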
However, if we are careful to constrain resource usage, it is possible for the parent and child kernels of two dynamic parallelism kernels to be co-resident, i.e. to execute concurrently. Here's a worked example (CUDA 7, Fedora 20, GeForce GT640 cc3.5 GPU):
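(The original test code isn't reproduced here; the following is a minimal sketch of the kind of test I ran. The kernel names, the clock64()-based spin, and the delay constant are illustrative, and the delay would need tuning for your GPU's clock rate.)
#include <cstdio>
#include <cuda_runtime.h>

/* Child kernel: spin for roughly delay_clocks GPU clock cycles. */
__global__ void childKernel(unsigned long long delay_clocks)
{
    unsigned long long start = clock64();
    while (clock64() - start < delay_clocks) { /* busy wait */ }
}

/* Parent kernel: one block of one thread, launches the child via dynamic
   parallelism and waits for it with a device-side synchronize (cc 3.5+). */
__global__ void parentKernel(unsigned long long delay_clocks)
{
    childKernel<<<1, 1>>>(delay_clocks);
    cudaDeviceSynchronize();
}

int main(int argc, char *argv[])
{
    /* A few seconds of spinning on a ~1 GHz part; tune for your clock rate. */
    const unsigned long long delay_clocks = 5000000000ULL;

    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);

    /* Launch one parent by default, or two parents in separate streams if
       any command-line argument is given. */
    parentKernel<<<1, 1, 0, s1>>>(delay_clocks);
    if (argc > 1)
        parentKernel<<<1, 1, 0, s2>>>(delay_clocks);

    cudaDeviceSynchronize();
    printf("done: %s\n", cudaGetErrorString(cudaGetLastError()));

    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);
    return 0;
}
Compile with relocatable device code, which dynamic parallelism requires (e.g. nvcc -arch=sm_35 -rdc=true ... -lcudadevrt), then time a run that makes one parent launch against a run that makes two (pass any argument to get the second launch). If the two-launch wall-clock time is about the same as the one-launch time, the parents and their children overlapped.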
In this case we see that if I don't use cuda-memcheck, then regardless of whether I run one or two copies of the (parent) kernel in separate host streams, the execution time is approximately the same (~5.6s). Since the execution time is the same, the inescapable conclusion is that these kernels are executing concurrently (both parent kernels, and both child kernels). This isn't too surprising, since these kernels have tiny resource usage: one threadblock each, of one thread each, with very low register usage and no shared memory usage. On the other hand, if I run the same test with cuda-memcheck, there is evident serialization: although the time for a single kernel launch is relatively unaffected, the time for two "concurrent" kernel launches is approximately double.