How to show nested CUDA kernel calls in Nsight Systems


Imagine that I have two kernels like this:

__global__ void kernel_2(...) {
  // do something
}
__global__ void kernel_1(...) {
  kernel_2<<<n, m>>>(...);
  __syncthreads();
}
int main() {
  kernel_1<<<x, y>>>(...);
  cudaDeviceSynchronize();
}

Everything works, no problem. But when I check the running times of these two kernels in Nsight Systems, I only see the running time of kernel_1 followed by an empty gap. I assume that gap is where kernel_2 runs, but kernel_2 itself is not shown there.

Is there any way to make it show up there?


1 Answer

Answered by Abator Abetor

It is possible to use NVTX to manually mark the begin and end of kernels launched from the device. This will not be as accurate as profiling a stand-alone kernel, but it can give you approximate running times.
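
For a kernel launched from the host, the usual pattern is to push a range before the launch and pop it after a synchronize, so that the range spans the kernel's execution on the timeline. A minimal sketch (the kernel and range names are just illustrative):

#include <nvToolsExt.h>

__global__ void some_kernel() { /* work */ }

int main() {
    nvtxRangePushA("some_kernel");   // open a named range on the host timeline
    some_kernel<<<1, 1>>>();
    cudaDeviceSynchronize();         // the range covers launch + execution because of the sync
    nvtxRangePop();                  // close the range
}

NVTX is a host-side API, so for a kernel launched from the device the host first has to learn when that kernel starts and finishes; that is what the latches in the code below are for.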

The following code uses a cuda::latch in managed memory to communicate the begin and end of the device-launched kernel to the host. The host then inserts a custom range into the nsys profile. I used CUDA 12.3.

//nvcc -O3 -arch=sm_86 -std=c++17 -lineinfo -g -rdc=true -lnvToolsExt main.cu -o main
#include <iostream>
#include <cstring>
#include <string>

#include <nvToolsExt.h>
#include <cuda/latch>

// system-scope latch so that device threads and the host thread can synchronize through managed memory
using MyLatch = cuda::latch<cuda::thread_scope_system>;

void push_range(const std::string& name){
    nvtxEventAttributes_t eventAttrib;
    std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
    eventAttrib.version = NVTX_VERSION;
    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
    eventAttrib.message.ascii = name.c_str();
    nvtxRangePushEx(&eventAttrib);
}

void pop_range(){
    nvtxRangePop();
}


__global__ 
void kernel_2(MyLatch* latch1, MyLatch* latch2) {
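    // latch1 tells the host that kernel_2 has started; latch2 tells it that kernel_2 is about to finish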

    if(threadIdx.x + blockIdx.x * blockDim.x == 0){
        latch1->count_down();
    }

    for(int i = 0; i < 20; i++){
        __nanosleep(1'000'000);
    }

    if(threadIdx.x + blockIdx.x * blockDim.x == 0){
        latch2->count_down();
    }

}

__global__ 
void kernel_1(MyLatch* latch1, MyLatch* latch2) {
    for(int i = 0; i < 50; i++){
        __nanosleep(1'000'000);
    }

    // launched into the fire-and-forget stream so kernel_2 starts right away, concurrently with the rest of kernel_1
    kernel_2<<<1, 1, 0, cudaStreamFireAndForget>>>(latch1, latch2);

    for(int i = 0; i < 50; i++){
        __nanosleep(1'000'000);
    }
}

int main() {

    // allocate both latches in managed memory so they are visible to both host and device
    MyLatch* latch1; cudaMallocManaged(&latch1, sizeof(MyLatch));
    MyLatch* latch2; cudaMallocManaged(&latch2, sizeof(MyLatch));

    for(int i = 0; i < 5; i++){
        // re-construct both latches with an expected count of 1 for this iteration
        new (latch1) MyLatch(1);
        new (latch2) MyLatch(1);

        kernel_1<<<1, 1>>>(latch1, latch2);

        latch1->wait();           // block until kernel_2 has started
        push_range("kernel_2");   // open the NVTX range on the host
        latch2->wait();           // block until kernel_2 is about to finish
        pop_range();              // close the range

        cudaDeviceSynchronize();
    }
}

Profiling this with Nsight Systems 2023.3.1 shows the custom NVTX range for kernel_2.
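
A profile like this can be collected with a plain nsys invocation, for example something along the lines of:

nsys profile --trace=cuda,nvtx -o report ./main

(CUDA and NVTX tracing are enabled by default in recent nsys versions, so the --trace flag is mostly there for clarity.)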

[screenshot: nsys profile showing the custom range]