How to show nested CUDA kernel calls in Nsight Systems


Imagine that I have two kernels like this:

__global__ void kernel_2(...) {
  // do something
}
__global__ void kernel_1(...) {
  kernel_2<<<n, m>>>(...);
  __syncthreads();
}
int main() {
  kernel_1<<<x, y>>>(...);
  cudaDeviceSynchronize();
}

Everything works, no problem. But when I check the running times of these two kernels in Nsight Systems, I only see the running time of kernel_1 followed by an empty gap. I assume that gap is where kernel_2 runs, but kernel_2 itself is not shown there.

Is there any way to make it show up there?


1 Answer

Answered by Abator Abetor

It is possible to use NVTX to manually mark the begin and end of kernels launched from the device. This will not be as accurate as profiling a stand-alone kernel, but it can give you approximate running times.
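
For a kernel launched from the host, the usual pattern is to push a range before the launch and pop it after a synchronize, so that the range spans the kernel's execution on the timeline. A minimal sketch (the kernel and range names are just illustrative):

#include <nvToolsExt.h>

__global__ void some_kernel() { /* work */ }

int main() {
    nvtxRangePushA("some_kernel");   // open a named range on the host timeline
    some_kernel<<<1, 1>>>();
    cudaDeviceSynchronize();         // the range covers launch + execution because of the sync
    nvtxRangePop();                  // close the range
}

NVTX is a host-side API, so for a kernel launched from the device the host first has to learn when that kernel starts and finishes; that is what the latches in the code below are for.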

The following code uses a cuda::latch in managed memory to communicate the begin and end of the device-launched kernel to the host. The host then inserts a custom range into the nsys profile. I used CUDA 12.3.

//nvcc -O3 -arch=sm_86 -std=c++17 -lineinfo -g -rdc=true -lnvToolsExt main.cu -o main
#include <iostream>
#include <cstring>
#include <string>

#include <nvToolsExt.h>
#include <cuda/latch>

// system-scope latch so that device threads and the host thread can synchronize through managed memory
using MyLatch = cuda::latch<cuda::thread_scope_system>;

void push_range(const std::string& name){
    nvtxEventAttributes_t eventAttrib;
    std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
    eventAttrib.version = NVTX_VERSION;
    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
    eventAttrib.message.ascii = name.c_str();
    nvtxRangePushEx(&eventAttrib);
}

void pop_range(){
    nvtxRangePop();
}


__global__ 
void kernel_2(MyLatch* latch1, MyLatch* latch2) {
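    // latch1 tells the host that kernel_2 has started; latch2 tells it that kernel_2 is about to finish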

    if(threadIdx.x + blockIdx.x * blockDim.x == 0){
        latch1->count_down();
    }

    for(int i = 0; i < 20; i++){
        __nanosleep(1'000'000);
    }

    if(threadIdx.x + blockIdx.x * blockDim.x == 0){
        latch2->count_down();
    }

}

__global__ 
void kernel_1(MyLatch* latch1, MyLatch* latch2) {
    for(int i = 0; i < 50; i++){
        __nanosleep(1'000'000);
    }

    // launched into the fire-and-forget stream so kernel_2 starts right away, concurrently with the rest of kernel_1
    kernel_2<<<1, 1, 0, cudaStreamFireAndForget>>>(latch1, latch2);

    for(int i = 0; i < 50; i++){
        __nanosleep(1'000'000);
    }
}

int main() {

    // allocate both latches in managed memory so they are visible to both host and device
    MyLatch* latch1; cudaMallocManaged(&latch1, sizeof(MyLatch));
    MyLatch* latch2; cudaMallocManaged(&latch2, sizeof(MyLatch));

    for(int i = 0; i < 5; i++){
        // re-construct both latches with an expected count of 1 for this iteration
        new (latch1) MyLatch(1);
        new (latch2) MyLatch(1);

        kernel_1<<<1, 1>>>(latch1, latch2);

        latch1->wait();           // block until kernel_2 has started
        push_range("kernel_2");   // open the NVTX range on the host
        latch2->wait();           // block until kernel_2 is about to finish
        pop_range();              // close the range

        cudaDeviceSynchronize();
    }
}

Profiling this with Nsight Systems 2023.3.1 shows the custom NVTX range for kernel_2.
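
A profile like this can be collected with a plain nsys invocation, for example something along the lines of:

nsys profile --trace=cuda,nvtx -o report ./main

(CUDA and NVTX tracing are enabled by default in recent nsys versions, so the --trace flag is mostly there for clarity.)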

[screenshot: nsys profile showing the custom range]