Using multi streams in cuda graph, the execution order is uncontrolled

1.1k views Asked by At

I am using cuda graph stream capture API to implement a small demo with multi streams. Referenced by the CUDA Programming Guide here, I wrote the complete code. In my knowledge, kernelB should execute on stream1, but with nsys I found kernelB is executed on a complete new stream. It is under-control. The scheduling graph is showed below:

enter image description here

Here is my code:

#include <iostream>

__global__ void kernelA() {}
__global__ void kernelB() {}
__global__ void kernelC() {}

int main() {
  cudaStream_t stream1, stream2;
  cudaStreamCreate(&stream1);
  cudaStreamCreate(&stream2);

  cudaGraphExec_t graphExec = NULL;
  cudaEvent_t event1, event2;
  cudaEventCreate(&event1);
  cudaEventCreate(&event2);

  for (int i = 0; i < 10; i++) {
    cudaGraph_t graph;
    cudaGraphExecUpdateResult updateResult;
    cudaGraphNode_t errorNode;
    cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal);
    kernelA<<<512, 512, 0, stream1>>>();
    cudaEventRecord(event1, stream1);
    cudaStreamWaitEvent(stream2, event1, 0);
    kernelB<<<256, 512, 0, stream1>>>();
    kernelC<<<16, 512, 0, stream2>>>();
    cudaEventRecord(event2, stream2);
    cudaStreamWaitEvent(stream1, event2, 0);
    cudaStreamEndCapture(stream1, &graph);
    if (graphExec != NULL) {
      cudaGraphExecUpdate(graphExec, graph, &errorNode, &updateResult);
    }
    if (graphExec == NULL || updateResult != cudaGraphExecUpdateSuccess) {
      if (graphExec != NULL) {
        cudaGraphExecDestroy(graphExec);
      }
      cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
    }
    cudaGraphDestroy(graph);
    cudaGraphLaunch(graphExec, stream1);
    cudaStreamSynchronize(stream1);
  }
}
2

There are 2 answers

1
bob On

"An operation may be scheduled at any time once the nodes on which it depends are complete. Scheduling is left up to the CUDA system." Here.

0
poohRui On

I also ask in Nvidia Forums, Robert answered this question which help me a lot. Someone who are interested in the scheduling of cuda graph can also reference to this answer here.