I am using cuda graph stream capture API to implement a small demo with multi streams. Referenced by the CUDA Programming Guide here, I wrote the complete code. In my knowledge, kernelB should execute on stream1, but with nsys I found kernelB is executed on a complete new stream. It is under-control. The scheduling graph is showed below:
Here is my code:
#include <iostream>
__global__ void kernelA() {}
__global__ void kernelB() {}
__global__ void kernelC() {}
int main() {
cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaGraphExec_t graphExec = NULL;
cudaEvent_t event1, event2;
cudaEventCreate(&event1);
cudaEventCreate(&event2);
for (int i = 0; i < 10; i++) {
cudaGraph_t graph;
cudaGraphExecUpdateResult updateResult;
cudaGraphNode_t errorNode;
cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal);
kernelA<<<512, 512, 0, stream1>>>();
cudaEventRecord(event1, stream1);
cudaStreamWaitEvent(stream2, event1, 0);
kernelB<<<256, 512, 0, stream1>>>();
kernelC<<<16, 512, 0, stream2>>>();
cudaEventRecord(event2, stream2);
cudaStreamWaitEvent(stream1, event2, 0);
cudaStreamEndCapture(stream1, &graph);
if (graphExec != NULL) {
cudaGraphExecUpdate(graphExec, graph, &errorNode, &updateResult);
}
if (graphExec == NULL || updateResult != cudaGraphExecUpdateSuccess) {
if (graphExec != NULL) {
cudaGraphExecDestroy(graphExec);
}
cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
}
cudaGraphDestroy(graph);
cudaGraphLaunch(graphExec, stream1);
cudaStreamSynchronize(stream1);
}
}
"An operation may be scheduled at any time once the nodes on which it depends are complete. Scheduling is left up to the CUDA system." Here.