I am not able to debug for the "global" function lines for which I set breakpoints. I debug with "Start CUDA Debugging" option from NSight menu.
My NSight plugin is successfully installed for VS 2010, I am able to debug my other projects (sample projects came within NSight debugger)
My code is here (it is a bit long but generally repeats same functions) :
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_cuda.h>
#include "book.h"
#define N (131072)
__global__ void conjugate( float2 *a ) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
a[idx] = cuConjf(a[idx]);
}
}
__global__ void multWithReference( float2 *signal, float2 *reference ) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
signal[idx].x = signal[idx].x * reference[idx].x;
signal[idx].y = signal[idx].y * reference[idx].y;
}
}
__global__ void shift( float2 *signal, size_t shiftamount, float2* shifted ) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
*(shifted+((idx+shiftamount)%131072)) = *(signal+idx);
}
__global__ void fftshift(float2 *u_d)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < 131072)
{
double a = 1-2*(i&1);
u_d[i].x *= a;
u_d[i].y *= a;
}
}
static inline cufftHandle createFFTPlan(cudaStream_t* stream)
{
cufftHandle plan;
if (cudaGetLastError() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to allocate\n");
}
if (cufftPlan1d(&plan, 131072, CUFFT_C2C,1) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan creation failed");
}
if (cufftSetStream(plan, *stream) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan stream association failed");
}
return plan;
}
int main( void ) {
cudaDeviceProp prop;
int whichDevice;
HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
if (!prop.deviceOverlap) {
printf( "Device will not handle overlaps, so no speed up from streams\n" );
return 0;
}
cudaEvent_t start, stop;
float elapsedTime;
cudaStream_t stream0, stream1, stream2, stream3, stream4, stream5, stream6, stream7;
float2* host_ref, *host_0, *host_1, *host_2, *host_3, *host_4, *host_5, *host_6, *host_7;
float2* dev_ref, *dev_0, *dev_1, *dev_2, *dev_3, *dev_4, *dev_5, *dev_6, *dev_7;
// start the timers
HANDLE_ERROR( cudaEventCreate( &start ) );
HANDLE_ERROR( cudaEventCreate( &stop ) );
// initialize the streams
HANDLE_ERROR( cudaStreamCreate( &stream0 ) );
HANDLE_ERROR( cudaStreamCreate( &stream1 ) );
HANDLE_ERROR( cudaStreamCreate( &stream2 ) );
HANDLE_ERROR( cudaStreamCreate( &stream3 ) );
HANDLE_ERROR( cudaStreamCreate( &stream4 ) );
HANDLE_ERROR( cudaStreamCreate( &stream5 ) );
HANDLE_ERROR( cudaStreamCreate( &stream6 ) );
HANDLE_ERROR( cudaStreamCreate( &stream7 ) );
// allocate the memory on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&dev_ref,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_0,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_1,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_2,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_3,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_4,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_5,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_6,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_7,
N * sizeof(float2) ) );
// allocate host locked memory, used to stream
HANDLE_ERROR( cudaHostAlloc( (void**)&host_ref,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_0,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_1,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_2,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_3,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_4,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_5,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_6,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_7,
N * sizeof(float2),
cudaHostAllocDefault ) );
// Open signal file
FILE *fp;
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_ref, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_0, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_1, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_2, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_3, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_4, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_5, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_6, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_7, sizeof(float2), 131072, fp);
fclose(fp);
// create FFT plans
cufftHandle plan0 = createFFTPlan(&stream0);
cufftHandle plan1 = createFFTPlan(&stream1);
cufftHandle plan2 = createFFTPlan(&stream2);
cufftHandle plan3 = createFFTPlan(&stream3);
cufftHandle plan4 = createFFTPlan(&stream4);
cufftHandle plan5 = createFFTPlan(&stream5);
cufftHandle plan6 = createFFTPlan(&stream6);
cufftHandle plan7 = createFFTPlan(&stream7);
float2* shifted0;
HANDLE_ERROR( cudaMalloc( (void**)&shifted0,
N * sizeof(float2) ) );
float2* shifted1;
HANDLE_ERROR( cudaMalloc( (void**)&shifted1,
N * sizeof(float2) ) );
float2* shifted2;
HANDLE_ERROR( cudaMalloc( (void**)&shifted2,
N * sizeof(float2) ) );
float2* shifted3;
HANDLE_ERROR( cudaMalloc( (void**)&shifted3,
N * sizeof(float2) ) );
float2* shifted4;
HANDLE_ERROR( cudaMalloc( (void**)&shifted4,
N * sizeof(float2) ) );
float2* shifted5;
HANDLE_ERROR( cudaMalloc( (void**)&shifted5,
N * sizeof(float2) ) );
float2* shifted6;
HANDLE_ERROR( cudaMalloc( (void**)&shifted6,
N * sizeof(float2) ) );
float2* shifted7;
HANDLE_ERROR( cudaMalloc( (void**)&shifted7,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaEventRecord( start, 0 ) );
// enqueue copies of a in stream0 and stream1
HANDLE_ERROR( cudaMemcpyAsync( dev_ref, host_ref,
sizeof(float2),
cudaMemcpyHostToDevice,
stream2 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_0, host_0,
sizeof(float2),
cudaMemcpyHostToDevice,
stream0 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_1, host_1,
sizeof(float2),
cudaMemcpyHostToDevice,
stream1 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_2, host_2,
sizeof(float2),
cudaMemcpyHostToDevice,
stream2 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_3, host_3,
sizeof(float2),
cudaMemcpyHostToDevice,
stream3 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_4, host_4,
sizeof(float2),
cudaMemcpyHostToDevice,
stream4 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_5, host_5,
sizeof(float2),
cudaMemcpyHostToDevice,
stream5 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_6, host_6,
sizeof(float2),
cudaMemcpyHostToDevice,
stream6 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_7, host_7,
sizeof(float2),
cudaMemcpyHostToDevice,
stream7 ) );
for(int i = 0; i < 100; i++){
shift<<<131072,131072,0>>>(dev_0, i, shifted0);
shift<<<131072,131072,0,stream1>>>(dev_1, i, shifted1);
shift<<<131072,131072,0,stream2>>>(dev_2, i, shifted2);
shift<<<131072,131072,0,stream3>>>(dev_3, i, shifted3);
shift<<<131072,131072,0,stream4>>>(dev_4, i, shifted4);
shift<<<131072,131072,0,stream5>>>(dev_5, i, shifted5);
shift<<<131072,131072,0,stream6>>>(dev_6, i, shifted6);
shift<<<131072,131072,0,stream7>>>(dev_7, i, shifted7);
conjugate<<<131072/256,131072,0,stream0>>>(shifted0);
conjugate<<<131072/256,131072,0,stream1>>>(shifted1);
conjugate<<<131072/256,131072,0,stream2>>>(shifted2);
conjugate<<<131072/256,131072,0,stream3>>>(shifted3);
conjugate<<<131072/256,131072,0,stream4>>>(shifted4);
conjugate<<<131072/256,131072,0,stream5>>>(shifted5);
conjugate<<<131072/256,131072,0,stream6>>>(shifted6);
conjugate<<<131072/256,131072,0,stream7>>>(shifted7);
multWithReference<<<131072/256,131072,0,stream0>>>(shifted0,dev_ref);
multWithReference<<<131072/256,131072,0,stream1>>>(shifted1,dev_ref);
multWithReference<<<131072/256,131072,0,stream2>>>(shifted2,dev_ref);
multWithReference<<<131072/256,131072,0,stream3>>>(shifted3,dev_ref);
multWithReference<<<131072/256,131072,0,stream4>>>(shifted4,dev_ref);
multWithReference<<<131072/256,131072,0,stream5>>>(shifted5,dev_ref);
multWithReference<<<131072/256,131072,0,stream6>>>(shifted6,dev_ref);
multWithReference<<<131072/256,131072,0,stream7>>>(shifted7,dev_ref);
if (cufftExecC2C(plan0, shifted0, shifted0, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan1, shifted1, shifted1, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan2, shifted2, shifted2, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan3, shifted3, shifted3, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan4, shifted4, shifted4, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan5, shifted5, shifted5, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan6, shifted6, shifted6, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan7, shifted7, shifted7, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
fftshift<<<131072,131072,0,stream0>>>(shifted0);
fftshift<<<131072,131072,0,stream1>>>(shifted1);
fftshift<<<131072,131072,0,stream2>>>(shifted2);
fftshift<<<131072,131072,0,stream3>>>(shifted3);
fftshift<<<131072,131072,0,stream4>>>(shifted4);
fftshift<<<131072,131072,0,stream5>>>(shifted5);
fftshift<<<131072,131072,0,stream6>>>(shifted6);
fftshift<<<131072,131072,0,stream7>>>(shifted7);
}
if (cudaThreadSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
}
float2 *host_last = (float2 *)malloc(8*131072);
// enqueue copies of c from device to locked memory
HANDLE_ERROR( cudaMemcpyAsync( host_last, shifted0,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream0 ) );
// enqueue copies of c from device to locked memory
HANDLE_ERROR( cudaMemcpyAsync( host_0, shifted0,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream0 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_1, shifted1,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream1 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_2, shifted2,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream2 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_3, shifted3,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream3 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_4, shifted4,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream4 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_5, shifted5,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream5 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_6, shifted6,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream6 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_7, shifted7,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream7 ) );
// Streamleri senkronize et
HANDLE_ERROR( cudaStreamSynchronize( stream0 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream1 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream2 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream3 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream4 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream5 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream6 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream7 ) );
// Stop timer
HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( stop ) );
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
start, stop ) );
printf( "Time taken: %3.1f ms\n", elapsedTime );
FILE *fp2;
if(NULL == (fp2 = fopen("result.bin","wb+"))){
printf("can not open file...");
exit(1);
}
fwrite(host_last, sizeof(float2), 131072, fp2);
printf("signal written \n");
fflush(stdout);
fclose(fp2);
// cleanup the streams and memory
HANDLE_ERROR( cudaFreeHost( host_0 ) );
HANDLE_ERROR( cudaFreeHost( host_1 ) );
HANDLE_ERROR( cudaFreeHost( host_2 ) );
HANDLE_ERROR( cudaFreeHost( host_3 ) );
HANDLE_ERROR( cudaFreeHost( host_4 ) );
HANDLE_ERROR( cudaFreeHost( host_5 ) );
HANDLE_ERROR( cudaFreeHost( host_6 ) );
HANDLE_ERROR( cudaFreeHost( host_7 ) );
HANDLE_ERROR( cudaFree( dev_0 ) );
HANDLE_ERROR( cudaFree( dev_1 ) );
HANDLE_ERROR( cudaFree( dev_2 ) );
HANDLE_ERROR( cudaFree( dev_3 ) );
HANDLE_ERROR( cudaFree( dev_4 ) );
HANDLE_ERROR( cudaFree( dev_5 ) );
HANDLE_ERROR( cudaFree( dev_6 ) );
HANDLE_ERROR( cudaFree( dev_7 ) );
cufftDestroy(plan0);
cufftDestroy(plan1);
cufftDestroy(plan2);
cufftDestroy(plan3);
cufftDestroy(plan4);
cufftDestroy(plan5);
cufftDestroy(plan6);
cufftDestroy(plan7);
HANDLE_ERROR( cudaStreamDestroy( stream0 ) );
HANDLE_ERROR( cudaStreamDestroy( stream1 ) );
HANDLE_ERROR( cudaStreamDestroy( stream2 ) );
HANDLE_ERROR( cudaStreamDestroy( stream3 ) );
HANDLE_ERROR( cudaStreamDestroy( stream4 ) );
HANDLE_ERROR( cudaStreamDestroy( stream5 ) );
HANDLE_ERROR( cudaStreamDestroy( stream6 ) );
HANDLE_ERROR( cudaStreamDestroy( stream7 ) );
printf("hit [enter] to exit...");
fflush(stdout);
getchar();
return 0;
}
Binary file needed to reproduce the problem is within this link :
When I run "cuda-memcheck" on release exe file I get the following result:
When debugging GPU code in Nsight VSE you need to start debugging through the Nsight menu ("Start CUDA Debugging"). See this walkthrough for more information.
EDIT
Based on the additional information you provided, in particular the
cuda-memcheck
output, it seems like your kernel is not actually being launched. Error 9 is cudaErrorInvalidConfiguration indicating that the launch configuration (blocks, threads/block, smem/block) is incompatible with the device.In fact, you're trying to launch 131072 threads/block which is way above the limits (see the Programming Guide for details and for the specific limits). You should launch smaller blocks and increase the number of blocks accordingly.
As Robert Crovella said, you should always ensure you have proper error checking.