I'm trying do a simple tutorial about dot product in cuda c using shared memory; the code is quite simple and it basically does the product between the elements of two arrays and then sums the result of from each block:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>
#define imin(a,b) (a<b?a:b)
const int N = 33*1024;
const int threadsPerBlock = 256;
const int blocksPerGrid = imin(32 , (N+threadsPerBlock-1)/threadsPerBlock);
__global__ void dot(float *a, float *b, float *c){
__shared__ float cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x*blockDim.x;
int cacheIndex = threadIdx.x;
float temp = 0;
while (tid < N){
temp += a[tid]*b[tid];
tid += blockDim.x*gridDim.x; /* Aggiorno l'indice per l'evenutale overshoot. */
}
cache[cacheIndex] = temp;
__syncthreads();
int i = blockDim.x/2;
while(i != 0){ /
if(cacheIndex < i){
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
i /= 2;
}
}
if(cacheIndex == 0){
c[blockIdx.x] = cache[0];
}
}
int main(void){
cudaError_t err = cudaSuccess;
float a[N], b[N], c[blocksPerGrid];
float *d_a, *d_b, *d_c;
int i;
for(i=0;i<N;i++){
a[i] = i;
b[i] = i*2;
}
for(i=0; i<blocksPerGrid;i++){
c[i] = 0;
}
err = cudaMalloc((void**)&d_a, N*sizeof(float));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector a (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMalloc((void**)&d_b, N*sizeof(float));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector b (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMalloc((void**)&d_c, blocksPerGrid*sizeof(float));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector c (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
/* Copio i valori dei vettori a e b nello spazio di memoria allocato precedentemente nel device. */
err = cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector a from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector b from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMemcpy(d_c, c, blocksPerGrid*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector c from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
dot<<<blocksPerGrid,threadsPerBlock>>>(d_a, d_b, d_c); err = cudaGetLastError();
err = cudaMemcpy(c, d_c, blocksPerGrid*sizeof(float), cudaMemcpyDeviceToHost);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector c from device to host (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_a);
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector a (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_b);
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector b (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_c);
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector c (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
float result = 0;
for(i=0;i<blocksPerGrid;i++){
result += c[i];
}
printf("il risultato finale รจ: %.2f\n", result);
return 0;
}
This code is identical to the one presented in the Cuda by Example book where the only difference is in the definition of the vectors a, b and c (the way i defined them shouldn't be a problem since I've done it several times).
Here's the problem: when i try to run the program it crashes! The terminal says that the problem is: Failed to copy vector c from device to host (error code the launch timed out and was terminated)!
That seams strange since i think i have allocated vector c in the proper way... does anyone have any idea of what I'm doing wrong? is it the global function or the main that has something wrong?
Your kernel has an infinite loop in it. The reason you are getting an error at all is because you are on a platform that has a watchdog timeout, and the watchdog is killing the kernel execution.
consider this code:
You are only dividing the loop index (
i
) by 2 if thecacheIndex
is less thani
. For other threads,i
remains at the same value constantly, once that thread drops out of the if statement. For those threads, the while loop is never exited (i
is never equal to zero). You want to divide thei
variable for all threads. Like so:Note that I have moved the
__syncthreads()
out of the if statement as well. This may not be necessary to fix your issue, but is technically not correct because we generally want all threads to participate in a__syncthreads()
statement. It is only allowed in conditional code if the condition evaluates the same across all threads -- this is documented in the programming guide.And if you compare your code in this respect to what is in the cuda by examples source code for dot.cu in chapter 5, I think you'll find they are not identical.