I'm trying to get my head around CUB, and having a bit of trouble following the (rather incomplete) worked examples. CUB looks like it is a fantastic tool, I just can't make sense of the example code.
I've built a simple proto-warp reduce example:
#include <cub/cub.cuh>
#include <cuda.h>
#include <vector>
using std::vector;
#include <iostream>
using std::cout;
using std::endl;
const int N = 128;
__global__ void sum(float *indata, float *outdata) {
typedef cub::WarpReduce<float,4> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage;
int id = blockIdx.x*blockDim.x+threadIdx.x;
if( id < 128 ) {
outdata[id] = WarpReduce(temp_storage).Sum(indata[id]);
}
}
int main() {
vector<float> y(N), sol(N);
float *dev_y, *dev_sol;
cudaMalloc((void**)&dev_y,N*sizeof(float));
cudaMalloc((void**)&dev_sol,N*sizeof(float));
for( int i = 0; i < N; i++ ) {
y[i] = (float)i;
}
cout << "input: ";
for( int i = 0; i < N; i++ ) cout << y[i] << " ";
cout << endl;
cudaMemcpy(&y[0],dev_y,N*sizeof(float),cudaMemcpyHostToDevice);
sum<<<1,32>>>(dev_y,dev_sol);
cudaMemcpy(dev_sol,&sol[0],N*sizeof(float),cudaMemcpyDeviceToHost);
cout << "output: ";
for( int i = 0; i < N; i++ ) cout << sol[i] << " ";
cout << endl;
cudaFree(dev_y);
cudaFree(dev_sol);
return 0;
}
which returns all zeros.
I'm aware that this code would return a reduction that was banded with every 32nd element being the sum of a warp and the other elements being undefined - I just want to get a feel for how CUB works. Can someone point out what I'm doing wrong?
(also, does CUB deserve its own tag yet?)
Your
cudaMemcpy
arguments are back to front, the destination comes first (to be consistent withmemcpy
).See the API reference for more info.