reduction example using cuda and CUB

1.6k views Asked by At

I'm trying to get my head around CUB, and having a bit of trouble following the (rather incomplete) worked examples. CUB looks like it is a fantastic tool, I just can't make sense of the example code.

I've built a simple proto-warp reduce example:

#include <cub/cub.cuh>
#include <cuda.h>
#include <vector>
using std::vector;
#include <iostream>
using std::cout;
using std::endl;

const int N = 128;

__global__ void sum(float *indata, float *outdata) {
    typedef cub::WarpReduce<float,4> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage;
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    if( id < 128 ) {
        outdata[id] = WarpReduce(temp_storage).Sum(indata[id]);
    }
}

int main() {
    vector<float> y(N), sol(N);
    float *dev_y, *dev_sol;
    cudaMalloc((void**)&dev_y,N*sizeof(float));
    cudaMalloc((void**)&dev_sol,N*sizeof(float));
    for( int i = 0; i < N; i++ ) {
        y[i] = (float)i;
    }
    cout << "input: ";
    for( int i = 0; i < N; i++ ) cout << y[i] << " ";
    cout << endl;
    cudaMemcpy(&y[0],dev_y,N*sizeof(float),cudaMemcpyHostToDevice);
    sum<<<1,32>>>(dev_y,dev_sol);
    cudaMemcpy(dev_sol,&sol[0],N*sizeof(float),cudaMemcpyDeviceToHost);
    cout << "output: ";
    for( int i = 0; i < N; i++ ) cout << sol[i] << " ";
    cout << endl;
    cudaFree(dev_y);
    cudaFree(dev_sol);
    return 0;
}

which returns all zeros.

I'm aware that this code would return a reduction that was banded with every 32nd element being the sum of a warp and the other elements being undefined - I just want to get a feel for how CUB works. Can someone point out what I'm doing wrong?

(also, does CUB deserve its own tag yet?)

1

There are 1 answers

0
Tom On BEST ANSWER

Your cudaMemcpy arguments are back to front, the destination comes first (to be consistent with memcpy).

cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )

See the API reference for more info.