Why does this CUDA reduction fail if I use 31 blocks?

106 views Asked by At

The following CUDA code takes a list of labels (0, 1, 2, 3, ...) and finds the sums of the weights of these labels.

To accelerate the calculation, I use shared memory so that each thread maintains its own running sum. At the end of the calculation, I perform a CUB block-wide reduction and then an atomic add to the global memory.

The CPU and GPU agree on the results if I use fewer than 30 blocks, but disagree if I use more than this. Why is this and how can I fix it?

Checking error codes in the code doesn't yield anything and cuda-gdb and cuda-memcheck do not show any uncaught errors or memory issues.

I'm using NVCC v10.1.243 and running on a Nvidia Quadro P2000.

MWE

//Compile with, e.g., nvcc -I /z/downloads/cub-1.8.0/ cuda_reduction.cu -arch=sm_61
#include <algorithm>
#include <cub/cub.cuh>
#include <thrust/device_vector.h>

#include <random>



__global__ void group_summer(
  const int32_t *const labels,
  const float *const   weights,
  const int num_elements,
  const int num_classes,
  double *const        sums,
  uint32_t *const      counts
){
  constexpr int num_threads = 128;
  assert(num_threads==blockDim.x);

  //Get shared memory
  extern __shared__ int s[];
  double   *const sums_shmem = (double*)s;
  uint32_t *const counts_shmem = (uint32_t*)&sums_shmem[num_threads*num_classes];

  double   *const my_sums   = &sums_shmem  [num_classes*threadIdx.x];
  uint32_t *const my_counts = &counts_shmem[num_classes*threadIdx.x];

  for(int i=0;i<num_threads*num_classes;i+=num_threads){
    sums_shmem[i] = 0;
    counts_shmem[i] = 0;
  }
  __syncthreads();

  for(int i=blockIdx.x * blockDim.x + threadIdx.x;i<num_elements;i+=gridDim.x*blockDim.x){
    // printf("Thread %d at %d looking at %d with %f at %ld and %ld\n", threadIdx.x, i, labels[i], weights[i], (long int)&my_counts[i], (long int)&my_sums[i]);
    const auto l = labels[i];
    // printf("Before thread %d at %d now has %d counts and %lf sums\n", threadIdx.x, i, my_counts[l], my_sums[l]);
    my_sums[l] += weights[i];
    my_counts[l]++;
    // printf("After thread %d at %d now has %d counts and %lf sums\n", threadIdx.x, i, my_counts[l], my_sums[l]);
  }

  __syncthreads();

  __shared__ cub::BlockReduce<double, num_threads>::TempStorage double_temp_storage;
  __shared__ cub::BlockReduce<uint32_t, num_threads>::TempStorage uint32_t_temp_storage;
  for(int l=0;l<num_classes;l++){
    // printf("Thread %d has %d counts with total weight %f for label %d\n", threadIdx.x, my_counts[l], my_sums[l], l);
    const auto sums_total   = cub::BlockReduce<double,num_threads>(double_temp_storage).Reduce(my_sums[l], cub::Sum());
    const auto counts_total = cub::BlockReduce<uint32_t,num_threads>(uint32_t_temp_storage).Reduce(my_counts[l], cub::Sum());
    if(threadIdx.x==0){
      atomicAdd(&sums[l], sums_total);
      atomicAdd(&counts[l], counts_total);
    }
  }
}

void group_summer_cpu(
  const std::vector<int32_t> &labels,
  const std::vector<float>   &weights,
  std::vector<double>    &sums,
  std::vector<uint32_t>  &counts
){
  for(int i=0;i<labels.size();i++){
    const auto l = labels[i];
    sums[l] += weights[i];
    counts[l]++;
  }
}

template<class T>
bool vec_nearly_equal(const std::vector<T> &a, const std::vector<T> &b){
  if(a.size()!=b.size())
    return false;

  for(size_t i=0;i<a.size();i++){
    if(std::abs(a[i]-b[i])>1e-4)
      return false;
  }

  return true;
}

void TestGroupSummer(std::mt19937 &gen, const int N, const int label_max, const int num_blocks){
  std::vector<int32_t> labels(N);
  std::vector<float>   weights(N);

  std::uniform_int_distribution<int> label_dist(0, label_max);
  std::uniform_real_distribution<float> weight_dist(0, 5000);

  for(int i=0;i<N;i++){
    labels[i] = label_dist(gen);
    weights[i] = weight_dist(gen);
  }

  // for(const auto &x: labels) std::cout<<x<<" "; std::cout<<std::endl;
  // for(const auto &x: weights) std::cout<<x<<" "; std::cout<<std::endl;

  const int num_classes = 1 + *std::max_element(labels.begin(), labels.end());

  thrust::device_vector<int32_t>   d_labels(labels.size());
  thrust::device_vector<float>     d_weights(labels.size());
  thrust::device_vector<double>    d_sums(num_classes);
  thrust::device_vector<uint32_t>  d_counts(num_classes);

  thrust::copy(labels.begin(), labels.end(), d_labels.begin());
  thrust::copy(weights.begin(), weights.end(), d_weights.begin());

  constexpr int num_threads = 128;
  const int shmem = num_threads * num_classes * (sizeof(double)+sizeof(uint32_t));

  std::cout<<"Num blocks:    "<<num_blocks<<std::endl;
  std::cout<<"Shared memory: "<<shmem<<std::endl;

  group_summer<<<num_blocks,num_threads,shmem>>>(
    thrust::raw_pointer_cast(d_labels.data()),
    thrust::raw_pointer_cast(d_weights.data()),
    labels.size(),
    num_classes,
    thrust::raw_pointer_cast(d_sums.data()),
    thrust::raw_pointer_cast(d_counts.data())
  );
  if(cudaGetLastError()!=CUDA_SUCCESS){
    std::cout<<"Kernel failed to launch!"<<std::endl;
  }
  cudaDeviceSynchronize();
  if(cudaGetLastError()!=CUDA_SUCCESS){
    std::cout<<"Error in kernel!"<<std::endl;
  }

  std::vector<double>   h_sums(num_classes);
  std::vector<uint32_t> h_counts(num_classes);

  thrust::copy(d_sums.begin(), d_sums.end(), h_sums.begin());
  thrust::copy(d_counts.begin(), d_counts.end(), h_counts.begin());

  std::vector<double>   correct_sums(num_classes);
  std::vector<uint32_t> correct_counts(num_classes);

  group_summer_cpu(labels, weights, correct_sums, correct_counts);

  std::cout<<"Sums good? "  <<vec_nearly_equal(h_sums,correct_sums)<<std::endl;
  std::cout<<"Counts good? "<<(h_counts==correct_counts)<<std::endl;

  std::cout<<"GPU Sums: ";   for(const auto &x: h_sums)         std::cout<<x<<" "; std::cout<<std::endl;
  std::cout<<"CPU Sums: ";   for(const auto &x: correct_sums)   std::cout<<x<<" "; std::cout<<std::endl;
  std::cout<<"GPU Counts: "; for(const auto &x: h_counts)       std::cout<<x<<" "; std::cout<<std::endl;
  std::cout<<"CPU Counts: "; for(const auto &x: correct_counts) std::cout<<x<<" "; std::cout<<std::endl;
}


int main(){
  std::mt19937 gen;

  //These all work
  TestGroupSummer(gen, 1000000, 10, 30);
  TestGroupSummer(gen, 1000000, 10, 30);
  TestGroupSummer(gen, 1000000, 10, 30);
  TestGroupSummer(gen, 1000000, 10, 30);

  //This fails
  TestGroupSummer(gen, 1000000, 10, 31);
}
1

There are 1 answers

2
Robert Crovella On BEST ANSWER

When I run your code on a Tesla V100, all the results are failures except the first test.

You have a problem here:

  for(int i=0;i<num_threads*num_classes;i+=num_threads){
    sums_shmem[i] = 0;
    counts_shmem[i] = 0;
  }

That is not properly zero-ing out shared memory. You need to change the i=0 to i=threadIdx.x.

When I make that change, everything passes for me.

As an aside, this is not correct:

if(cudaGetLastError()!=CUDA_SUCCESS)

CUDA_SUCCESS is not the correct enum token to use with the runtime API. You should use cudaSuccess instead (there are 2 instances of this).

I also think your error comparison is apt to cause trouble:

if(std::abs(a[i]-b[i])>1e-4)

but it doesn't seem to be an issue here. I would normally expect to see some scaling before the test.