How to use cub::DeviceReduce::ArgMin()

794 views Asked by At

I am confused about how to use cub::DeviceReduce::ArgMin(). Here is the code copied from the CUB documentation.

#include <cub/cub.cuh> 
// Usage sketch for cub::DeviceReduce::ArgMin: the function is called twice with
// the same arguments -- first to learn how much scratch memory it needs, then
// to perform the reduction. The declarations below are placeholders; both
// d_in and d_out must point to device memory before the calls are made.
// Declare, allocate, and initialize device-accessible pointers for input and output
int                      num_items;      // e.g., 7
int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9], located in GPU
// NOTE: outside of namespace cub this type is spelled cub::KeyValuePair<int, int>.
KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}]; a single {key, value} pair in GPU memory

// Determine temporary device storage requirements
// (d_temp_storage is NULL here, so this first call is only a size query that
// fills in temp_storage_bytes)
void     *d_temp_storage = NULL;
size_t   temp_storage_bytes = 0;
cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
// Allocate temporary storage
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// Run argmin-reduction
cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
// d_out <-- [{5, 0}]   (key = 5 is the index of the minimum, value = 0 is the minimum itself)

My questions are as follows:

  1. If d_in is a pointer to GPU (device) memory, how do I initialize the d_out pointer?
  2. If ArgMin() runs entirely on the device (GPU), how can I copy the result back to the CPU?
1

There is 1 answer

0
Robert Crovella On

If d_in is a pointer to GPU (device) memory, how do I initialize the d_out pointer?

You use cudaMalloc, similar to how you would initialize the d_in pointer.

If ArgMin() runs entirely on the device (GPU), how can I copy the result back to the CPU?

You use cudaMemcpy, similar to how you would copy the d_in data from host to device, except now you are copying the d_out data from device to host. The KeyValuePair is a C++ object that has key and value members.

Here is a complete example:

$ cat t37.cu
#include <cub/cub.cuh>
#include <iostream>

// Reports a failed CUDA runtime or CUB call on stderr.
// `what` is a short label naming the call, used only in the error message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Demonstrates cub::DeviceReduce::ArgMin end to end: uploads a small int array,
// reduces it on the device to a single {key = index, value = minimum} pair,
// copies that pair back to the host and prints it.
// Returns 0 on success and 1 if any CUDA runtime or CUB call failed.
int main(){

  const int                     num_items = 32;
  int                           *d_in = NULL;
  cub::KeyValuePair<int, int>   *d_out = NULL;
  void                          *d_temp_storage = NULL;
  size_t                        temp_storage_bytes = 0;

  // Host-side input and result live on the stack, so nothing has to be deleted.
  int                           h_in[num_items];
  cub::KeyValuePair<int, int>   h_out;
  for (int i = 0; i < num_items; i++) h_in[i] = 4;
  h_in[12] = 2;  // so we expect the returned pair to be {key = 12, value = 2}

  // Each step runs only if all previous steps succeeded; the first failure is
  // reported by cudaOk and the remaining steps are skipped via short-circuiting.
  bool ok = cudaOk(cudaMalloc(&d_in, sizeof(h_in)), "cudaMalloc(d_in)");
  ok = ok && cudaOk(cudaMalloc(&d_out, sizeof(h_out)), "cudaMalloc(d_out)");
  ok = ok && cudaOk(cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice),
                    "cudaMemcpy(host to device)");

  // Determine temporary device storage requirements: with d_temp_storage == NULL
  // this call only fills in temp_storage_bytes.
  ok = ok && cudaOk(cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items),
                    "cub::DeviceReduce::ArgMin(size query)");
  // Allocate temporary storage
  ok = ok && cudaOk(cudaMalloc(&d_temp_storage, temp_storage_bytes), "cudaMalloc(d_temp_storage)");
  // Run argmin-reduction
  ok = ok && cudaOk(cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items),
                    "cub::DeviceReduce::ArgMin(reduction)");

  // The blocking cudaMemcpy also waits for the reduction to finish, so h_out is
  // valid as soon as it returns successfully.
  ok = ok && cudaOk(cudaMemcpy(&h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");

  if (ok) {
    std::cout << "minimum value: " << h_out.value << std::endl;
    std::cout << "index of min:  " << h_out.key << std::endl;
  }

  // Release device memory on every path; cudaFree accepts a null pointer, so
  // buffers that were never allocated are harmless here.
  cudaFree(d_temp_storage);
  cudaFree(d_out);
  cudaFree(d_in);

  return ok ? 0 : 1;
}
$ nvcc -o t37 t37.cu -arch=sm_35 -std=c++14 -Wno-deprecated-gpu-targets
$ ./t37
minimum value: 2
index of min:  12
$