Pair deduplication on CUDA

105 views Asked by At

I have a data structure already running on CUDA and collect the data as below:

struct SearchDataOnDevice
{
    size_t npair;
    int * id1;
    int * id2;
};

I'd like to remove the duplicated id pair w/ and w/o an option called same_id_src, when same_id_src is true, <0, 5> and <5, 0> are duplicated and <5, 0> should be removed. when same_id_src is false, both pairs should be kept.

I am new to CUDA and the Thrust library, can anyone help with a quick hint?

1

There are 1 answers

13
Robert Crovella On BEST ANSWER

Here is one possible approach:

  1. sort the ID pairs, to bring "like" pairs together (thrust::sort)
  2. identify which pairs to keep, based on comparing adjacent pairs. Put results in a stencil array (thrust::transform)
  3. based on the stencil array, use thrust::copy_if to perform stream compaction on the sorted pairs to produce the deduplicated result

The need to handle the cases where e.g. <0 5> and <5 0> are considered "identical" or not, is handled via modification to the sort functor, as well as modification to the transform functor. In the sort functor case, we reorder, for comparison purposes, each pair such that the lower ID appears first in the pair. We must arrange the sort functor carefully, so that the case of <0 5> is chosen preferentially over the case of <5 0>, when the special condition is true.

Here is an example:

# cat t76.cu
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/transform.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cstdlib>

struct my_sort_functor
{
  bool same_id_src;
  my_sort_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
  template <typename T1, typename T2>
  __host__ __device__
  bool operator()(T1 t1, T2 t2){
    int t1a = thrust::get<0>(t1);
    int t1b = thrust::get<1>(t1);
    int t2a = thrust::get<0>(t2);
    int t2b = thrust::get<1>(t2);
    if (same_id_src) {//need to possibly reorder each pair for testing
      bool t1s = (t1a > t1b);
      bool t2s = (t2a > t2b);
      // sort on smaller id first
      if ((t1s?t1b:t1a) < (t2s?t2b:t2a)) return true;
      if ((t1s?t1b:t1a) > (t2s?t2b:t2a)) return false;
      // then sort on larger id
      if ((t1s?t1a:t1b) < (t2s?t2a:t2b)) return true;
      if ((t1s?t1a:t1b) > (t2s?t2a:t2b)) return false;
      // then sort based on the equality case
      // we prefer to choose <0,5> over <5,0>
      // so order that one first
      return !t1s;}
    else { // no reordering of pairs
      // sort on first id
      if (t1a < t2a) return true;
      if (t1a > t2a) return false;
      // sort on second id
      if (t1b < t2b) return true;
      return false;}
    }
};

struct my_transform_functor
{
  bool same_id_src;
  my_transform_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
  template <typename T1, typename T2>
  __host__ __device__
  bool operator()(T1 t1, T2 t2){
    if ((thrust::get<0>(t1) == thrust::get<0>(t2)) && (thrust::get<1>(t1) == thrust::get<1>(t2))) return false;
    if (same_id_src)
      if ((thrust::get<0>(t1) == thrust::get<1>(t2)) && (thrust::get<1>(t1) == thrust::get<0>(t2))) return false;
    return true;
    }
};

struct my_copy_predicate
{
  __host__ __device__
  bool operator()(bool t) { return t;}
};

int main(int argc, char *argv[]){

  // data setup
  int d1[] = {0, 1, 2, 3, 4, 5, 0};
  int d2[] = {5, 2, 1, 2, 1, 0, 5};
  bool same_id_src = true;
  if (argc > 1) same_id_src = false;
  size_t npair = sizeof(d1)/sizeof(d1[0]);
  int *id1, *id2;
  cudaMalloc(&id1, sizeof(d1));
  cudaMalloc(&id2, sizeof(d1));
  cudaMemcpy(id1, d1, sizeof(d1), cudaMemcpyHostToDevice);
  cudaMemcpy(id2, d2, sizeof(d1), cudaMemcpyHostToDevice);
  auto dp_id1 = thrust::device_ptr<int>(id1);
  auto dp_id2 = thrust::device_ptr<int>(id2);
  auto dzip = thrust::make_zip_iterator(thrust::make_tuple(dp_id1, dp_id2));
  thrust::device_vector<bool> stencil(npair, true);
  thrust::device_vector<int> r1(npair);
  thrust::device_vector<int> r2(npair);
  auto rzip = thrust::make_zip_iterator(thrust::make_tuple(r1.begin(), r2.begin()));
  // step 1: sort
  thrust::sort(dzip, dzip+npair, my_sort_functor(same_id_src));
  // step 2: mark pairs to be kept in stencil
  thrust::transform(dzip, dzip+npair-1, dzip+1,  stencil.begin()+1, my_transform_functor(same_id_src));
  // step 3: copy if, using stencil
  int rsize = thrust::copy_if(dzip, dzip+npair, stencil.begin(), rzip, my_copy_predicate()) - rzip;
  // display result
  thrust::copy_n(r1.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
  thrust::copy_n(r2.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
}
# nvcc -o t76 t76.cu
# compute-sanitizer ./t76
========= COMPUTE-SANITIZER
0 1 4 3
5 2 1 2
========= ERROR SUMMARY: 0 errors
# compute-sanitizer ./t76 1
========= COMPUTE-SANITIZER
0 1 2 3 4 5
5 2 1 2 1 0
========= ERROR SUMMARY: 0 errors
#

When we don't specify a command-line argument, the special case is considered to be true, and additional "duplicates" are removed. When we do specify a command-line argument, the special case is considered to be false.

EDIT: Working off a comment from paleonix below, we can improve the above implementation by replacing steps 2 and 3 with a call to thrust::unique_copy, which is also a stream compaction operation. The sort process remains unchanged, and only a slight change is made to our previous transform functor, to make it usable for the unique_copy operation:

# cat t76.cu
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/unique.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cstdlib>

struct my_sort_functor
{
  bool same_id_src;
  my_sort_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
  template <typename T1, typename T2>
  __host__ __device__
  bool operator()(T1 t1, T2 t2){
    int t1a = thrust::get<0>(t1);
    int t1b = thrust::get<1>(t1);
    int t2a = thrust::get<0>(t2);
    int t2b = thrust::get<1>(t2);
    if (same_id_src) {//need to possibly reorder each pair for testing
      bool t1s = (t1a > t1b);
      bool t2s = (t2a > t2b);
      // sort on smaller id first
      if ((t1s?t1b:t1a) < (t2s?t2b:t2a)) return true;
      if ((t1s?t1b:t1a) > (t2s?t2b:t2a)) return false;
      // then sort on larger id
      if ((t1s?t1a:t1b) < (t2s?t2a:t2b)) return true;
      if ((t1s?t1a:t1b) > (t2s?t2a:t2b)) return false;
      // then sort based on the equality case
      // we prefer to choose <0,5> over <5,0>
      // so order that one first
      return !t1s;}
    else { // no reordering of pairs
      // sort on first id
      if (t1a < t2a) return true;
      if (t1a > t2a) return false;
      // sort on second id
      if (t1b < t2b) return true;
      return false;}
    }
};

struct my_transform_functor
{
  bool same_id_src;
  my_transform_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
  template <typename T1, typename T2>
  __host__ __device__
  bool operator()(T1 t1, T2 t2){
    if ((thrust::get<0>(t1) == thrust::get<0>(t2)) && (thrust::get<1>(t1) == thrust::get<1>(t2))) return true;
    if (same_id_src)
      if ((thrust::get<0>(t1) == thrust::get<1>(t2)) && (thrust::get<1>(t1) == thrust::get<0>(t2))) return true;
    return false;
    }
};

int main(int argc, char *argv[]){

  // data setup
  int d1[] = {0, 1, 2, 3, 4, 5, 0};
  int d2[] = {5, 2, 1, 2, 1, 0, 5};
  bool same_id_src = true;
  if (argc > 1) same_id_src = false;
  size_t npair = sizeof(d1)/sizeof(d1[0]);
  int *id1, *id2;
  cudaMalloc(&id1, sizeof(d1));
  cudaMalloc(&id2, sizeof(d1));
  cudaMemcpy(id1, d1, sizeof(d1), cudaMemcpyHostToDevice);
  cudaMemcpy(id2, d2, sizeof(d1), cudaMemcpyHostToDevice);
  auto dp_id1 = thrust::device_ptr<int>(id1);
  auto dp_id2 = thrust::device_ptr<int>(id2);
  auto dzip = thrust::make_zip_iterator(thrust::make_tuple(dp_id1, dp_id2));
  thrust::device_vector<int> r1(npair);
  thrust::device_vector<int> r2(npair);
  auto rzip = thrust::make_zip_iterator(thrust::make_tuple(r1.begin(), r2.begin()));
  // step 1: sort
  thrust::sort(dzip, dzip+npair, my_sort_functor(same_id_src));
  // step 2: copy unique elements
  int rsize = thrust::unique_copy(dzip, dzip+npair, rzip, my_transform_functor(same_id_src)) - rzip;
  // display result
  thrust::copy_n(r1.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
  thrust::copy_n(r2.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
}
# nvcc -o t76 t76.cu
# ./t76
0 1 4 3
5 2 1 2
# ./t76 1
0 1 2 3 4 5
5 2 1 2 1 0
#