I'm new to CUDA and Thrust and I'm trying to implement a matrix multiplication and I want to achieve this by only using the thrust algorithms, because I want to avoid calling a kernel manually.

Is there a way I can achieve this efficiently? (At least without Using 2 nested for loops)

Or do I have to resign and call a CUDA Kernel?

//My data
thrust::device_vector<float> data(n*m);
thrust::device_vector<float> other(m*r);
thrust::device_vector<float> result(n*r);

// To make indexing faster, not really needed
transpose(other);

// My current approach
for (int i = 0; i < n; ++i)
{
   for (int j = 0; j < r;++j)
   {
       result[i*r+ j] = thrust::inner_product(data.begin()+(i*m), data.begin()+((i+1)*m),other+(j*m), 0.0f);
   }
}

1 Answers

0
Robert Crovella On Best Solutions

If you are interested in performance (usually why people use GPUs for computing tasks) you should not use thrust and you should not call or write your own CUDA kernel. You should use the CUBLAS library. For a learning exercise, if you want to study your own CUDA kernel, you can refer to a first-level-optimized CUDA version in the CUDA programming guide in the shared memory section. If you really want to use thrust with a single thrust call, it is possible.

The basic idea is to use an element-wise operation like thrust::transform as described here. The per-output-array-element dot-product is computed with a functor consisting of a loop.

Here's a worked example considering 3 methods. Your original double-nested loop method (relatively slow), a single thrust call method (faster) and the cublas method (fastest, certainly for larger matrix sizes). The code below only runs method 1 for matrix side dimensions of 200 or less because it is so slow. Here is an example on Tesla P100:

$ cat t463.cu
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/inner_product.h>
#include <thrust/execution_policy.h>
#include <thrust/equal.h>
#include <thrust/iterator/constant_iterator.h>
#include <cublas_v2.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <cstdlib>
#define USECPSEC 1000000ULL

long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

struct dp
{
  float *A, *B;
  int m,n,r;
  dp(float *_A, float *_B, int _m, int _n, int _r): A(_A), B(_B), m(_m), n(_n), r(_r) {};
  __host__ __device__
  float operator()(size_t idx){
    float sum = 0.0f;
    int row = idx/r;
    int col = idx - (row*r); // cheaper modulo
    for (int i = 0; i < m; i++)
      sum += A[col + row*i] * B[col + row*i];
    return sum;}
};

const int dsd = 200;
int main(int argc, char *argv[]){
  int ds = dsd;
  if (argc > 1) ds = atoi(argv[1]);
  const int n = ds;
  const int m = ds;
  const int r = ds;
  // data setup
  thrust::device_vector<float> data(n*m,1);
  thrust::device_vector<float> other(m*r,1);
  thrust::device_vector<float> result(n*r,0);
  // method 1
  //let's pretend that other is (already) transposed for efficient memory access by thrust
  // therefore each dot-product is formed using a row of data and a row of other
  long long dt = dtime_usec(0);
    if (ds < 201){
    for (int i = 0; i < n; ++i)
    {
      for (int j = 0; j < r;++j)
      {
         result[i*r+ j] = thrust::inner_product(data.begin()+(i*m), data.begin()+((i+1)*m),other.begin()+(j*m), 0.0f);
      }
    }
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    if (thrust::equal(result.begin(), result.end(), thrust::constant_iterator<float>(m)))
      std::cout << "method 1 time: " << dt/(float)USECPSEC << "s" << std::endl;
    else
      std::cout << "method 1 failure" << std::endl;
    }
  thrust::fill(result.begin(), result.end(), 0);
  cudaDeviceSynchronize();
// method 2
  //let's pretend that data is (already) transposed for efficient memory access by thrust
  // therefore each dot-product is formed using a column of data and a column of other
  dt = dtime_usec(0);
  thrust::transform(thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(n*r), result.begin(), dp(thrust::raw_pointer_cast(data.data()), thrust::raw_pointer_cast(other.data()), m, n, r));
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  if (thrust::equal(result.begin(), result.end(), thrust::constant_iterator<float>(m)))
    std::cout << "method 2 time: " << dt/(float)USECPSEC << "s" << std::endl;
  else
    std::cout << "method 2 failure" << std::endl;
// method 3
  // once again, let's pretend the data is ready to go for CUBLAS
  cublasHandle_t h;
  cublasCreate(&h);
  thrust::fill(result.begin(), result.end(), 0);
  float alpha = 1.0f;
  float beta = 0.0f;
  cudaDeviceSynchronize();
  dt = dtime_usec(0);
  cublasSgemm(h, CUBLAS_OP_T, CUBLAS_OP_T, n, r, m, &alpha, thrust::raw_pointer_cast(data.data()), n, thrust::raw_pointer_cast(other.data()), m, &beta, thrust::raw_pointer_cast(result.data()), n);
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  if (thrust::equal(result.begin(), result.end(), thrust::constant_iterator<float>(m)))
    std::cout << "method 3 time: " << dt/(float)USECPSEC << "s" << std::endl;
  else
    std::cout << "method 3 failure" << std::endl;
}
$ nvcc -o t463 t463.cu -lcublas
$ ./t463
method 1 time: 20.1648s
method 2 time: 6.3e-05s
method 3 time: 5.7e-05s
$ ./t463 1024
method 2 time: 0.008063s
method 3 time: 0.000458s
$

For the default dimension 200 case, the single thrust call and cublas method are fairly close, but are much faster than the loop method. For a side dimension of 1024, the cublas method is almost 20x faster than the single thrust call method.

Note that I have chosen "optimal" transpose configurations for all 3 methods. For method 1, the best case timing is when the inner_product is using a "row" from each input matrix (effectively the tranpose of the 2nd input matrix). For method 2, the best case timing is when the functor is traversing a "column" from each input matrix (effectively the transpose of the first input matrix). For method 3, the choice of CUBLAS_OP_T for both input matrices seems to be fastest. In reality, only the cublas method has the flexibility to be useful for a variety of input cases with good performance.