In a CUDA kernel, transposing the shared memory matrix As produces incorrect results


In my CUDA GEMM kernel, after tiling the matrices from global memory into shared memory blocks, transposing the shared memory tile As makes the result incorrect.

My working (correct) code is:

#define Offset(row, col, k) ((row) * (k) + (col))
template <
    const int Block_Size_N, /*height of block of C that each thread block
                               calculate*/
    const int Block_Size_K, /*width of block of A that each thread block load
                         into shared memory*/

    const int Block_Size_M,  /*width of block of C that each thread block
                                calculate*/
    const int Thread_Size_X, /*height of block of C that each thread calculate*/
    const int Thread_Size_Y, /* width of block of C that each thread calculate*/
    const bool double_buffer /*whether enable double buffering or not*/>

__global__ void Gemm(float *__restrict__ A, float *__restrict__ B,
                     float *__restrict__ C, const int N, const int M,
                     const int K) {
  // block index
  int bx = blockIdx.x, by = blockIdx.y;
  // thread index
  int tx = threadIdx.x, ty = threadIdx.y;

  int row = by * blockDim.y + ty, col = bx * blockDim.x + tx;
  __shared__ float As[Block_Size_N][Block_Size_K];
  __shared__ float Bs[Block_Size_K][Block_Size_M];
  float tmp = 0.0;
  for (int i = 0; i < K; i += Block_Size_K) { 
// store step: in the transposed version below, As is transposed here
    As[ty][tx] = A[Offset(row, tx + i, K)];
    Bs[ty][tx] = B[Offset(ty + i, col, M)];
    __syncthreads();
    for (int j = 0; j < Block_Size_K; ++j) {
// load step: in the transposed version below, the transposed As is read here
      tmp += As[ty][j] * Bs[j][tx];
    }
  }
  C[Offset(row, col, M)] = tmp;
}
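
For reference, here is my understanding of what this working version computes per thread (a sketch in comments, reusing the kernel's names; a[r][c] is shorthand for the row-major element at Offset(r, c, ...)):

// thread (tx, ty) in block (bx, by):
//   row = by * blockDim.y + ty, col = bx * blockDim.x + tx
// for each K-tile starting at offset i:
//   store: As[ty][tx] = a[row][i + tx]
//          Bs[ty][tx] = b[i + ty][col]
//   load : tmp += As[ty][j] * Bs[j][tx]   // = a[row][i + j] * b[i + j][col]
// so each thread accumulates C[row][col] = sum over k of a[row][k] * b[k][col]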

After transposing the As tile, the code looks as follows; this kernel produces an incorrect result:

#define Offset(row, col, k) ((row) * (k) + (col))
template <
    const int Block_Size_N, /*height of block of C that each thread block
                               calculate*/
    const int Block_Size_K, /*width of block of A that each thread block load
                         into shared memory*/

    const int Block_Size_M,  /*width of block of C that each thread block
                                calculate*/
    const int Thread_Size_X, /*height of block of C that each thread calculate*/
    const int Thread_Size_Y, /* width of block of C that each thread calculate*/
    const bool double_buffer /*whether enable double buffering or not*/>

__global__ void Gemm(float *__restrict__ A, float *__restrict__ B,
                     float *__restrict__ C, const int N, const int M,
                     const int K) {
  // block index
  int bx = blockIdx.x, by = blockIdx.y;
  // thread index
  int tx = threadIdx.x, ty = threadIdx.y;

  int row = by * blockDim.y + ty, col = bx * blockDim.x + tx;
  __shared__ float As[Block_Size_K][Block_Size_N];
  __shared__ float Bs[Block_Size_K][Block_Size_M];
  float tmp = 0.0;
  for (int i = 0; i < K; i += Block_Size_K) {
// store step: As is stored with swapped indices (transposed)
    As[tx][ty] = A[Offset(row, tx + i, K)];
    Bs[ty][tx] = B[Offset(ty + i, col, M)];
    __syncthreads();
    for (int j = 0; j < Block_Size_K; ++j) {
// load step: As is read with swapped indices
      tmp += As[j][ty] * Bs[j][tx];
    }
  }
  C[Offset(row, col, M)] = tmp;
}

In my test, Block_Size_N = Block_Size_K = Block_Size_M = 32 and N = M = K = 2048.

The template parameters Thread_Size_X, Thread_Size_Y, and double_buffer are not used.
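
For context, with these values the launch configuration in main() (shown in the complete code below) works out to the following; this is just restating the numbers:

dim3 dimBlock(32, 32);               // 32 * 32 = 1024 threads per block, one per element of a C tile
dim3 dimGrid(2048 / 32, 2048 / 32);  // 64 x 64 blocks
// per block, As and Bs are each 32 * 32 * sizeof(float) = 4 KB of shared memory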

The complete code is as follows:

#include "stdio.h"
#include "stdlib.h"

#include "assert.h"

#include "cublas_v2.h"
#include "cuda_runtime.h"
#include <iostream>
#define checkCudaErrors(func)                                                  \
  {                                                                            \
    cudaError_t e = func;                                                      \
    if (e != cudaSuccess) {                                                    \
      printf("gemm_v2_shared_block , %s , %d CUDA: %s\n", __FILE__, __LINE__,  \
             cudaGetErrorString(e));                                           \
    }                                                                          \
  }

// transfer float4
#define Fetch_Float4(pointer) (reinterpret_cast<float4 *>(&(pointer))[0])
#define Offset(row, col, k) ((row) * (k) + (col))
template <
    const int Block_Size_N, /*height of block of C that each thread block
                               calculate*/
    const int Block_Size_K, /*width of block of A that each thread block load
                         into shared memory*/

    const int Block_Size_M,  /*width of block of C that each thread block
                                calculate*/
    const int Thread_Size_X, /*height of block of C that each thread calculate*/
    const int Thread_Size_Y, /* width of block of C that each thread calculate*/
    const bool double_buffer /*whether enable double buffering or not*/>

__global__ void Gemm(float *__restrict__ A, float *__restrict__ B,
                     float *__restrict__ C, const int N, const int M,
                     const int K) {
  // block index
  int bx = blockIdx.x, by = blockIdx.y;
  // thread index
  int tx = threadIdx.x, ty = threadIdx.y;

  int row = by * blockDim.y + ty, col = bx * blockDim.x + tx;
  __shared__ float As[Block_Size_K][Block_Size_N];
  __shared__ float Bs[Block_Size_K][Block_Size_M];
  float tmp = 0.0;
  for (int i = 0; i < K; i += Block_Size_K) {
    As[tx][ty] = A[Offset(row, tx + i, K)];
    Bs[ty][tx] = B[Offset(ty + i, col, M)];
    __syncthreads();
    for (int j = 0; j < Block_Size_K; ++j) {
      tmp += As[j][ty] * Bs[j][tx];
    }
  }
  C[Offset(row, col, M)] = tmp;
}

int main(int argc, char **argv) {
  if (argc != 4) {
    printf("gemm_v2_shared_block , usage: ./main [N] [K] [M]");
    exit(0);
  }
  size_t N = atoi(argv[1]), K = atoi(argv[2]), M = atoi(argv[3]);

  float *h_a = new float[N * K], *h_b = new float[K * M],
        *h_c = new float[N * M], *h_c1 = new float[N * M];

  float *d_a, *d_b, *d_c;
  size_t bytes_A = sizeof(float) * N * K, bytes_B = sizeof(float) * M * K,
         bytes_C = sizeof(float) * N * M;
  checkCudaErrors(cudaMalloc(&d_a, sizeof(float) * N * K));
  checkCudaErrors(cudaMalloc(&d_b, sizeof(float) * M * K));
  checkCudaErrors(cudaMalloc(&d_c, sizeof(float) * N * M));

  const int Block_Size_N = 32, Block_Size_M = 32, Block_Size_K = 32,
            Thread_Size_X = 8, Thread_Size_Y = 8;
  const bool Enable_Double_Buffer = false;
  // generate A
  for (int i = 0; i < N * K; ++i) {
    h_a[i] = i;
  }
  // generate B
  for (int i = 0; i < M * K; ++i) {
    h_b[i] = i;
  }
  checkCudaErrors(cudaMemcpy(d_a, h_a, bytes_A, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_b, h_b, bytes_B, cudaMemcpyHostToDevice));

  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
  float msecTotal = 0;
  int nIter = 1000;

  checkCudaErrors(cudaMemcpy(d_c, h_c, bytes_C, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaEventRecord(start));

  for (int run = 0; run < nIter; ++run) {
    dim3 dimBlock(Block_Size_N, Block_Size_M);
    dim3 dimGrid(N / Block_Size_N, M / Block_Size_M);
    // printf("Grid = (%d , %d)\n" , dimGrid.x , dimGrid.y) ;
    Gemm<Block_Size_N, Block_Size_K, Block_Size_M, Thread_Size_X, Thread_Size_Y,
         Enable_Double_Buffer><<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N, M, K);
  }

  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));
  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
  checkCudaErrors(cudaMemcpy(h_c, d_c, bytes_C, cudaMemcpyDeviceToHost));


  double msecPerMatrixMul[2] = {0, 0};
  double gigaFlops[2] = {0, 0};
  double flopsPerMatrixMul = 2.0 * M * N * K;
  msecPerMatrixMul[0] = msecTotal / nIter;
  gigaFlops[0] =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul[0] / 1000.0f);
  printf("gemm_v2_shared_block , My gemm Performance= %.2f GFlop/s, Time= %.3f "
         "msec, Size= %.0f Ops,\n",
         gigaFlops[0], msecPerMatrixMul[0], flopsPerMatrixMul);

  // cublas
  cublasHandle_t blas_handle;
  cublasCreate(&blas_handle);
  float alpha = 1.0;
  float beta = 0;
  checkCudaErrors(cudaMemcpy(d_c, h_c, bytes_C, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaEventRecord(start));
  for (int run = 0; run < nIter; run++) {
    cublasSgemm(blas_handle, CUBLAS_OP_T, CUBLAS_OP_T, N, M, K, &alpha, d_a, K,
                d_b, M, &beta, d_c, N);
  }
  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));
  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

  checkCudaErrors(cudaMemcpy(h_c1, d_c, bytes_C, cudaMemcpyDeviceToHost));

  msecPerMatrixMul[1] = msecTotal / nIter;
  gigaFlops[1] =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul[1] / 1000.0f);
  printf("gemm_v2_shared_block , CuBlas Performance= %.2f GFlop/s, Time= %.3f "
         "msec, Size= %.0f Ops,\n",
         gigaFlops[1], msecPerMatrixMul[1], flopsPerMatrixMul);

  cublasDestroy(blas_handle);

  double eps = 1.e-6; // machine zero
  bool correct = true;
  for (int i = 0; i < N * M; i++) {
    int row = i / M;
    int col = i % M;
    double abs_err = fabs(h_c[i] - h_c1[col * N + row]);
    double dot_length = M;
    double abs_val = fabs(h_c[i]);
    double rel_err = abs_err / abs_val / dot_length;
    if (rel_err > eps) {
      printf("gemm_v2_shared_block , Error! Matrix[%05d]=%.8f, ref=%.8f error "
             "term is > %E\n",
             i, h_c[i], h_c1[col * N + row], eps);
      correct = false;
      break;
    }
  }

  printf("gemm_v2_shared_block , %s\n",
         correct ? "Result= PASS" : "Result= FAIL");
  printf("gemm_v2_shared_block , ratio= %f\n", gigaFlops[0] / gigaFlops[1]);

  delete[] h_a;
  delete[] h_b;
  delete[] h_c;
  delete[] h_c1;
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
}

How to run this code:

nvcc -std=c++11 -lcublas gemm_v2_shared_block.cu -o a
./a 2048 2048 2048

I cannot understand why the result becomes incorrect simply by transposing the shared memory tile As.
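
My reasoning for why the two versions should be equivalent is the following (a sketch; I may well be missing something):

// version 1: store As[ty][tx] = a[row][i + tx];  load As[ty][j] -> a[row][i + j]
// version 2: store As[tx][ty] = a[row][i + tx];  load As[j][ty] -> a[row][i + j]
// In both versions, thread (tx, ty) should read back exactly the elements it needs,
// a[row][i + j] for j = 0 .. Block_Size_K - 1, so I expected identical results.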
