My CUDA GEMM kernel tiles A and B from global memory into shared memory. When I change the kernel so that the shared-memory tile As is stored in transposed form, the results become incorrect. My working (non-transposed) code is:
#define Offset(row, col, k) ((row) * (k) + (col))

template <
    const int Block_Size_N,  /* height of the block of C that each thread block computes */
    const int Block_Size_K,  /* width of the block of A that each thread block loads into shared memory */
    const int Block_Size_M,  /* width of the block of C that each thread block computes */
    const int Thread_Size_X, /* height of the block of C that each thread computes */
    const int Thread_Size_Y, /* width of the block of C that each thread computes */
    const bool double_buffer /* whether to enable double buffering */>
__global__ void Gemm(float *__restrict__ A, float *__restrict__ B,
                     float *__restrict__ C, const int N, const int M,
                     const int K) {
  // block index
  int bx = blockIdx.x, by = blockIdx.y;
  // thread index
  int tx = threadIdx.x, ty = threadIdx.y;
  int row = by * blockDim.y + ty, col = bx * blockDim.x + tx;
  __shared__ float As[Block_Size_N][Block_Size_K];
  __shared__ float Bs[Block_Size_K][Block_Size_M];
  float tmp = 0.0;
  for (int i = 0; i < K; i += Block_Size_K) {
    // store of the As tile (this is the store I later change to write As transposed)
    As[ty][tx] = A[Offset(row, tx + i, K)];
    Bs[ty][tx] = B[Offset(ty + i, col, M)];
    __syncthreads();
    for (int j = 0; j < Block_Size_K; ++j) {
      // read of the As tile (this is the read I later change to read As transposed)
      tmp += As[ty][j] * Bs[j][tx];
    }
  }
  C[Offset(row, col, M)] = tmp;
}
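For reference, my understanding of the per-thread mapping in this working version (thread (tx, ty) in block (bx, by), row = by * blockDim.y + ty, col = bx * blockDim.x + tx) is:

    store: As[ty][tx] = A[row][i + tx]
    read : As[ty][j]  = A[row][i + j]   for j = 0 .. Block_Size_K - 1

so each thread accumulates tmp += A[row][i + j] * B[i + j][col], which is the dot product needed for C[row][col].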
After changing the kernel to store As in transposed form, the code is as follows; this version produces incorrect results:
#define Offset(row, col, k) ((row) * (k) + (col))

template <
    const int Block_Size_N,  /* height of the block of C that each thread block computes */
    const int Block_Size_K,  /* width of the block of A that each thread block loads into shared memory */
    const int Block_Size_M,  /* width of the block of C that each thread block computes */
    const int Thread_Size_X, /* height of the block of C that each thread computes */
    const int Thread_Size_Y, /* width of the block of C that each thread computes */
    const bool double_buffer /* whether to enable double buffering */>
__global__ void Gemm(float *__restrict__ A, float *__restrict__ B,
                     float *__restrict__ C, const int N, const int M,
                     const int K) {
  // block index
  int bx = blockIdx.x, by = blockIdx.y;
  // thread index
  int tx = threadIdx.x, ty = threadIdx.y;
  int row = by * blockDim.y + ty, col = bx * blockDim.x + tx;
  __shared__ float As[Block_Size_K][Block_Size_N];
  __shared__ float Bs[Block_Size_K][Block_Size_M];
  float tmp = 0.0;
  for (int i = 0; i < K; i += Block_Size_K) {
    // store the As tile in transposed form
    As[tx][ty] = A[Offset(row, tx + i, K)];
    Bs[ty][tx] = B[Offset(ty + i, col, M)];
    __syncthreads();
    for (int j = 0; j < Block_Size_K; ++j) {
      // read the As tile in transposed form
      tmp += As[j][ty] * Bs[j][tx];
    }
  }
  C[Offset(row, col, M)] = tmp;
}
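The only lines that differ from the working kernel are the declaration of As, its store, and its read:

    __shared__ float As[Block_Size_N][Block_Size_K];   ->   __shared__ float As[Block_Size_K][Block_Size_N];
    As[ty][tx] = A[Offset(row, tx + i, K)];            ->   As[tx][ty] = A[Offset(row, tx + i, K)];
    tmp += As[ty][j] * Bs[j][tx];                      ->   tmp += As[j][ty] * Bs[j][tx];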
In my tests, Block_Size_N = Block_Size_K = Block_Size_M = 32 and N = M = K = 2048. The template parameters Thread_Size_X, Thread_Size_Y, and double_buffer are not used in this version of the kernel.
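With these values, the kernel launch in main() below reduces to:

    dim3 dimBlock(32, 32);              // Block_Size_N x Block_Size_M threads per block
    dim3 dimGrid(2048 / 32, 2048 / 32); // 64 x 64 blocks
    Gemm<32, 32, 32, 8, 8, false><<<dimGrid, dimBlock>>>(d_a, d_b, d_c, 2048, 2048, 2048);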
The complete code (using the transposed As version of the kernel) is as follows:
#include "stdio.h"
#include "stdlib.h"
#include "assert.h"
#include "cublas_v2.h"
#include "cuda_runtime.h"
#include <iostream>
#define checkCudaErrors(func) \
{ \
cudaError_t e = func; \
if (e != cudaSuccess) { \
printf("gemm_v2_shared_block , %s , %d CUDA: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(e)); \
} \
}
// transfer float4
#define Fetch_Float4(pointer) (reinterpret_cast<float4 *>(&(pointer))[0])
#define Offset(row, col, k) ((row) * (k) + (col))
template <
const int Block_Size_N, /*height of block of C that each thread block
calculate*/
const int Block_Size_K, /*width of block of A that each thread block load
into shared memory*/
const int Block_Size_M, /*width of block of C that each thread block
calculate*/
const int Thread_Size_X, /*height of block of C that each thread calculate*/
const int Thread_Size_Y, /* width of block of C that each thread calculate*/
const bool double_buffer /*whether enable double buffering or not*/>
__global__ void Gemm(float *__restrict__ A, float *__restrict__ B,
float *__restrict__ C, const int N, const int M,
const int K) {
// block index
int bx = blockIdx.x, by = blockIdx.y;
// thread index
int tx = threadIdx.x, ty = threadIdx.y;
int row = by * blockDim.y + ty, col = bx * blockDim.x + tx;
__shared__ float As[Block_Size_K][Block_Size_N];
__shared__ float Bs[Block_Size_K][Block_Size_M];
float tmp = 0.0;
for (int i = 0; i < K; i += Block_Size_K) {
As[tx][ty] = A[Offset(row, tx + i, K)];
Bs[ty][tx] = B[Offset(ty + i, col, M)];
__syncthreads();
for (int j = 0; j < Block_Size_K; ++j) {
tmp += As[j][ty] * Bs[j][tx];
}
}
C[Offset(row, col, M)] = tmp;
}
int main(int argc, char **argv) {
if (argc != 4) {
printf("gemm_v2_shared_block , usage: ./main [N] [K] [M]");
exit(0);
}
size_t N = atoi(argv[1]), K = atoi(argv[2]), M = atoi(argv[3]);
float *h_a = new float[N * K], *h_b = new float[K * M],
*h_c = new float[N * M], *h_c1 = new float[N * M];
float *d_a, *d_b, *d_c;
size_t bytes_A = sizeof(float) * N * K, bytes_B = sizeof(float) * M * K,
bytes_C = sizeof(float) * N * M;
checkCudaErrors(cudaMalloc(&d_a, sizeof(float) * N * K));
checkCudaErrors(cudaMalloc(&d_b, sizeof(float) * M * K));
checkCudaErrors(cudaMalloc(&d_c, sizeof(float) * N * M));
const int Block_Size_N = 32, Block_Size_M = 32, Block_Size_K = 32,
Thread_Size_X = 8, Thread_Size_Y = 8;
const bool Enable_Double_Buffer = false;
// generate A
for (int i = 0; i < N * K; ++i) {
h_a[i] = i;
}
// generate B
for (int i = 0; i < M * K; ++i) {
h_b[i] = i;
}
checkCudaErrors(cudaMemcpy(d_a, h_a, bytes_A, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_b, h_b, bytes_B, cudaMemcpyHostToDevice));
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
float msecTotal = 0;
int nIter = 1000;
checkCudaErrors(cudaMemcpy(d_c, h_c, bytes_C, cudaMemcpyHostToDevice));
checkCudaErrors(cudaEventRecord(start));
for (int run = 0; run < nIter; ++run) {
dim3 dimBlock(Block_Size_N, Block_Size_M);
dim3 dimGrid(N / Block_Size_N, M / Block_Size_M);
// printf("Grid = (%d , %d)\n" , dimGrid.x , dimGrid.y) ;
Gemm<Block_Size_N, Block_Size_K, Block_Size_M, Thread_Size_X, Thread_Size_Y,
Enable_Double_Buffer><<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N, M, K);
}
checkCudaErrors(cudaEventRecord(stop));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
checkCudaErrors(cudaMemcpy(h_c, d_c, bytes_C, cudaMemcpyDeviceToHost));
double msecPerMatrixMul[2] = {0, 0};
double gigaFlops[2] = {0, 0};
double flopsPerMatrixMul = 2.0 * M * N * K;
msecPerMatrixMul[0] = msecTotal / nIter;
gigaFlops[0] =
(flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul[0] / 1000.0f);
printf("gemm_v2_shared_block , My gemm Performance= %.2f GFlop/s, Time= %.3f "
"msec, Size= %.0f Ops,\n",
gigaFlops[0], msecPerMatrixMul[0], flopsPerMatrixMul);
// cublas
cublasHandle_t blas_handle;
cublasCreate(&blas_handle);
float alpha = 1.0;
float beta = 0;
checkCudaErrors(cudaMemcpy(d_c, h_c, bytes_C, cudaMemcpyHostToDevice));
checkCudaErrors(cudaEventRecord(start));
for (int run = 0; run < nIter; run++) {
cublasSgemm(blas_handle, CUBLAS_OP_T, CUBLAS_OP_T, N, M, K, &alpha, d_a, K,
d_b, M, &beta, d_c, N);
}
checkCudaErrors(cudaEventRecord(stop));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
checkCudaErrors(cudaMemcpy(h_c1, d_c, bytes_C, cudaMemcpyDeviceToHost));
msecPerMatrixMul[1] = msecTotal / nIter;
gigaFlops[1] =
(flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul[1] / 1000.0f);
printf("gemm_v2_shared_block , CuBlas Performance= %.2f GFlop/s, Time= %.3f "
"msec, Size= %.0f Ops,\n",
gigaFlops[1], msecPerMatrixMul[1], flopsPerMatrixMul);
cublasDestroy(blas_handle);
double eps = 1.e-6; // machine zero
bool correct = true;
for (int i = 0; i < N * M; i++) {
int row = i / M;
int col = i % M;
double abs_err = fabs(h_c[i] - h_c1[col * N + row]);
double dot_length = M;
double abs_val = fabs(h_c[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("gemm_v2_shared_block , Error! Matrix[%05d]=%.8f, ref=%.8f error "
"term is > %E\n",
i, h_c[i], h_c1[col * N + row], eps);
correct = false;
break;
}
}
printf("gemm_v2_shared_block , %s\n",
correct ? "Result= PASS" : "Result= FAIL");
printf("gemm_v2_shared_block , ratio= %f\n", gigaFlops[0] / gigaFlops[1]);
delete h_a, h_b, h_c, h_c1;
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
}
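For clarity, both kernels are expected to compute a plain row-major GEMM, C[N x M] = A[N x K] * B[K x M]. A minimal CPU sketch of that reference (the helper name GemmRef is mine, not part of the program above), using the same Offset(row, col, ld) convention as the kernel:

void GemmRef(const float *A, const float *B, float *C, int N, int M, int K) {
  // C[r][c] = sum_k A[r][k] * B[k][c], all matrices stored row-major
  for (int r = 0; r < N; ++r) {
    for (int c = 0; c < M; ++c) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k) {
        acc += A[r * K + k] * B[k * M + c];
      }
      C[r * M + c] = acc;
    }
  }
}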
How to run this code:
nvcc -std=c++11 -lcublas gemm_v2_shared_block.cu -o a
./a 2048 2048 2048
I cannot understand why simply storing the shared-memory tile As in transposed form makes the result incorrect.
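To make the question concrete, my understanding of the per-thread mapping in the transposed version (with row = by * 32 + ty, col = bx * 32 + tx) is:

    store: As[tx][ty] = A[row][i + tx]            =>  As[x][y] holds A[by * 32 + y][i + x]
    read : As[j][ty]  = A[by * 32 + ty][i + j]    =   A[row][i + j]   for j = 0 .. 31

So, on paper, each thread reads exactly the same values of A as in the working kernel; the only thing that changes is which thread wrote each shared-memory element. Why does that make the output wrong?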