I have a problem when using cublasDgemm (the cuBLAS matrix-multiply routine; it computes A*B, where A is 750x600 and B is 600x1000).
for (i = 0; i < N; ++i) {
    cublasDgemm(/* ... */);
}
N=10: total time 0.000473 s, average per call 0.0000473 s
N=100: total time 0.00243 s, average per call 0.0000243 s
N=1000: total time 0.715072 s, average per call 0.000715 s
N=10000: total time 10.4998 s, average per call 0.00104998 s
Why does the average time per call increase so much?
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
using namespace std;

// Column-major indexing, as expected by cuBLAS.
#define IDX2C(i,j,leading) (((j)*(leading))+(i))

#define CHECK_EQ(a,b) do { \
    if ((a) != (b)) { \
        cout << __FILE__ << " : " << __LINE__ \
             << " : check failed because " << a << " != " << b << endl; \
        exit(1); \
    } \
} while (0)

#define CUBLAS_CHECK(condition) do { \
    cublasStatus_t status = condition; \
    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS); \
} while (0)

#define CUDA_CHECK(condition) do { \
    cudaError_t error = condition; \
    CHECK_EQ(error, cudaSuccess); \
} while (0)

// Check for errors after a kernel launch.
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())

// Fill mat with n random values in [-range, range].
// The caller seeds the RNG once; reseeding with time(NULL) here would give
// both matrices the same sequence when called within the same second.
template <class T>
void randMtx(T *mat, int n, double range) {
    for (int i = 0; i < n; ++i) {
        double flag = (rand() % 2 == 0) ? -1.0 : 1.0;
        mat[i] = flag * rand() / RAND_MAX * range;
    }
}

int main(int argc, char *argv[]) {
    if (argc != 9) {
        cout << "usage: m1_row m1_col m2_row m2_col m1_op m2_op count range\n";
        return -1;
    }
    int row1 = atoi(argv[1]);
    int col1 = atoi(argv[2]);
    int row2 = atoi(argv[3]);
    int col2 = atoi(argv[4]);
    int count = atoi(argv[7]);
    double range = atof(argv[8]);

    // Work out the dimensions of op(A)*op(B): the result is row3 x col3,
    // with inner dimension k.
    cublasOperation_t opt1 = CUBLAS_OP_N;
    cublasOperation_t opt2 = CUBLAS_OP_N;
    int row3 = row1;
    int col3 = col2;
    int k = col1;
    if (argv[5][0] == 't') {  // transpose A
        opt1 = CUBLAS_OP_T;
        row3 = col1;
        k = row1;
    }
    if (argv[6][0] == 't') {  // transpose B
        opt2 = CUBLAS_OP_T;
        col3 = row2;
    }

    double *mat1_c = (double*)malloc(sizeof(double)*row1*col1);
    double *mat2_c = (double*)malloc(sizeof(double)*row2*col2);
    double *mat3_c = (double*)malloc(sizeof(double)*row3*col3);
    srand((unsigned int)time(NULL));
    randMtx(mat1_c, row1*col1, range);
    randMtx(mat2_c, row2*col2, range);

    double *mat1_g;
    double *mat2_g;
    double *mat3_g;
    double alpha = 1.0;
    double beta = 0.0;
    CUDA_CHECK(cudaMalloc((void **)&mat1_g, sizeof(double)*row1*col1));
    CUDA_CHECK(cudaMalloc((void **)&mat2_g, sizeof(double)*row2*col2));
    CUDA_CHECK(cudaMalloc((void **)&mat3_g, sizeof(double)*row3*col3));
    CUDA_CHECK(cudaMemcpy(mat1_g, mat1_c, sizeof(double)*row1*col1, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(mat2_g, mat2_c, sizeof(double)*row2*col2, cudaMemcpyHostToDevice));

    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));

    struct timeval beg, end;
    gettimeofday(&beg, NULL);
    for (int i = 0; i < count; ++i) {
        // Leading dimensions are those of the matrices as stored,
        // independent of the transpose flags.
        CUBLAS_CHECK(cublasDgemm(handle, opt1, opt2, row3, col3, k,
                                 &alpha, mat1_g, row1, mat2_g, row2,
                                 &beta, mat3_g, row3));
    }
    // Wait for all queued Dgemm calls to finish before stopping the timer.
    CUDA_CHECK(cudaDeviceSynchronize());
    gettimeofday(&end, NULL);
    cout << "real time used: "
         << end.tv_sec - beg.tv_sec + (double)(end.tv_usec - beg.tv_usec)/1000000
         << endl;

    CUBLAS_CHECK(cublasDestroy(handle));
    free(mat1_c);
    free(mat2_c);
    free(mat3_c);
    cudaFree(mat1_g);
    cudaFree(mat2_g);
    cudaFree(mat3_g);
    return 0;
}
This is the code. I added cudaDeviceSynchronize after the loop block, and with it in place, no matter the value of count, the average time per call is about 0.001 s.
As pointed out by @talonmies, this behavior is probably exactly what would be expected.
When you call cublasDgemm, the call (usually) returns control to the host (CPU) thread before the operation is complete. Calls like this go into a queue: each time you make one, the operation is placed into the queue and your host code continues.
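A minimal sketch of that asynchrony, reusing the handle, device buffers, and check macros from the question's code (the t0/t1/t2 names are mine, not from the question):

    struct timeval t0, t1, t2;
    gettimeofday(&t0, NULL);
    for (int i = 0; i < count; ++i) {
        CUBLAS_CHECK(cublasDgemm(handle, opt1, opt2, row3, col3, k,
                                 &alpha, mat1_g, row1, mat2_g, row2,
                                 &beta, mat3_g, row3));
    }
    gettimeofday(&t1, NULL);               // (t1 - t0): enqueue time only
    CUDA_CHECK(cudaDeviceSynchronize());   // block until the queue drains
    gettimeofday(&t2, NULL);               // (t2 - t0): enqueue + execution

The loop in the question, timed without a synchronize, measures only (t1 - t0); the GPU may still be working long after the loop exits.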
Furthermore, CUDA and cuBLAS usually have some one-time overhead associated with using the API. For example, the call that creates a cuBLAS handle usually incurs measurable time, because it initializes the library.
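To see that startup cost in isolation, one could time cublasCreate itself; a sketch, again reusing the check macros from the question (variable names are mine; note the first CUDA call in a process also pays CUDA context-creation cost, so this can be substantial):

    struct timeval c0, c1;
    gettimeofday(&c0, NULL);
    cublasHandle_t h;
    CUBLAS_CHECK(cublasCreate(&h));        // one-time library initialization
    gettimeofday(&c1, NULL);
    cout << "cublasCreate took: "
         << c1.tv_sec - c0.tv_sec + (double)(c1.tv_usec - c0.tv_usec)/1000000
         << " s" << endl;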
So your measurements can be broken into 3 groups:
"Small" iteration counts (e.g. 10). In this case, each call pays the cost to put a Dgemm request into the queue, plus the amortization of the startup costs over a relatively small number of iterations. This corresponds to your measurements like this: "average call is 0.0000473"
"Medium" iteration counts (e.g. 100-1000). In this case, the amortization of the start up costs becomes very small per call, and so most of the measurement is just the time to add a Dgemm request to the queue. This corresponds to your measurements like this: "average call is 0.0000243"
"Large" iteration counts (e.g. 10000). At some point, the internal request queue becomes full and can no longer accept new requests, until some requests have been completed and removed from the queue. What happens at this point is that the Dgemm call switches from non-blocking to blocking. It blocks (holds up the host/CPU thread) until a queue slot becomes available. What happens at this point then, is that suddenly new requests must wait effectively for a previous request to finish, so now the cost for a new Dgemm request approximately equals the time to execute and complete a (previous) Dgemm request. So the per-call cost jumps up dramatically from the cost to add an item to the queue to the cost to complete a request. This corresponds to your measurements like this: "average call is 0.00104998"