I'm involved in a large project, that involves FFT, so I'm porting it to cuFFT. I modified this code CUFFT R2C to use dynamic memory and splitting the code into functions, initialization of cuda-arrays and execution of the actual FFT.
I tried different data types for the d_idata & d_odata arrays, but nothing worked, I got the following errors:
GPUassert: an illegal memory access was encountered t734-cufft-R2C-functions.cu 53
or is out of bounds if I check it through compute-sanitizer.
Here is the code:
#include <cufft.h>
#include <stdio.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void Print2DComplex(int rows, int cols, cufftComplex *data, bool cufft_symmetry = false){
int sym_cols = cols;
if (cufft_symmetry) sym_cols = cols/2 + 1;
printf("Real Part: \n");
for (int i = 0; i < rows; i++){
for (int j = 0; j < cols; j++)
if (j>=sym_cols)
printf("%f ", data[i*sym_cols+(cols-j)].x);
else
printf("%f ", data[i*sym_cols+j].x);
printf("\n");}
printf("Imag Part: \n");
for (int i = 0; i < rows; i++){
for (int j = 0; j < cols; j++)
if (j>=sym_cols)
printf("%f ", -data[i*sym_cols+(cols-j)].y); // complex (hermitian) symmetry
else
printf("%f ", data[i*sym_cols+j].y);
printf("\n");}
}
void cudaarrays(cufftHandle *plan, cufftReal *d_idata, cufftComplex *h_odata, cufftComplex *d_odata, const int row, const int col, float *A)
{
gpuErrchk(cudaMalloc((void**)&d_idata, sizeof(cufftComplex)*row*col));
gpuErrchk(cudaMalloc((void**)&d_odata, sizeof(cufftComplex)*row*col));
gpuErrchk(cudaMemset(d_idata, 0, sizeof(cufftComplex)*row*col));
gpuErrchk(cudaMemset(d_odata, 0, sizeof(cufftComplex)*row*col));
gpuErrchk(cudaMallocHost((void**)&h_odata, sizeof(cufftComplex)*row*col));
gpuErrchk(cudaMemcpy(d_idata,A,sizeof(cufftReal)*row*col,cudaMemcpyHostToDevice));
if ((cufftPlan2d(plan, row,col, CUFFT_R2C))!= CUFFT_SUCCESS) {printf("cufft plan error\n"); exit(-1);}
}
void exec(cufftHandle plan, cufftReal *d_idata, cufftComplex *h_odata, cufftComplex *d_odata, const int row, const int col)
{
if ((cufftExecR2C(plan, (cufftReal*)d_idata, (cufftComplex*)d_odata))!=CUFFT_SUCCESS) {printf("cufft exec error\n"); exit(-1);}
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_odata,d_odata,sizeof(cufftComplex)*row*col,cudaMemcpyDeviceToHost));
gpuErrchk(cudaDeviceSynchronize());
}
int main()
{
const int row = 4;
const int col = 4;
/*
double A[row][col] =
{{ 1, 2, 3, 4},
{ 5, 6, 7, 8},
{ 9,10,11,12},
{13,14,15,16}};
*/
float *A = (float*)calloc(row*col , sizeof(float));
int j=0;
for(int i=1; i <= (row*col); i++)
A[j++] = i;
cufftHandle plan;
cufftReal *d_idata=NULL;
cufftComplex *h_odata=NULL, *d_odata=NULL;
cudaarrays(&plan, d_idata, h_odata, d_odata, row, col, A);
//bool symmetric_data = false;
exec(plan, d_idata, h_odata, d_odata, row, col);
//symmetric_data = true;
// Print2DComplex(row,col,h_odata, symmetric_data);
return 0;
}
I compile it as follows: nvcc t734-cufft-r2c.cu -o t734-cufft-r2c -l cufft
and using an NVIDIA GeForce RTX 2060 device on
an the following drivers ** NVIDIA-SMI 470.199.02 Driver Version: 470.199.02 CUDA Version: 11.4 **
What am I missing or doing wrong?