I'm writing a frequency filtering application for a school assignment in C++ and Cuda using cuFFT and I can't get it to work. You can find the whole Visual Studio 2010 solution here. (Needs glut.)
Here is the part I think is relevant: (fourierUtils.cu/194)
//////////////////////////////////////////////////////////////////////////////
// Function to help invoking the kernel, creates the parameters and gets
// the result
__host__
void Process(
BitmapStruct& in_img, // these contain an image in an rgba byte array
BitmapStruct& out_img,
MaskGenerator maskGenerator, // this is a pointer to a device function
float param1, // mask parameters
float param2)
{
// Declare and allocate variables
cufftHandle plan;
cufftReal* img;
cufftReal* dev_img;
cufftComplex* dev_freq_img;
int imgsize = in_img.image_size();
int pixelcount = imgsize / 4;
img = new float[pixelcount];
checkResult(
cudaMalloc(&dev_img, sizeof(cufftReal) * pixelcount));
checkResult(
cudaMalloc(&dev_freq_img, sizeof(cufftComplex) * pixelcount));
// Optimize execution
cudaFuncAttributes attrs;
checkResult(
cudaFuncGetAttributes(&attrs, &Filter));
std::pair<dim3, dim3> params
= Optimizer::GetOptimalParameters(pixelcount, attrs);
// Process r, g, b channels
for(int chan = 0; chan <= 2; chan++)
{
// Init
for(int i = 0; i < pixelcount; i++)
{
img[i] = in_img.pixels[4 * i + chan];
}
checkResult(
cudaMemcpy(dev_img, img, pixelcount, cudaMemcpyHostToDevice));
// Create frequency image
checkResult(
cufftPlan1d(&plan, pixelcount, CUFFT_R2C, 1));
checkResult(
cufftExecR2C(plan, dev_img, dev_freq_img));
checkResult(
cudaThreadSynchronize());
checkResult(
cufftDestroy(plan));
// Mask frequency image
Filter<<<params.first, params.second>>>(
dev_freq_img, in_img.x, in_img.y, maskGenerator, param1, param2);
getLastCudaError("Filtering the image failed.");
// Get result
checkResult(
cufftPlan1d(&plan, pixelcount, CUFFT_C2R, 1));
checkResult(
cufftExecC2R(plan, dev_freq_img, dev_img));
checkResult(
cudaThreadSynchronize());
checkResult(
cufftDestroy(plan));
checkResult(
cudaMemcpy(img, dev_img, pixelcount, cudaMemcpyDeviceToHost));
for(int i = 0; i < pixelcount; i++)
{
out_img.pixels[4 * i + chan] = img[i];
}
}
// Copy alpha channel
for(int i = 0; i < pixelcount; i++)
{
out_img.pixels[4 * i + 3] = in_img.pixels[4 * i + 3];
}
// Free memory
checkResult(
cudaFree(dev_freq_img));
checkResult(
cudaFree(dev_img));
delete img;
getLastCudaError("An error occured during processing the image.");
}
I can't see any practical differences compared to the official examples I've seen, yet when I debug into it with Nsight, all the cufftComplex values received by my kernel are NaNs and the only difference between the input and the result images are that the result has a black bar at the bottom, no matter which filtering mask and what parameters I use. All Cuda and cuFFT calls return success and there is no error reported after the kernel invocation either.
What do I do wrong?
I've tried replacing img and dev_img with complex arrays and using C2C conversions and also doing them inplace, but it only changed the size of the black bar on the result image.
Thank you for your help.
Edit: here is a reduced version that doesn't need glut and should also compile on linux.
My mistake was forgetting to multiply the number of items with their size in some of the
cudaMemcpy
calls, thus the end of the vectors fed to cuFFT was made up of NaNs. Fixing those has solved the problem.I also replaced the cufftReal arrays with cufftComplex ones as the C2C transformations seem to be more predictable and added normalization for the values.
So the final working method is:
Thank you for the help.