How to read back a CUDA Texture for testing?

8.7k views Asked by At

Ok, so far, I can create an array on the host computer (of type float), and copy it to the gpu, then bring it back to the host as another array (to test if the copy was successful by comparing to the original).

I then create a CUDA array from the array on the GPU. Then I bind that array to a CUDA texture.

I now want to read that texture back and compare with the original array (again to test that it copied correctly). I saw some sample code that uses the readTexel() function shown below. It doesn't seem to work for me... (basically everything works except for the section in the bindToTexture(float* deviceArray) function starting at the readTexels(SIZE, testArrayDevice) line).

Any suggestions of a different way to do this? Or are there some obvious problems I missed in my code?

Thanks for the help guys!

#include <stdio.h>
#include <assert.h>
#include <cuda.h>

#define SIZE 20;

//Create a channel description to use.
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

//Create a texture to use.
texture<float, 2, cudaReadModeElementType> cudaTexture;
//cudaTexture.filterMode = cudaFilterModeLinear;
//cudaTexture.normalized = false;

__global__ void readTexels(int amount, float *Array)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index < amount)
  {
    float x = tex1D(cudaTexture, float(index));
    Array[index] = x;
  }
}

float* copyToGPU(float* hostArray, int size)
{
  //Create pointers, one for the array to be on the device, and one for bringing it back to the host for testing.
  float* deviceArray;
  float* testArray;

  //Allocate some memory for the two arrays so they don't get overwritten.
  testArray = (float *)malloc(sizeof(float)*size);

  //Allocate some memory for the array to be put onto the GPU device.
  cudaMalloc((void **)&deviceArray, sizeof(float)*size);

  //Actually copy the array from hostArray to deviceArray.
  cudaMemcpy(deviceArray, hostArray, sizeof(float)*size, cudaMemcpyHostToDevice);

  //Copy the deviceArray back to testArray in host memory for testing.
  cudaMemcpy(testArray, deviceArray, sizeof(float)*size, cudaMemcpyDeviceToHost);

  //Make sure contents of testArray match the original contents in hostArray.
  for (int i = 0; i < size; i++)
  {
    if (hostArray[i] != testArray[i])
    {
      printf("Location [%d] does not match in hostArray and testArray.\n", i);
    }
  }

  //Don't forget free these arrays after you're done!
  free(testArray);

  return deviceArray; //TODO: FREE THE DEVICE ARRAY VIA cudaFree(deviceArray);
}

cudaArray* bindToTexture(float* deviceArray)
{
  //Create a CUDA array to translate deviceArray into.
  cudaArray* cuArray;

  //Allocate memory for the CUDA array.
  cudaMallocArray(&cuArray, &cudaTexture.channelDesc, SIZE, 1);

  //Copy the deviceArray into the CUDA array.
  cudaMemcpyToArray(cuArray, 0, 0, deviceArray, sizeof(float)*SIZE, cudaMemcpyHostToDevice);

  //Release the deviceArray
  cudaFree(deviceArray);

  //Bind the CUDA array to the texture.
  cudaBindTextureToArray(cudaTexture, cuArray);

  //Make a test array on the device and on the host to verify that the texture has been saved correctly.
  float* testArrayDevice;
  float* testArrayHost;

  //Allocate memory for the two test arrays.
  cudaMalloc((void **)&testArray, sizeof(float)*SIZE);
  testArrayHost = (float *)malloc(sizeof(float)*SIZE);

  //Read the texels of the texture to the test array in the device.
  readTexels(SIZE, testArrayDevice);

  //Copy the device test array to the host test array.
  cudaMemcpy(testArrayHost, testArrayDevice, sizeof(float)*SIZE, cudaMemcpyDeviceToHost);

  //Print contents of the array out.
  for (int i = 0; i < SIZE; i++)
  {
    printf("%f\n", testArrayHost[i]);
  }

  //Free the memory for the test arrays.
  free(testArrayHost);
  cudaFree(testArrayDevice);

  return cuArray; //TODO: UNBIND THE CUDA TEXTURE VIA cudaUnbindTexture(cudaTexture);
  //TODO: FREE THE CUDA ARRAY VIA cudaFree(cuArray);
}


int main(void)
{
  float* hostArray;

  hostArray = (float *)malloc(sizeof(float)*SIZE);

  for (int i = 0; i < SIZE; i++)
  {
    hostArray[i] = 10.f + i;
  }

  float* deviceAddy = copyToGPU(hostArray, SIZE);

  free(hostArray);

  return 0;
}
2

There are 2 answers

1
cibercitizen1 On BEST ANSWER

Briefly:

------------- in your main.cu ---------------------------------------------------------------------------------------

-1. Define the texture as a globlal variable


       texture refTexture; // global variable !
       // meaning: address the texture with (x,y) (2D) and get an unsinged int

In the main function:

-2. Use arrays combined with texture

    cudaArray* myArray; // declar.
    // ask for memory
    cudaMallocArray (   &myArray,
&refTex.channelDesc, /* with this you don't need to fill a channel descriptor */ width,
height);

-3. copy data from CPU to GPU (to the array)

 cudaMemcpyToArray ( arrayCudaEntrada, // destination: the array
0, 0, // offsets sourceData, // pointer uint* widthheightsizeof(uint), // total amount of bytes to be copied cudaMemcpyHostToDevice);

-4. bind texture and array

    cudaBindTextureToArray( refTex,arrayCudaEntrada)

-5. change some parameters in the texture


refTextura_In.normalized = false; // don't automatically convert fetched data to [0,1[ refTextura_In.addressMode[0] = cudaAddressModeClamp; // if my indexing is out of bounds: automatically use a valid indexing (0 if negative index, last if too great index) refTextura_In.addressMode[1] = cudaAddressModeClamp;

---------- in the kernel --------------------------------------------------------

    // find out indexes (f,c) to process by this thread
     uint f = (blockIdx.x * blockDim.x) + threadIdx.x;
     uint c = (blockIdx.y * blockDim.y) + threadIdx.y;

  // this is curious and necessary: indexes for reading from a texture
  // are floats !. Even if you are certain to access (4,5) you have
  // match the "center" this is (4.5, 5.5)
  uint read = tex2D( refTex, c+0.5f, f+0.5f); // texRef is a global variable

Now You process read and write the results to other zone of the device global memory, not to the texture itself !

0
Tom On

readTexels() is a kernel (__global__) function, i.e. it runs on the GPU. Therefore you need to use the correct syntax to launch a kernel.

Take a look through the CUDA Programming Guide and some of the SDK samples, both available via the NVIDIA CUDA site to see how to launch a kernel.

Hint: It'll end up something like readTexels<<<grid,block>>>(...)