C/CUDA: Only every fourth element in CudaArray can be indexed

250 views Asked by At

This is my first post, so I am thrilled to get some new insights and enlarge my knowledge. Currently I am working on a C-project where a binary raw file with 3d-data is loaded, processed in CUDA and saved in a new binary raw file.

This is based on the simpleTexture3D project from CUDA Samples:

This is my cpp

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, cuda
#include <vector_types.h>
#include <driver_functions.h>
#include <cuda_runtime.h>

// CUDA utilities and system includes
#include <helper_cuda.h>
#include <helper_functions.h>
#include <vector_types.h>

typedef unsigned int  uint;
typedef unsigned char uchar;

const char *sSDKsample = "simpleTexture3D";

const char *volumeFilename = "Bucky.raw";
const cudaExtent volumeSize = make_cudaExtent(32, 32, 32);

const uint width = 64, height = 64, depth=64;

//const char *volumeFilename = "TestOCT.raw";
//const cudaExtent volumeSize = make_cudaExtent(1024, 512, 512);
//
//const uint width = 1024, height = 512, depth=512;
const dim3 blockSize(8, 8, 8);
const dim3 gridSize(width / blockSize.x, height / blockSize.y, depth / blockSize.z);

uint *d_output = NULL;

int *pArgc = NULL;
char **pArgv = NULL;

extern "C" void cleanup();
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD);

void loadVolumeData(char *exec_path);


// render image using CUDA
void render()
{
    // call CUDA kernel
    render_kernel(gridSize, blockSize, d_output, width, height, depth);

    getLastCudaError("render_kernel failed");
}

void cleanup()
{
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    checkCudaErrors(cudaDeviceReset());
}

// Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size)
{
    FILE *fp = fopen(filename, "rb");

    if (!fp)
    {
        fprintf(stderr, "Error opening file '%s'\n", filename);
        return 0;
    }

    uchar *data = (uchar *) malloc(size);
    size_t read = fread(data, 1, size, fp);
    fclose(fp);

    printf("Read '%s', %lu bytes\n", filename, read);

    return data;
}

// write raw data to disk
int writeRawFile(const char *filename, uchar *data, size_t size)
{
    int returnState=0;
    // cut file extension from filename
    char *a=strdup(filename);   //via strdup you dumb a const char to char, you must free it yourself

    int len = strlen(a);
    a[len-4] = '\0';    //deletes '.raw'
    //printf("%s\n",a);

    char b[50];
    sprintf(b, "_%dx%dx%d_out.raw", width, height, depth);

    //char b[]="_out.raw";  //Add suffix out to filename

    char buffer[256]; // <- danger, only storage for 256 characters.
    strncpy(buffer, a, sizeof(buffer));
    strncat(buffer, b, sizeof(buffer));
    free(a);

    FILE *fp = fopen(buffer, "wb"); //Open or create file for writing as binary, all existing data is cleared

    if (!fp)
    {
        fprintf(stderr, "Error opening or creating file '%s'\n", buffer);
        return 0;
    }

    size_t write = fwrite(data, 1, size, fp);
    fclose(fp);

    if (write==size)
    {    
         printf("Wrote %lu bytes to '%s'\n", write, buffer);
         return 0;
    }
    else
    {
        printf("Error writing data to file '%s'\n", buffer);    
        return 1;
    }
}

// General initialization call for CUDA Device
int chooseCudaDevice(int argc, char **argv)
{
    int result = 0;

    result = findCudaDevice(argc, (const char **)argv);

    return result;
}

void runAutoTest(char *exec_path, char *PathToFile)
{
    // set path
    char *path;
    if (PathToFile == NULL)
    {
        path = sdkFindFilePath(volumeFilename, exec_path);
    }
    else
    {
        path = PathToFile;
    }

    if (path == NULL)
    {
        fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

    // Allocate output memory
    checkCudaErrors(cudaMalloc((void **)&d_output, width*height*depth*sizeof(uchar)));

    // zero out the output array with cudaMemset
    cudaMemset(d_output, 0, width*height*depth*sizeof(uchar));

    // render the volumeData
    render_kernel(gridSize, blockSize, d_output, width, height, depth);

    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("render_kernel failed");

    uchar *h_output = (uchar*)malloc(width*height*depth);
    checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*depth*sizeof(uchar), cudaMemcpyDeviceToHost));
    int wState=writeRawFile(path,h_output,width*height*depth);

    checkCudaErrors(cudaFree(d_output));
    free(h_output);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    //exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}


void loadVolumeData(char *exec_path, char *PathToFile)
{
    char *path;
    // load volume data
    if (PathToFile == NULL)
    {
        path = sdkFindFilePath(volumeFilename, exec_path);
    }
    else
    {
        path = PathToFile;
    }

    if (path == NULL)
    {
        fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

    size_t size = volumeSize.width*volumeSize.height*volumeSize.depth;
    uchar *h_volume = loadRawFile(path, size);
    //int wState=writeRawFile(path,h_volume,size);

    initCuda(h_volume, volumeSize);

    free(h_volume);
}


////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    char *image_file = NULL;

    printf("%s Starting...\n\n", sSDKsample);

    if (checkCmdLineFlag(argc, (const char **)argv, "file"))    //Note cmd line argument is -file "PathToFile/File.raw"
    {                                                           // for example -file "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v7.0\2_Graphics\simpleTexture3D_FanBeamCorr\data\TestOCT_Kopie.raw"            
        getCmdLineArgumentString(argc, (const char **)argv, "file", &image_file);
    }

    if (image_file)
    {
        chooseCudaDevice(argc, argv);

        loadVolumeData(argv[0],image_file);

        runAutoTest(argv[0],image_file);
    }
    else
    {
        // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        chooseCudaDevice(argc, argv);

        loadVolumeData(argv[0],NULL);

        runAutoTest(argv[0],NULL);
    }

    printf("I am finished...\n"
           "Can I get some ice cream please\n");

    exit(EXIT_SUCCESS);
}

And this is my .cu

#ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_


#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <helper_cuda.h>
#include <helper_math.h>

typedef unsigned int  uint;
typedef unsigned char uchar;

texture<uchar, 3, cudaReadModeNormalizedFloat> tex;  // 3D texture

cudaArray *d_volumeArray = 0;

__global__ void
d_render(uint *d_output, uint imageW, uint imageH, uint imageD)
{
    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
    uint z = __umul24(blockIdx.z, blockDim.z) + threadIdx.z;

 //   float u = x / (float) imageW;
 //   float v = y / (float) imageH;
    //float w = z / (float) imageD;
 //   // read from 3D texture
 //   float voxel = tex3D(tex, u, v, w);
    uint ps=__umul24(imageW,imageH);

    if ((x < imageW) && (y < imageH) && (z < imageD))
    {
        // write output color
        uint i = __umul24(z,ps) +__umul24(y, imageW) + x;
        d_output[1] = (uchar) 255;//+0*voxel*255;
    }
}

extern "C"
void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
    // create 3D array
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
    checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));

    // copy data to 3D array
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr   = make_cudaPitchedPtr((void *)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
    copyParams.dstArray = d_volumeArray;
    copyParams.extent   = volumeSize;
    copyParams.kind     = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&copyParams));

    // set texture parameters
    tex.normalized = true;                      // access with normalized texture coordinates
    tex.filterMode = cudaFilterModeLinear;      // linear interpolation
    tex.addressMode[0] = cudaAddressModeBorder;   // wrap texture coordinates
    tex.addressMode[1] = cudaAddressModeBorder;
    tex.addressMode[2] = cudaAddressModeBorder;

    // bind array to 3D texture
    checkCudaErrors(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
}

extern "C"
void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD)
{
    d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, imageD);
}

#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_

As you can see, currently, I set all values to zero except the index = 1, which is set to 255. Yet when I now open the image stack in Fiji, I see that the fourth pixel on the first slide is white. If I use index=i instead, I get white vertical lines across the image stack periodically every four columns. Generally spoken, it seems that only every fourth element is beeing indexed in the CudaArray. So I am wondering if there is somekind of error here resulting from sizeof(uchar)=1 and sizeof(uint)=4. There would obviously be the factor 4 :)

I am eager to here from you experts

Cheers Mika

1

There are 1 answers

0
Mika On BEST ANSWER

I figured it out by myself. The kernel works with uint* d_output while the copy to the host is written into a uchar* h_output

uchar *h_output = (uchar*)malloc(width*height*depth);
checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*depth*sizeof(uchar), cudaMemcpyDeviceToHost));

This led to this strange behavior