I want to use global variables to keep data in memory between kernel calls, but I cannot figure out how to allocate that memory and how to access it correctly. The code below gives the wrong output.

I need to use pointers because my device code "outputs" an unknown amount of data. The program is going to look for strongly connected vertices in a directed graph - obviously there could be none, just as well there might be millions. The idea is to call the kernel, ask how big the data is, and then memcpy it to host memory. I'm new to CUDA, so I don't know any design patterns that address my issue.
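To make that flow concrete, this is roughly the two-pass scheme I imagine, written with an ordinary cudaMalloc'd pointer and placeholder kernels (countResults, fillResults, and the tid % 2 test are stand-ins I made up, not my real graph code):

#include <cuda_runtime.h>
#include <cstdlib>

// Pass 1: count how many results there will be
__global__ void countResults(int* d_count, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n && tid % 2 == 0)        // stand-in for the real test
        atomicAdd(d_count, 1);
}

// Pass 2: write the results into a buffer sized from pass 1
__global__ void fillResults(int* d_out, int* d_pos, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n && tid % 2 == 0)
        d_out[atomicAdd(d_pos, 1)] = tid;
}

void runTwoPass(int n) {
    int* d_count;
    cudaMalloc(&d_count, sizeof(int));
    cudaMemset(d_count, 0, sizeof(int));

    countResults<<<(n + 255) / 256, 256>>>(d_count, n);

    int count = 0;                      // 1) ask how big the data is
    cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDeviceToHost);

    int* d_out;
    cudaMalloc(&d_out, count * sizeof(int));
    cudaMemset(d_count, 0, sizeof(int));   // reuse the counter as a write cursor
    fillResults<<<(n + 255) / 256, 256>>>(d_out, d_count, n);

    int* out = (int*)malloc(count * sizeof(int));   // 2) memcpy it to the host
    cudaMemcpy(out, d_out, count * sizeof(int), cudaMemcpyDeviceToHost);

    free(out);
    cudaFree(d_out);
    cudaFree(d_count);
}

What I cannot get working is the version below, where the buffer is supposed to live behind a global __device__ pointer instead of a pointer passed around by hand.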

#include <cuda_runtime.h>
#include <iostream>
#include <cstdlib>

// Global device-side pointer that should survive between kernel calls
__device__ int* d_array;

__global__ void doCalculation(int* adr, int testSize) {
    // Flatten the 3D block/grid coordinates into one global thread index
    int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
    int threadPosInBlock = threadIdx.x +
        blockDim.x * threadIdx.y +
        blockDim.x * blockDim.y * threadIdx.z;
    int blockPosInGrid = blockIdx.x +
        gridDim.x * blockIdx.y +
        gridDim.x * gridDim.y * blockIdx.z;
    int tid = blockPosInGrid * threadsPerBlock + threadPosInBlock;

        // "calculations"
    if (tid < testSize) {
        adr[tid] = 1;
    }
}

void startCalculation() {
    dim3 block(8, 8, 8);
    dim3 grid(16,16);
    int testSize = 5;

    // This is where I try to allocate the buffer behind the global d_array
    // pointer and get an address I can pass to the kernel
    cudaMalloc(&d_array, testSize * sizeof(int));
    int* d_addr;
    cudaGetSymbolAddress((void**)&d_addr, d_array);

    doCalculation<<<grid, block>>>(d_addr, testSize);
    cudaDeviceSynchronize();

    // Trying to copy the results back to the host through the symbol
    int* array = (int*)malloc(testSize * sizeof(int));
    cudaMemcpyFromSymbol(array, d_array, testSize * sizeof(int));

    for (int i = 0; i < testSize; i++)
    {
        std::cout << "Array at " << i << " is " << array[i] << std::endl;
    } //This outputs -842150451 in my case
}
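My current guess is that the allocation has to go into an ordinary pointer first, and that this pointer value then gets copied into the symbol with cudaMemcpyToSymbol, roughly like the untested sketch below (startCalculationGuess and d_buf are just names for illustration). Is that the right pattern, and is a plain cudaMemcpy from that pointer also how the data should come back?

void startCalculationGuess() {
    dim3 block(8, 8, 8);
    dim3 grid(16, 16);
    int testSize = 5;

    // Allocate into an ordinary host-side pointer variable...
    int* d_buf = nullptr;
    cudaMalloc(&d_buf, testSize * sizeof(int));

    // ...then store that device address inside the global __device__ pointer,
    // so later kernels could also reach the buffer through d_array
    cudaMemcpyToSymbol(d_array, &d_buf, sizeof(int*));

    doCalculation<<<grid, block>>>(d_buf, testSize);
    cudaDeviceSynchronize();

    // Copy the data itself (not the pointer) back to the host
    int* array = (int*)malloc(testSize * sizeof(int));
    cudaMemcpy(array, d_buf, testSize * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < testSize; i++)
        std::cout << "Array at " << i << " is " << array[i] << std::endl;

    free(array);
    cudaFree(d_buf);
}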
