i have some cuda code with a 2d kernel function like this:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define row 65
#define col 13824
__global__ void tt(int *pp){
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    for(unsigned ig=0;ig<33;ig++){
        pp[i*col+j]+=1;
    }
    return;
}
int main(){
    int *pp;
    int *rr;
    pp=(int*)malloc(sizeof(int)*col*row);
    rr=(int*)malloc(sizeof(int)*col*row);
    memset(pp,0,sizeof(int)*row*col);
    int *pp_g;
    cudaMalloc((void**)&pp_g,sizeof(int)*row*col);
    cudaMemcpy(pp_g,pp,sizeof(int)*row*col,cudaMemcpyHostToDevice);
    dim3 block(32,32,1);
    dim3 grid(row/32+1,col/32+1,1);
    tt<<<grid,block>>>(pp_g);
    cudaDeviceSynchronize();
    cudaMemcpy(rr,pp_g,sizeof(int)*row*col,cudaMemcpyDeviceToHost);
    int ct=0;
    for(unsigned i=0;i<row*col;i++){
        if(rr[i]!=33){
            //printf("%d\n",rr[i]);
            ct++;
        }
        //printf("%d\n",rr[i]);
    }
    printf("%d\n",ct);
    return 0;
}
the excepted results in rr array should all be 33, however, the actual results are all 0. But when I change block and grid to block(8,8,1) and grid(row/8+1,col/8+1,1), some results in rr array become correct while there are still 512 wrong results being 0 in rr array.
I cannot figure out what's wrong with my code. I want to know what happend with my results.
 
                        
The threads of your kernel perform illegal memory accesses when
i >= rowandj >= col. Also, wheni >= rowandj < col, the threads perform memory writes concurrently with the "legal" threads.Checking CUDA errors would have helped you understand the problem. See this post.