Each time I run this kernel, I get some different value. I want to sum up the res array after second loop computation (something like np.sum(res, axis = 0)). When I do np.sum outside the kernel I get the same value but inside the kernel it is not possible. Probably barrier is unable to fence the memory properly. Any help would be much appreciated.
kernelsource = """
__kernel void forceFinder(
const int N,
const int dim,
const float sigma,
__global float* datacl,
__constant float* poscl,
__global float* res)
{
int i = get_global_id(0); // Global id
float f_sum ;
int k;
float sigma2 = sigma * sigma;
f_sum = 0;
for (k = 0; k < dim; k++)
{
f_sum += pown((poscl[k] - datacl[i * dim + k]), 2);
}
for (k = 0; k < dim; k++)
{
res[i * dim + k] = (datacl[i * dim + k] - poscl[k]) * exp(-f_sum/sigma2)/sigma2;
}
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
for(k=0; k<dim; k++)
{
res[k] += res[i*dim+k];
}
}
"""