I have written a skeletonize function already. It is running well on CPU but when I try to offload to intel graphics card, it returns no data or doesn't even return any value. What am I doing wrong? I am using C++, OpenCV and Intel parallels libraries.
Here is the code : the main part is the #pragma
void thinning(cv::Mat & inputarray, cv::Mat & outputarray)
{
//....
cv::Mat p_enlarged_src = cv::Mat(rows + 2, cols + 2, CV_32FC1);
//for intel offload
float* matData_penlarged_src = (float*)p_enlarged_src.data;
size_t elem_step = p_enlarged_src.step / sizeof(float);
#ifdef __INTEL_OFFLOAD
#pragma offload target(gfx) inout(matData_penlarged_src[0:(rows+1)*(cols+1)]) in(elem_step)
_Cilk_for(int i = 0; i < (rows+2); i++)
{
//p_enlarged_src.at<float>(i, 0) = 0.0f;
matData_penlarged_src[i * elem_step + 0] = 0.0f;
//p_enlarged_src.at<float>( i, cols+1) = 0.0f;
matData_penlarged_src[i * elem_step + (cols+1)] = 0.0f;
}
#pragma offload target(gfx) inout(matData_penlarged_src[0:(rows+1)*(cols+1)]) in(elem_step)
_Cilk_for(int j = 0; j < (cols+2); j++)
{
//p_enlarged_src.at<float>(0, j) = 0.0f;
matData_penlarged_src[0 * elem_step + j] = 0.0f;
//p_enlarged_src.at<float>(rows+1, j) = 0.0f;
matData_penlarged_src[(rows+1) * elem_step + j] = 0.0f;
}
#pragma offload target(gfx) inout(matData_penlarged_src[0:(rows+1)*(cols+1)]) in(elem_step)
_Cilk_for(int i = 0; i < rows; i++)
{
_Cilk_for(int j = 0; j < cols; j++)
{
//if (inputarray.at<float>(i, j) >= 20.0f)
if(matData_penlarged_src[i * elem_step + j] >= 20.0f)
{
//p_enlarged_src.at<float>( i+1, j+1) = 1.0f;
matData_penlarged_src[(i+1) * elem_step + (j+1)] = 1.0f;
}
else
//p_enlarged_src.at<float>( i+1, j+1) = 0.0f;
matData_penlarged_src[(i+1) * elem_step + (j+1)] = 0.0f;
}
}
#else
//same code like above, but only on the CPU and it works!
}
#endif