opencl kernel float4 issues

478 views Asked by At

I am working on an OpenCL implementation of the Fruchterman–Reingold layout algorithm, and I have it pretty much working when compared to the CPU version I have already implemented. However, I noticed that for large graphs there is a bottleneck in the repel function, which calculates the repelling force between every pair of vertices in the graph. I decided to try using the OpenCL float4 type to reduce this bottleneck, but I'm getting mixed results.

I've set up the code so I can use float4 while actually only using one of its components (for debugging). When I use only a single component (float_workers = 1), the function works correctly. (Image: Correct behaviour)

However, when I set float_workers > 1, I get increasingly bizarre behaviour. (Image: Incorrect behaviour, float_workers = 4)

I can't figure out what is wrong with my code - I am using JOCL, as such the entire kernel is in a string in my Java class, so lines such as: gid *= " + String.valueOf(float_workers) + "; do make sense, and compile.

Line markers (1) to (2) set up the float4 with the correct number of items (1-4 items, depending on float_workers). This loop maps a GID to a pair of nodes (when float_workers = 1) or to two, three or four pairs of nodes when float_workers is higher. Line markers (2) to (3) are the actual work being performed (calculating the amount by which to repel the nodes). Line markers (3) to the end store the results in a "displacement" array, so the positions of the nodes can be updated later on. Obviously, when I enqueue the kernel I adjust the number of workers accordingly, so that doesn't seem to be the issue.

Am I correct in assuming that each xPos[...] can only be set exactly once? That's the only thing I can think of that would break this. Have I missed something else?

/*
 * repel: OpenCL kernel template that computes a repulsion displacement for
 * up to four node pairs per work item and stores it in xDisp / yDisp.
 *
 * This text is assembled inside a Java string: every float_workers
 * placeholder below is replaced by the host with an integer before the
 * kernel is compiled, and the "(1)", "(2)", "(3)" tokens are the author's
 * section markers, not kernel code.
 *
 * Parameters as used by the body:
 *   optDist      - only optDist[0] is read; it is the numerator of the force.
 *   nIndexes     - not referenced anywhere in the body.
 *   xPos, yPos   - read at indices v and u to form the per-pair deltas.
 *   xDisp, yDisp - written at gid_start + 0 .. gid_start + 3.
 *   nodes        - only nodes[0] is read; it bounds the pair-lookup loop.
 *   startIndexes - compared against gid to map a linear work index to (v, u).
 *   totalWork    - only totalWork[0] is read; work items whose scaled gid is
 *                  at or beyond it return without writing anything.
 *
 * NOTE(review): presumably xDisp / yDisp hold totalWork[0] elements and
 * startIndexes[i] is the first linear index that no longer belongs to node
 * i - 1; neither is visible here - confirm against the host code.
 */
__kernel void repel(__global const float *optDist,
          __global const int *nIndexes,
          __global const float *xPos,
          __global const float *yPos,
          __global float *xDisp,
          __global float *yDisp,
          __global int *nodes, +
          __global int *startIndexes,
          __global int* totalWork){
             /* sub is assigned here but never read afterwards. */
             int sub = nodes[0] - 1;
             /* Fold the global ids of all dimensions into one linear gid.
                With a 1-D range the loop does not run and gid is just
                get_global_id(0).
                NOTE(review): for work_dim > 1 each id is multiplied by the
                size of its own dimension; confirm this is the intended
                linearisation. */
             int work_dim = get_work_dim();
             int gid = 0;
             for(int i = 0; i < work_dim - 1; i++){
               gid += get_global_id(i) * get_global_size(i);
             }
             gid += get_global_id(work_dim - 1);
             /* Each work item covers float_workers consecutive linear indices. */
             gid *= " + String.valueOf(float_workers) + ";
             /* gid is advanced inside the loop below; keep the first index for the writes. */
             int gid_start = gid;
             if(gid >= totalWork[0]){return;}
             int v,u,i;
             v = u = i = 0;
             /* One vector component per pair; components that are never filled stay 0. */
             float4 xDelta = (float4)(0.0f,0.0f,0.0f,0.0f); 
             float4 yDelta = (float4)(0.0f,0.0f,0.0f,0.0f);
             /* Number of pairs resolved so far; also selects the component to fill. */
             int found = 0;
             /* Section (1): map gid, gid + 1, ... to node pairs (v, u) and store their deltas. */
             (1)for(i = 0; i < nodes[0]; i++){
               if(found == " + String.valueOf(float_workers) + "){
                 break;
               }
               if(gid < startIndexes[i]){
                 /* NOTE(review): if this branch is taken with i == 0, v becomes -1
                    and startIndexes[-1] is read; confirm startIndexes[0] makes
                    that impossible. */
                 v = i - 1;
                 u = (gid - startIndexes[i - 1]) + 1;
                 /* Move on to the next linear index and undo the upcoming i++ so
                    the same startIndexes[i] is tested again for it. */
                 gid ++;
                 i--;
                 if(found == 0){
                   xDelta.s0 = xPos[v] - xPos[u];
                   yDelta.s0 = yPos[v] - yPos[u];
                 }
                 if(found == 1){
                   xDelta.s1 = xPos[v] - xPos[u];
                   yDelta.s1 = yPos[v] - yPos[u];
                 }
                 if(found == 2){
                   xDelta.s2 = xPos[v] - xPos[u];
                   yDelta.s2 = yPos[v] - yPos[u];
                 }
                 if(found == 3){
                   xDelta.s3 = xPos[v] - xPos[u];
                   yDelta.s3 = yPos[v] - yPos[u];
                 }
                 found++;
               }
             }
             /* Section (2): component-wise force. A component whose deltas are
                both 0 has deltaLength 0, so its result comes from a division by
                zero; section (3) replaces exactly those components with 0. */
             (2)
             float4 deltaLength = sqrt((xDelta * xDelta) + (yDelta * yDelta));
             float4 _optDist = (float4)(optDist[0], optDist[0], optDist[0], optDist[0]);
             float4 force = _optDist / deltaLength;
             float4 xResult = (xDelta / deltaLength) * force;
             float4 yResult = (yDelta / deltaLength) * force;
             /* Section (3): write one displacement per component to gid_start + k.
                NOTE(review): components the loop never filled (found ended below
                float_workers) still store 0 at gid_start + k, and nothing here
                checks that index against totalWork[0]; confirm the output
                buffers cover it. */
             (3)
             if ((xDelta.s0 == 0) && (yDelta.s0 == 0)) {
               xDisp[gid_start + 0] = 0;
               yDisp[gid_start + 0] = 0;
             }
             else{
               xDisp[gid_start + 0] = xResult.s0;
               yDisp[gid_start + 0] = yResult.s0;
             }
             /* The remaining components are written only when float_workers is large enough. */
             if(" + String.valueOf(float_workers) + " > 1){
               if ((xDelta.s1 == 0) && (yDelta.s1 == 0)) {
                 xDisp[gid_start + 1] = 0;
                 yDisp[gid_start + 1] = 0;
               }
               else{
                 xDisp[gid_start + 1] = xResult.s1;
                 yDisp[gid_start + 1] = yResult.s1;
               }
             }
             if(" + String.valueOf(float_workers) + " > 2){
               if ((xDelta.s2 == 0) && (yDelta.s2 == 0)) {
                 xDisp[gid_start + 2] = 0;
                 yDisp[gid_start + 2] = 0;
               }
               else{
                 xDisp[gid_start + 2] = xResult.s2;
                 yDisp[gid_start + 2] = yResult.s2;
                  }
             }
             if(" + String.valueOf(float_workers) + " > 3){
               if ((xDelta.s3 == 0) && (yDelta.s3 == 0)) {
                 xDisp[gid_start + 3] = 0;
                 yDisp[gid_start + 3] = 0;
               }
               else{
                 xDisp[gid_start + 3] = xResult.s3;
                 yDisp[gid_start + 3] = yResult.s3;
               }
             }
    }
0

There are 0 answers