openMP bad performance with false sharing

734 views Asked by At

I know there is already a thread about this (openMP performance),

but my example here is very simple.

C code:

/**
 * Fill a freshly allocated buffer of szGlobalWorkSize * 100 floats with 0.5f
 * inside an OpenMP "parallel for" and print the elapsed wall-clock time of
 * the loop as "<seconds> Time OMP".
 *
 * @param szGlobalWorkSize  work-size multiplier; the buffer holds 100 floats
 *                          per unit of work size.
 * @return 0 on success, -1 if the requested size overflows or the buffer
 *         cannot be allocated.
 */
int MaFunc(size_t szGlobalWorkSize)
{
        const float fValue = 0.5f;
        const size_t szPerItem = 100;
        size_t szCount;
        long long llCount;
        long long iGID;
        float *pfResult;
        struct timeval tim;
        double tLaunch1;
        double tLaunch2;

        /* Reject sizes whose byte count would not fit in size_t; this also
         * guarantees the element count fits in the signed loop index. */
        if (szGlobalWorkSize > (size_t)-1 / szPerItem / sizeof(float))
                return -1;
        szCount = szGlobalWorkSize * szPerItem;
        llCount = (long long)szCount;

        pfResult = calloc(szCount, sizeof *pfResult);
        /* calloc(0, ...) may legitimately return NULL, so only a non-empty
         * request counts as a failure. */
        if (pfResult == NULL && szCount != 0)
                return -1;

        gettimeofday(&tim, NULL);
        tLaunch1 = tim.tv_sec + (tim.tv_usec / 1000000.0);

        /* iGID is the index of the associated loop, so OpenMP gives every
         * thread its own private copy. */
        #pragma omp parallel for
        for (iGID = 0; iGID < llCount; iGID++)
        {
          pfResult[iGID] = fValue;
        }

        gettimeofday(&tim, NULL);
        tLaunch2 = tim.tv_sec + (tim.tv_usec / 1000000.0);
        printf("%.6lf Time OMP\n", tLaunch2 - tLaunch1);

        free(pfResult);
        return 0;
}

The run time of this example increases when I use OpenMP: 0.015 s without OpenMP versus 0.045 s with OpenMP (szGlobalWorkSize = 131072).

I use this line of gcc: gcc -march=native -fopenmp -O3 MyCode.c -lm

gcc (GCC) 4.8.2 20140120 (Red Hat 4.8.2-15)

Edit1:

/**
 * Chunked variant of the fill benchmark: the page-aligned buffer is written
 * in chunks of getpagesize() floats, one chunk per iteration of the parallel
 * loop, and the elapsed time is reported twice (gettimeofday and
 * omp_get_wtime), together with the page size.
 *
 * Reads szGlobalWorkSize, which is defined outside this function.
 * Elements past the last whole chunk are not written, because the chunk
 * count is obtained by integer division.
 *
 * @return 0 on success, -1 if the buffer cannot be allocated.
 */
int MyFunc2()
{
        const float fValue = 0.5f;
        int iGID = 0;
        int iChunk;
        int iSize;
        float *pfResult;
        struct timeval tim;
        double tLaunch1;
        double tLaunch2;
        double dElapsed;

        pfResult = valloc(szGlobalWorkSize * 100 * sizeof(float));
        if (pfResult == NULL)
                return -1;

        gettimeofday(&tim, NULL);
        tLaunch1 = tim.tv_sec + (tim.tv_usec / 1000000.0);
        dElapsed = omp_get_wtime();

        iChunk = getpagesize();
        iSize = ((int)szGlobalWorkSize * 100) / iChunk;

        #pragma omp parallel for
        for (iGID = 0; iGID < iSize; iGID++)
        {
          /* The inner index must live inside the parallel loop body: a
           * variable declared outside the construct is shared by all
           * threads, and concurrent updates to it are a data race. */
          int j;
          float *pfChunk = pfResult + (size_t)iGID * (size_t)iChunk;

          for (j = 0; j < iChunk; j++)
          {
             pfChunk[j] = fValue;
          }
        }

        dElapsed = omp_get_wtime() - dElapsed;
        gettimeofday(&tim, NULL);
        tLaunch2 = tim.tv_sec + (tim.tv_usec / 1000000.0);

        printf("%.6lf Time OMP\n", tLaunch2 - tLaunch1);
        printf("Pagesize=%d\n", getpagesize());
        printf("%.6lf Time OMP2\n", dElapsed);

        free(pfResult);
        return 0;
}

I also get the same time with this chunked version when allocating with memalign.

Edit 2, with per-thread timing:

#pragma omp parallel private(dLocalTime)
    {
        /* Each thread records its own start time in its slot of pdTime and
         * later overwrites that slot with its elapsed time. */
        const int iThread = omp_get_thread_num();

        pdTime[iThread] = omp_get_wtime();
        printf("Thread Begin %d Time %f\n", iThread, pdTime[iThread]);

        /* Work-sharing loop: the iterations are divided among the threads
         * of the enclosing parallel region. */
        #pragma omp for
        for (iGID = 0; iGID < iSize; iGID++)
        {
            pfResult[iGID] = fValue;
        }

        /* The "omp for" above has no nowait clause, so every thread waits at
         * its implicit barrier before reaching this point; the elapsed time
         * stored below therefore includes that wait. */
        pdTime[iThread] = omp_get_wtime() - pdTime[iThread];
        printf("Thread End %d Time %f\n", iThread, pdTime[iThread]);
    }

Each thread takes 0.015 s, for a total of 0.045 s, so there seems to be a fixed OpenMP overhead of 0.03 s. It is strange that we still see this fixed overhead even with huge sizes, and that each thread, which has less work to do, takes the same time as processing the whole array (48 threads here).

Thanks

1

There are 1 answers

3
Anton On BEST ANSWER

Ok, since you insist.. :)

With fixed threads warm-up:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <omp.h>
#include <unistd.h>

/*
 * Benchmark driver: fills a page-aligned buffer of szGlobalWorkSize * 100
 * floats in chunks of getpagesize() floats using an OpenMP parallel loop,
 * then prints the elapsed time measured with gettimeofday ("Time1") and
 * with omp_get_wtime ("Time2").
 *
 * Build with -DWARMUP to run an empty parallel region before the timed
 * section, so the threads already exist when the measurement starts.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated.
 */
int main()
{
        int szGlobalWorkSize = 131072;
        int iGID = 0;
        /* Do not let the runtime adjust the number of threads between
         * parallel regions. */
        omp_set_dynamic(0);
        // warmup
        #if WARMUP
        #pragma omp parallel
        {
        #pragma omp master
        {
        printf("%d threads\n", omp_get_num_threads());
        }
        }
        #endif
        printf("Pagesize=%d\n", getpagesize());
        float *pfResult = (float *)valloc(szGlobalWorkSize * 100* sizeof(float));
        if (pfResult == NULL)
        {
          perror("valloc");
          return 1;
        }
        float fValue = 0.5f;
        struct timeval tim;
        gettimeofday(&tim, NULL);

        double tLaunch1=tim.tv_sec+(tim.tv_usec/1000000.0);
        double time = omp_get_wtime();
        int iChunk = getpagesize();
        /* Number of whole chunks; integer division drops any partial tail. */
        int iSize = ((int)szGlobalWorkSize * 100) / iChunk;

        #pragma omp parallel for
        for (iGID = 0; iGID < iSize; iGID++)
        {
          /* Declared inside the loop body so that each thread has its own
           * copy: an index declared outside the parallel construct is shared
           * by all threads, and concurrent updates to it are a data race. */
          int j;
          for (j = 0; j < iChunk; j++)
             pfResult[iGID * iChunk + j] = fValue;
        }
        time = omp_get_wtime() - time;
        gettimeofday(&tim, NULL);
        double tLaunch2=tim.tv_sec+(tim.tv_usec/1000000.0);
        printf("%.6lf Time1\n", tLaunch2-tLaunch1);
        printf("%.6lf Time2\n", time);

        free(pfResult);
        return 0;
}

I've got the following numbers on my machine:

$ g++ -O2 -fopenmp testomp.cpp && OMP_NUM_THREADS=1 ./a.out
Pagesize=4096
0.036493 Time1
0.036489 Time2
$ g++ -O2 -fopenmp testomp.cpp && ./a.out
Pagesize=4096
0.034721 Time1
0.034718 Time2
$ g++ -O2 -fopenmp testomp.cpp -DWARMUP && ./a.out
24 threads
Pagesize=4096
0.026966 Time1
0.026963 Time2

As you can see, thread-creation time contributes a lot to the numbers.

Why does it still not scale? Well, this is an extremely memory-bound workload. In fact, each page is filled twice: first the OS clears it on the first touch, then the program fills it with the value. It seems there is simply not enough memory bandwidth in the system. I would not expect false sharing to play a significant role here, since parallel for uses a static schedule by default, which does not interleave iterations between threads, so false sharing is only possible once, at the chunk boundaries.