OpenMP For-Loop Parallelization

141 views Asked by At

I am trying to parallelize the following piece of code with OpenMP.

#pragma omp parallel for collapse(2)
{
    for (int mNdx = 0; mNdx < M; ++mNdx)
    {
        for (int nNdx = mNdx; nNdx < N; ++nNdx)
        {
            for (int elemNdx = mNdx; elemNdx <= nNdx; ++elemNdx)
            {
                result[mNdx * N + nNdx] += matrixOne[mNdx * N + elemNdx] * matrixTwo[elemNdx * N + nNdx];
            }
        }
    }
}

But the speedup is always about 1.0 (± 0.05). I tried every possible schedule kind (auto, dynamic, static and guided), different chunk sizes, and both with and without collapse(2). The runtime doesn't change at all ...

Can anyone explain why, or where my mistake is?
Could it be due to automatic for-loop parallelization by the compiler?

Thanks in advance for any advice or tips!

Update:

As requested, a verifiable example:

#include <omp.h>

#include <cstdlib>
#include <iostream>
#include <vector>

// Fills a row-major M x N matrix: zeros below the diagonal, a repeating
// test pattern on and above it.
void initMatrix(float* mat, const int M, const int N);
// Sets every element of a row-major M x N matrix to 0.0f.
void initResMatrix(float* mat, const int M, const int N);

// Both kernels accumulate (+=) the upper-triangular product of matrixOne and
// matrixTwo into result and return the elapsed time reported by
// omp_get_wtime(). The first is the serial version, the second the OpenMP one.
double matMulUpperTriangular_C(float* matrixOne, float* matrixTwo, float* result, const int M, const int N);
double matMulUpperTriangular_Omp(float* matrixOne, float* matrixTwo, float* result, const int M, const int N);

/**
 * Benchmark driver: builds two upper-triangular 2048 x 2048 input matrices,
 * runs the serial and the OpenMP kernel on them and prints both runtimes
 * together with the resulting speedup.
 */
int main()
{
    const int M = 2048;
    const int N = 2048;
    const std::size_t elemCount = static_cast<std::size_t>(M) * static_cast<std::size_t>(N);

    // std::vector owns the buffers: they are released on every exit path and
    // an allocation failure throws instead of handing a null pointer to the
    // kernels (the raw malloc calls were neither checked nor freed).
    std::vector<float> matOne(elemCount);
    std::vector<float> matTwo(elemCount);
    std::vector<float> res(elemCount);

    initMatrix(matOne.data(), M, N);
    initMatrix(matTwo.data(), M, N);
    initResMatrix(res.data(), M, N);

    double timeConsumption[2] = { 0.0, 0.0 };

    timeConsumption[0] = matMulUpperTriangular_C(matOne.data(), matTwo.data(), res.data(), M, N);

    // Both kernels accumulate with +=, so the result buffer has to be cleared
    // again; otherwise the OpenMP run would add onto the serial result.
    initResMatrix(res.data(), M, N);
    timeConsumption[1] = matMulUpperTriangular_Omp(matOne.data(), matTwo.data(), res.data(), M, N);

    std::cout << "Runtime C:\t\t" << timeConsumption[0] << "s" << '\n';
    std::cout << "Runtime Omp:\t\t" << timeConsumption[1] << "s";
    // std::endl flushes the output before the console is paused below.
    std::cout << " | SpeedUp: " << timeConsumption[0] / timeConsumption[1] << std::endl;

    // Keeps the console window open; "PAUSE" is a Windows shell command.
    std::system("PAUSE");
    return 0;
}

/**
 * Fills the row-major M x N matrix `mat` with an upper-triangular pattern.
 *
 * Elements below the main diagonal (column < row) are set to 0; every other
 * element is set to ((row + column) % 5 + 1) * 0.1f.
 *
 * Only the M * N elements of the matrix are written. The column index is
 * always bounded by N, so a matrix with more rows than columns no longer
 * causes writes past the end of a row or of the buffer.
 *
 * @param mat  Buffer with room for at least M * N floats.
 * @param M    Number of rows.
 * @param N    Number of columns (also the row stride).
 */
void initMatrix(float* mat, const int M, const int N)
{
    for (int row = 0; row < M; ++row)
    {
        const int rowOffset = row * N;
        for (int col = 0; col < N; ++col)
        {
            // Strictly-lower part is zero, diagonal and above get the pattern.
            mat[rowOffset + col] = (col < row) ? 0.0f : ((row + col) % 5 + 1) * 0.1f;
        }
    }
}

/**
 * Clears a row-major M x N matrix by writing 0.0f to each of its elements.
 *
 * @param mat  Buffer holding the matrix.
 * @param M    Number of rows.
 * @param N    Number of columns (also the row stride).
 */
void initResMatrix(float* mat, const int M, const int N)
{
    int row = 0;
    while (row < M)
    {
        int col = 0;
        while (col < N)
        {
            const int flatIdx = row * N + col;
            mat[flatIdx] = 0.0f;
            ++col;
        }
        ++row;
    }
}

/**
 * Serial reference kernel for the upper-triangular matrix product.
 *
 * For every element on or above the diagonal (column >= row) the products
 * matrixOne[row][k] * matrixTwo[k][column] with k in [row, column] are added
 * onto result[row][column]. Elements below the diagonal are not touched.
 * All matrices are addressed row-major with a row stride of N.
 *
 * @return Difference of the omp_get_wtime() readings taken before and after
 *         the computation.
 */
double matMulUpperTriangular_C(float* matrixOne, float* matrixTwo, float* result, const int M, const int N)
{
    const double tBegin = omp_get_wtime();

    for (int row = 0; row < M; ++row)
    {
        const int rowBase = row * N;
        for (int col = row; col < N; ++col)
        {
            const int outIdx = rowBase + col;
            // k is limited to [row, col]; presumably because the factors
            // outside that range are zero for upper-triangular inputs.
            for (int k = row; k <= col; ++k)
            {
                result[outIdx] += matrixOne[rowBase + k] * matrixTwo[k * N + col];
            }
        }
    }

    const double tEnd = omp_get_wtime();
    return tEnd - tBegin;
}

/**
 * OpenMP version of the upper-triangular matrix product.
 *
 * For every element on or above the diagonal (nNdx >= mNdx) the products
 * matrixOne[mNdx][e] * matrixTwo[e][nNdx] with e in [mNdx, nNdx] are added
 * onto result[mNdx][nNdx]. All matrices are addressed row-major with a row
 * stride of N. The rows of the result are distributed over the threads.
 *
 * @param matrixOne  Left factor.
 * @param matrixTwo  Right factor.
 * @param result     Accumulator; the products are added to its current content.
 * @param M          Number of rows to process.
 * @param N          Number of columns (also the row stride).
 * @return Difference of the omp_get_wtime() readings taken before and after
 *         the computation.
 */
double matMulUpperTriangular_Omp(float* matrixOne, float* matrixTwo, float* result, const int M, const int N)
{
    const double startTime = omp_get_wtime();

    // The directive must be followed directly by the for statement, so the
    // extra braces are gone. collapse(2) is dropped as well: the inner loop
    // starts at mNdx, and such a triangular nest may not be collapsed before
    // OpenMP 5.0. Row mNdx visits N - mNdx columns, so the rows are scheduled
    // dynamically to balance the unequal work per row.
    #pragma omp parallel for schedule(dynamic)
    for (int mNdx = 0; mNdx < M; ++mNdx)
    {
        for (int nNdx = mNdx; nNdx < N; ++nNdx)
        {
            // A row is processed by a single thread, so no two threads ever
            // write the same element of result.
            for (int elemNdx = mNdx; elemNdx <= nNdx; ++elemNdx)
            {
                result[mNdx * N + nNdx] += matrixOne[mNdx * N + elemNdx] * matrixTwo[elemNdx * N + nNdx];
            }
        }
    }

    const double endTime = omp_get_wtime();

    return endTime - startTime;
}

Solved:

Stupid mistake ... I had set the /MP flag (Build with Multiple Processes) instead of /openmp. Now the speedup is about 2.0 :)

Sorry, and thank you all for your help!

0

There are 0 answers