I wrote a simple benchmark that sums an array in two ways: using SIMD to add 4 elements at a time into a vector accumulator, versus a plain loop that accumulates into 4 separate sum variables and adds them up at the end. Here is my test code:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <immintrin.h>
#include <x86intrin.h>
/*
 * Benchmark: sum an array of doubles two ways and compare wall-clock time.
 *
 *   1. A 256-bit vector accumulator updated with _mm256_add_pd, four
 *      doubles per iteration, reduced to a scalar after the loop.
 *   2. A scalar loop unrolled by four into independent partial sums.
 *
 * Each variant is repeated `runs` times; only the last result is kept.
 * Prints both sums, both elapsed times, and the ratio time1/time2.
 *
 * Returns 0 on success, EXIT_FAILURE if the element count is unsuitable
 * or the buffer cannot be allocated.
 */
int main(void)
{
    int n = 50000000;
    int runs = 5;

    /* Both loops consume 4 elements per iteration and have no tail loop,
     * so any other element count would read past the end of the buffer. */
    if (n % 4 != 0) {
        fprintf(stderr, "n must be a multiple of 4\n");
        return EXIT_FAILURE;
    }

    /* _mm256_load_pd requires a 32-byte aligned address. */
    double *test = _mm_malloc(sizeof *test * (size_t)n, 32);
    if (test == NULL) {
        fprintf(stderr, "failed to allocate %d doubles\n", n);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) {
        test[i] = i;
    }

    /* Variant 1: vector accumulator. */
    double overalla = 0; /* stays defined even if runs == 0 */
    double time1 = omp_get_wtime();
    for (int a = 0; a < runs; a++) {
        __m256d accumulate = _mm256_setzero_pd();
        for (int i = 0; i < n; i += 4) {
            accumulate = _mm256_add_pd(_mm256_load_pd(test + i), accumulate);
        }
        /* Spill the four lanes to an aligned buffer for the final reduction. */
        double result[4] __attribute__ ((aligned (32)));
        _mm256_store_pd(result, accumulate);
        overalla = result[0] + result[1] + result[2] + result[3];
    }
    time1 = omp_get_wtime() - time1;

    /* Variant 2: four independent scalar partial sums. */
    double overall = 0; /* stays defined even if runs == 0 */
    double time2 = omp_get_wtime();
    for (int a = 0; a < runs; a++) {
        double sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0;
        for (int i = 0; i < n; i += 4) {
            sum1 += test[i];
            sum2 += test[i + 1];
            sum3 += test[i + 2];
            sum4 += test[i + 3];
        }
        overall = sum1 + sum2 + sum3 + sum4;
    }
    time2 = omp_get_wtime() - time2;

    printf("A: %f, B: %f\n", overalla, overall);
    printf("Time 1: %f, Time 2: %f\n", time1, time2);
    printf("Unroll %f times faster\n", time1/time2);

    /* Memory from _mm_malloc must be released with _mm_free, not free. */
    _mm_free(test);
    return 0;
}
I expected the SIMD version to be significantly faster (4 adds at once), but this is not the case. Can anyone explain why? The result I get from running the code is:
A: 1249999975000000.000000, B: 1249999975000000.000000
Time 1: 0.317978, Time 2: 0.207965
Unroll 1.528996 times faster
I am compiling without optimizations; the gcc options are: gcc -fopenmp -mavx -mfma -pthread