I'm an experienced C++ programmer, used to low level optimization an I'm trying to get performances out of Go.
So far, I'm interested in GFlop/s.
I wrote the following go code:
package main
import (
"fmt"
"time"
"runtime"
"sync"
)
func expm1(x float64) float64 {
return ((((((((((((((15.0 + x) * x + 210.0) * x + 2730.0) * x + 32760.0) * x + 360360.0) * x + 3603600.0) * x + 32432400.0) * x + 259459200.0) * x + 1816214400.0) * x + 10897286400.0) * x + 54486432000.0) * x + 217945728000.0) *
x + 653837184000.0) * x + 1307674368000.0) * x * 7.6471637318198164759011319857881e-13;
}
func twelve(x float64) float64 {
return expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1(x))))))))))));
}
func populate(data []float64, N int) {
CPUCOUNT := runtime.NumCPU();
var wg sync.WaitGroup
var slice = N / CPUCOUNT;
wg.Add(CPUCOUNT)
defer wg.Wait()
for i := 0; i < CPUCOUNT; i++ {
go func(ii int) {
for j := ii * slice; j < ii * slice + slice; j += 1 {
data[j] = 0.1;
}
defer wg.Done();
}(i);
}
}
func apply(data []float64, N int) {
CPUCOUNT := runtime.NumCPU();
var wg sync.WaitGroup
var slice = N / CPUCOUNT;
wg.Add(CPUCOUNT)
defer wg.Wait()
for i := 0; i < CPUCOUNT; i++ {
go func(ii int) {
for j := ii * slice; j < ii * slice + slice; j += 8 {
data[j] = twelve(data[j]);
data[j+1] = twelve(data[j+1]);
data[j+2] = twelve(data[j+2]);
data[j+3] = twelve(data[j+3]);
data[j+4] = twelve(data[j+4]);
data[j+5] = twelve(data[j+5]);
data[j+6] = twelve(data[j+6]);
data[j+7] = twelve(data[j+7]);
}
defer wg.Done();
}(i);
}
}
func Run(data []float64, N int) {
populate(data, N);
start:= time.Now();
apply(data, N);
stop:= time.Now();
elapsed:=stop.Sub(start);
seconds := float64(elapsed.Milliseconds()) / 1000.0;
Gflop := float64(N) * 12.0 * 15.0E-9;
fmt.Printf("%f\n", Gflop / seconds);
}
func main() {
CPUCOUNT := runtime.NumCPU();
fmt.Printf("num procs : %d\n", CPUCOUNT);
N := 1024*1024*32 * CPUCOUNT;
data:= make([]float64, N);
for i := 0; i < 100; i++ {
Run(data, N);
}
}
which is an attempt of translation from my c++ benchmark which yields 80% of peak flops.
The C++ version yields 95 GFlop/s where the go version yields 6 GFlops/s (FMA counter for 1).
Here is a piece of the go assembly (gccgo -O3 -mfma -mavx2):
vfmadd132sd %xmm1, %xmm15, %xmm0
.loc 1 12 50
vfmadd132sd %xmm1, %xmm14, %xmm0
.loc 1 12 64
vfmadd132sd %xmm1, %xmm13, %xmm0
.loc 1 12 79
vfmadd132sd %xmm1, %xmm12, %xmm0
.loc 1 12 95
vfmadd132sd %xmm1, %xmm11, %xmm0
.loc 1 12 112
vfmadd132sd %xmm1, %xmm10, %xmm0
And what I get from my c++ code (g++ -fopenmp -mfma -mavx2 -O3):
vfmadd213pd .LC3(%rip), %ymm12, %ymm5
vfmadd213pd .LC3(%rip), %ymm11, %ymm4
vfmadd213pd .LC3(%rip), %ymm10, %ymm3
vfmadd213pd .LC3(%rip), %ymm9, %ymm2
vfmadd213pd .LC3(%rip), %ymm8, %ymm1
vfmadd213pd .LC3(%rip), %ymm15, %ymm0
vfmadd213pd .LC4(%rip), %ymm15, %ymm0
vfmadd213pd .LC4(%rip), %ymm14, %ymm7
vfmadd213pd .LC4(%rip), %ymm13, %ymm6
vfmadd213pd .LC4(%rip), %ymm12, %ymm5
vfmadd213pd .LC4(%rip), %ymm11, %ymm4
I therefore have a few questions, most important of which is :
- Do I express parallelism the right way ?
and if not, how should I do that ?
For additional performance improvements, I'd need to know what's wrong with the following items :
- Why do I see only vfmadd132sd instructions in the assembly, instead of vfmadd132pd?
- How can I properly align memory allocations?
- How can I remove debug info from the generated executable?
- Do I pass the right options to gccgo?
- Do I use the right compiler?
No. You might be trashing the CPU cache. (But this is hard to tell without knowing details about your system. Guess it's not NUMA?). Anyway, technically your code is concurrent not parallel.
Because the compiler put it there. Is this a compiler question or a programming question?
That depends on your definition of "properly". Struct field and slice alignments are not ad hoc controllable, but you can reorder struct fields (which you did not use at all, so I do not know what you are asking here).
Consult the documentation of gcc.
I do not know.
What makes a compiler "right"?