I wrote the following test cases to bench some operations:
#define BENCH_ROUNDS 1000000000 // 10**9 iterations per test
// volatile: every load/store of these must actually reach memory, so the
// compiler (even at -Og) cannot fold the benchmark loops away.
// NOTE(review): file-scope identifiers beginning with an underscore are
// reserved for the implementation (C11 7.1.3) -- consider renaming.
static volatile UINT64 _test_argument, _test_result;
static _Atomic(UINT64) _test_atom;
// Convenience wrapper: counts var from 0 up to BENCH_ROUNDS-1.
#define bench_loop(var) for (UINT64 var = 0; var < BENCH_ROUNDS; var++)
// Reference baseline: an empty loop, measuring pure loop overhead.
VOID test_empty(VOID) {
bench_loop (i);
}
// Reference baseline: pure register arithmetic, one loop-carried add per
// iteration; no memory traffic inside the loop.
VOID test_calc(VOID) {
UINT64 res = _test_argument; // volatile load: value unknown to the optimizer
bench_loop (i) {
res += i;
}
_test_result = res; // volatile store keeps res (and hence the loop) alive
}
// One volatile store per iteration -- no load, no loop-carried dependency
// through memory.
VOID test_store(VOID) {
bench_loop (i) {
_test_result = i;
}
}
// One non-atomic read-modify-write of a volatile per iteration
// (load + add + store), forming a loop-carried dependency through memory.
VOID test_modify(VOID) {
_test_result = _test_argument;
bench_loop (i) {
_test_result += i;
}
}
// Same volatile RMW as test_modify, plus one independent register add per
// iteration -- checks whether the extra ALU op hides under the RMW latency.
VOID test_modify__calc(VOID) {
_test_result = _test_argument;
UINT64 res = _test_argument;
bench_loop (i) {
_test_result += i;
res += i;
}
_test_result = res; // final store overwrites the RMW result on purpose
}
// One atomic RMW per iteration (compiles to `lock addq`, see the quoted
// assembly); default seq_cst ordering from <stdatomic.h>.
VOID test_atom_modify(VOID) {
bench_loop (i) {
_test_atom += i;
}
}
// Atomic RMW plus one independent register add per iteration.
VOID test_atom_modify__calc(VOID) {
UINT64 res = _test_argument;
bench_loop (i) {
_test_atom += i;
res += i;
}
_test_result = res; // volatile store keeps res alive
}
// Atomic RMW plus eight dependent register ops (four add/xor pairs) per
// iteration -- the anomalous case: on the author's Ryzen 7 4800H it runs
// slightly FASTER than the other atomic variants.
VOID test_atom_modify__calc2(VOID) {
UINT64 res = _test_argument;
bench_loop (i) {
_test_atom += i;
res += i, res ^= i;
res += i, res ^= i;
res += i, res ^= i;
res += i, res ^= i;
}
_test_result = res; // volatile store keeps res alive
}
They are wrapped with QueryPerformanceCounter to get execution times and compiled with gcc -Wall -Og -m64 ... (at -Og the compiler won't eliminate the predictable loops). The results given by an AMD Ryzen 7 4800H CPU @4.3GHz are like:
000233.787ms test_empty
000467.274ms test_calc
000233.846ms test_store
001869.585ms test_modify
001869.459ms test_modify__calc
004102.050ms test_atom_modify
004102.818ms test_atom_modify__calc
003928.742ms test_atom_modify__calc2
The outcome looks mostly reasonable (at least acceptable), except for the last one, where I have no idea what happened. I have tested multiple times, and test_atom_modify__calc2 always runs faster than the other test_atom_modify variants, by about 150~200ms (which amounts to less than 1 clock cycle per iteration). I also tested a similar test_modify__calc2, but it does not show such a speed-up.
So why does such speed-up exist? Thanks in advance!
For reference, here is the assembly of some loops.
# test_modify:
jmp .L11
.L12:
movq _test_result(%rip), %rdx
addq %rax, %rdx
movq %rdx, _test_result(%rip)
addq $1, %rax
.L11:
cmpq $999999999, %rax
jbe .L12
# test_atom_modify:
jmp .L20
.L21:
lock addq %rax, _test_atom(%rip)
addq $1, %rax
.L20:
cmpq $999999999, %rax
jbe .L21
# test_atom_modify__calc:
jmp .L23
.L24:
lock addq %rax, _test_atom(%rip)
addq %rax, %rdx
addq $1, %rax
.L23:
cmpq $999999999, %rax
jbe .L24
# test_atom_modify__calc2:
jmp .L26
.L27:
lock addq %rdx, _test_atom(%rip)
addq %rdx, %rax
xorq %rdx, %rax
addq %rdx, %rax
xorq %rdx, %rax
addq %rdx, %rax
xorq %rdx, %rax
addq %rdx, %rax
xorq %rdx, %rax
addq $1, %rdx
.L26:
cmpq $999999999, %rdx
jbe .L27
And here is a wrapper program that can be used to measure the times.
#include <stdio.h>
#include <stdatomic.h>
#include <Windows.h>
/* Paste test code here! */
// Runs func once and prints its wall-clock time in milliseconds
// (zero-padded, three decimals), measured with QueryPerformanceCounter.
static VOID bench_time(VOID (*func)(VOID)) {
    // Use LARGE_INTEGER directly instead of punning UINT64* to
    // LARGE_INTEGER* -- same layout in practice, but this is the
    // documented, aliasing-safe way to call the QPC APIs.
    LARGE_INTEGER freq, st, ed;
    // QPF cannot fail on XP or later, but guard anyway so a zero
    // frequency can never cause a division by zero below.
    if (!QueryPerformanceFrequency(&freq) || freq.QuadPart == 0) {
        fprintf(stderr, "QueryPerformanceFrequency failed\n");
        return;
    }
    QueryPerformanceCounter(&st);
    (*func)();
    QueryPerformanceCounter(&ed);
    UINT64 t = (UINT64) (ed.QuadPart - st.QuadPart);
    // %f (not the redundant %lf) for a double; format unchanged otherwise.
    printf("%010.3fms\n", t * 1000.0 / (double) freq.QuadPart);
}
INT main() {
_test_argument = GetCurrentProcessId();
atomic_init(&_test_atom, 0);
bench_time(&test_empty);
bench_time(&test_calc);
bench_time(&test_store);
bench_time(&test_modify);
bench_time(&test_modify__calc);
// bench_time(&test_modify__calc2);
bench_time(&test_atom_modify);
bench_time(&test_atom_modify__calc);
bench_time(&test_atom_modify__calc2);
return 0;
}
Update: For comparison, I just ran these on an Intel Xeon E5-2643 v3 CPU, and its results do not show the strange speed-up.
(Loops are decreased to 1 million times, and sleeps are added between tests, because I cannot fix its frequency.)
000000.835ms test_empty
000000.836ms test_calc
000000.836ms test_store
000006.264ms test_modify
000006.264ms test_modify__calc
000015.926ms test_atom_modify
000015.873ms test_atom_modify__calc
000015.900ms test_atom_modify__calc2