The motivation is simple: I iterate over two uint64 arrays, bit-and their values and store the result in third array. However, after I generalized solution to accept input in form of structure, the code slowed down considerably and I found out that generated assembly is fairly different. My question is why is that and how can I prevent this behaviour? (See notes at the end of the question for further details about the motivation.)
The minimized code looks like this:
struct FAndExpressionIter
{
public:
    const uint64* LhsIter = nullptr;
    const uint64* RhsIter = nullptr;
    inline void operator++()
    {
        ++LhsIter;
        ++RhsIter;
    }
    inline uint64 operator*() const
    {
        return (*LhsIter & *RhsIter);
    }
};
void FooBitAndExpression(FAndExpressionIter SrcIter, uint64* ResultIter, const uint64* ResultEnd)
{
    while (ResultIter != ResultEnd)
    {
        *ResultIter = *SrcIter;
        ++SrcIter;
        ++ResultIter;
    }
}
void FooBitAndRaw(const uint64* LhsIter, const uint64* RhsIter, uint64* ResultIter, const uint64* ResultEnd)
{
    while (ResultIter != ResultEnd)
    {
        *ResultIter = (*LhsIter & *RhsIter);
        ++LhsIter;
        ++RhsIter;
        ++ResultIter;
    }
}
The FooBitAndExpression is compiled into:
00007FFA37075910  mov         qword ptr [rsp+8],rbx  
     7:     while (ResultIter != ResultEnd)
00007FFA37075915  xor         r10d,r10d  
00007FFA37075918  mov         rbx,r8  
00007FFA3707591B  sub         rbx,rdx  
00007FFA3707591E  mov         r9,rdx  
00007FFA37075921  add         rbx,7  
00007FFA37075925  mov         r11,rcx  
00007FFA37075928  shr         rbx,3  
00007FFA3707592C  cmp         rdx,r8  
00007FFA3707592F  cmova       rbx,r10  
00007FFA37075933  test        rbx,rbx  
00007FFA37075936  je          FooBitAndExpression+55h (07FFA37075965h)  
00007FFA37075938  mov         rax,qword ptr [rcx+8]  
00007FFA3707593C  nop         dword ptr [rax]  
     8:     {
     9:         *ResultIter = *SrcIter;
00007FFA37075940  mov         rdx,qword ptr [r11]  
    10:         ++SrcIter;
00007FFA37075943  lea         rax,[rax+8]  
    11:         ++ResultIter;
00007FFA37075947  inc         r10  
00007FFA3707594A  lea         r9,[r9+8]  
     8:     {
     9:         *ResultIter = *SrcIter;
00007FFA3707594E  mov         rcx,qword ptr [rdx]  
00007FFA37075951  and         rcx,qword ptr [rax-8]  
00007FFA37075955  mov         qword ptr [r9-8],rcx  
    10:         ++SrcIter;
00007FFA37075959  lea         rcx,[rdx+8]  
00007FFA3707595D  mov         qword ptr [r11],rcx  
     7:     while (ResultIter != ResultEnd)
00007FFA37075960  cmp         r10,rbx  
00007FFA37075963  jne         FooBitAndExpression+30h (07FFA37075940h)  
    12:     }
    13: }
00007FFA37075965  mov         rbx,qword ptr [rsp+8]  
    12:     }
    13: }
00007FFA3707596A  ret  
while FooBitAndRaw is compiled into:
    17:     while (ResultIter != ResultEnd)
00007FFA37075980  xor         r10d,r10d  
00007FFA37075983  mov         r11,r9  
00007FFA37075986  sub         r11,r8  
00007FFA37075989  add         r11,7  
00007FFA3707598D  shr         r11,3  
00007FFA37075991  cmp         r8,r9  
00007FFA37075994  cmova       r11,r10  
00007FFA37075998  test        r11,r11  
00007FFA3707599B  je          FooBitAndRaw+0F7h (07FFA37075A77h)  
00007FFA370759A1  cmp         r11,8  
00007FFA370759A5  jb          FooBitAndRaw+0D2h (07FFA37075A52h)  
00007FFA370759AB  lea         rax,[rdx-8]  
00007FFA370759AF  lea         rax,[rax+r11*8]  
00007FFA370759B3  lea         r9,[r8-8]  
00007FFA370759B7  lea         r9,[r9+r11*8]  
00007FFA370759BB  cmp         r8,rax  
00007FFA370759BE  ja          FooBitAndRaw+49h (07FFA370759C9h)  
00007FFA370759C0  cmp         r9,rdx  
00007FFA370759C3  jae         FooBitAndRaw+0D2h (07FFA37075A52h)  
00007FFA370759C9  lea         rax,[rcx-8]  
00007FFA370759CD  lea         rax,[rax+r11*8]  
00007FFA370759D1  cmp         r8,rax  
00007FFA370759D4  ja          FooBitAndRaw+5Bh (07FFA370759DBh)  
00007FFA370759D6  cmp         r9,rcx  
00007FFA370759D9  jae         FooBitAndRaw+0D2h (07FFA37075A52h)  
00007FFA370759DB  mov         rax,r11  
00007FFA370759DE  and         rax,0FFFFFFFFFFFFFFF8h  
00007FFA370759E2  nop         dword ptr [rax]  
00007FFA370759E6  nop         word ptr [rax+rax]  
    18:     {
    19:         *ResultIter = (*LhsIter & *RhsIter);
00007FFA370759F0  movdqu      xmm0,xmmword ptr [rdx]  
    20:         ++LhsIter;
    21:         ++RhsIter;
    22:         ++ResultIter;
00007FFA370759F4  add         r10,8  
00007FFA370759F8  movdqu      xmm1,xmmword ptr [rcx]  
00007FFA370759FC  pand        xmm1,xmm0  
00007FFA37075A00  movdqu      xmm0,xmmword ptr [rdx+10h]  
00007FFA37075A05  movdqu      xmmword ptr [r8],xmm1  
00007FFA37075A0A  movdqu      xmm1,xmmword ptr [rcx+10h]  
00007FFA37075A0F  pand        xmm1,xmm0  
00007FFA37075A13  movdqu      xmm0,xmmword ptr [rdx+20h]  
00007FFA37075A18  movdqu      xmmword ptr [r8+10h],xmm1  
00007FFA37075A1E  movdqu      xmm1,xmmword ptr [rcx+20h]  
00007FFA37075A23  pand        xmm1,xmm0  
00007FFA37075A27  movdqu      xmm0,xmmword ptr [rdx+30h]  
00007FFA37075A2C  add         rdx,40h  
00007FFA37075A30  movdqu      xmmword ptr [r8+20h],xmm1  
00007FFA37075A36  movdqu      xmm1,xmmword ptr [rcx+30h]  
00007FFA37075A3B  add         rcx,40h  
00007FFA37075A3F  pand        xmm1,xmm0  
00007FFA37075A43  movdqu      xmmword ptr [r8+30h],xmm1  
00007FFA37075A49  add         r8,40h  
00007FFA37075A4D  cmp         r10,rax  
00007FFA37075A50  jne         FooBitAndRaw+70h (07FFA370759F0h)  
    17:     while (ResultIter != ResultEnd)
00007FFA37075A52  cmp         r10,r11  
00007FFA37075A55  je          FooBitAndRaw+0F7h (07FFA37075A77h)  
00007FFA37075A57  sub         rcx,rdx  
    17:     while (ResultIter != ResultEnd)
00007FFA37075A5A  sub         r8,rdx  
00007FFA37075A5D  nop         dword ptr [rax]  
    18:     {
    19:         *ResultIter = (*LhsIter & *RhsIter);
00007FFA37075A60  mov         rax,qword ptr [rcx+rdx]  
    20:         ++LhsIter;
    21:         ++RhsIter;
    22:         ++ResultIter;
00007FFA37075A64  inc         r10  
00007FFA37075A67  and         rax,qword ptr [rdx]  
00007FFA37075A6A  mov         qword ptr [r8+rdx],rax  
00007FFA37075A6E  lea         rdx,[rdx+8]  
00007FFA37075A72  cmp         r10,r11  
00007FFA37075A75  jne         FooBitAndRaw+0E0h (07FFA37075A60h)  
    23:     }
    24: }
00007FFA37075A77  ret  
Please note that my knowledge of assembler is really limited and my conclusion is based on guess. If I understand the generated assembly correctly, the FooBitAndRaw function loops over iterated data using sse instructions. When remaining space is too small, it falls back to iteration over single value.
The FooBitAndExpression function, however, does no such thing and iterates over single values immediately. To my understanding, both implementations are equal in high-level instructions except two method calls which are inlined.
I already found out that C#'s JIT won't use registers for variables in local structure variables used for iteration which slows down the application considerably. Obviously, MS Visual C++ uses registers in similar case, but it leads me to a sub-question: is there similar limitations which disallows using sse optimizations if variables are stored in local structure?
Futher details and notes:
- I believe this is not a premature optimization. Originally, I simply stored boolean values as a member variable of completely different structure and simply iterated over array of these structures when I wanted run a logical operation over these fields. It was designated for certain number of elements. However, the input increased by degree of magnitude and code was extremely slow. So as an optimization, I implemented bit-field based structure for storing certain boolean values. Most of the system I work on is O(N), however the part using these bitfield tricks is O(M*N) so even slight change affects performance considerably.
- I noticed this behaviour when I tried to optimize the implementation by unrolling loops. The FooBitAndExpressionwas faster, whileFooBitAndRawslowed down considerably which was unexpected so I investigated - and found this.
- What's my primary motivation/objective: after profiling, I found weakest parts of the code and optimized them (e.g. by implementing FooBitAndRaw). However, the code became much less readable so now I want to return some of the readability while not affecting performance. I intend to encapsulate some source data and operations in structures. Currently, the code which uses optimized bit operations must be commented per line of code to describe intend, because it's not that clear from function calls which takes 6+ arguments.
- This is just an example. I also implement similar functions for logic or, xor, certain kind of comparisons, etc. That's why I implemented FooBitAndExpressionfunction. The next step would be to change it to template to support various kinds of input expressions.
- I use Visual Studio C++ 2017 on Windows 10.