Best way to XOR two 128 bit values in MSVC?

205 views Asked by At

I'm trying to XOR the 128-bit initialization vector (IV) with the plaintext, as seen here:

aes-cbc

On Linux x86-64 with GCC 12.2, there's a one-liner:

// Treats both buffers as single 128-bit integers (unsigned __int128 is a compiler
// extension type) and XORs ivvectext into plaintext in place.
*(unsigned __int128 *)( plaintext ) ^= *(unsigned __int128 *)( ivvectext );

For example, https://godbolt.org/z/sc8e66qeo

#include <stdio.h>
#include <stdint.h>

// Demo: XOR a 16-byte plaintext block with a 16-byte IV in place, then print
// the result as space-separated, uppercase, two-digit hex bytes.
int main()
{
    uint8_t plaintext[16] = {'t','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x'};
    uint8_t ivvectext[16] = {'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w'};

    // Single 128-bit XOR through the unsigned __int128 extension type.
    // NOTE(review): this accesses uint8_t storage through an unsigned __int128
    // lvalue, so it relies on the arrays being suitably aligned and on the
    // compiler tolerating the type pun -- confirm before reusing elsewhere.
    *(unsigned __int128 *)( plaintext ) ^= *(unsigned __int128 *)( ivvectext );

    // size_t index: sizeof yields an unsigned value, so an int counter would
    // make this a signed/unsigned comparison.
    for (size_t i = 0; i < sizeof(plaintext); i++) { printf("%02X ", (unsigned char)plaintext[i]); }

    return 0;
}

Question

In MSVC, what's the preferred method to XOR these 128 bit values?

Update

As noted in one of the answers, the compiler intrinsic _mm_xor_si128 can be used:

#include <stdint.h>
#include <immintrin.h>
#include <iostream>
#include <ios>
#include <iomanip>

// Demo: XOR the 16-byte plaintext with the 16-byte IV using SSE2 intrinsics,
// then print the 16 result bytes as uppercase, zero-padded, two-digit hex.
int main() {

    uint8_t plaintext[16] = { 't','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x' };
    uint8_t ivvectext[16] = { 'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w' };

    // Unaligned 128-bit loads of both byte arrays, combined with one XOR.
    const __m128i result = _mm_xor_si128(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(plaintext)),
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(ivvectext)));

    // Copy the vector out into plain bytes so they can be printed one by one.
    uint8_t result_bytes[16];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(result_bytes), result);

    // uppercase/hex/fill stay set on the stream; only the field width has to
    // be re-applied before every number.
    std::cout << std::uppercase << std::hex << std::setfill('0');
    for (const uint8_t byte : result_bytes) {
        std::cout << std::setw(2) << static_cast<int>(byte) << " ";
    }
    std::cout << std::endl;

    return 0;
}

The output matches the Linux result:

03 09 54 43 1A 0B 10 1A 0F 04 0C 04 1D 17 1A 0F 

However, other answers suggest more readable code

// XOR every plaintext byte with the IV byte at the same index, in place.
// The index is size_t because sizeof yields an unsigned value; plaintext must
// be an array (not a pointer) here for sizeof to give its length.
for (size_t i = 0; i < sizeof(plaintext); i++)
{
    plaintext[i] ^= ivvectext[i];
}

and let the compiler optimizations figure out the internal assembly code. :)

2

There are 2 answers

3
Bodo On BEST ANSWER

If your goal is to optimize your code, then leave this task to the compiler. (Of course you might have to enable optimization.)

You can write a simple loop like

// XOR every plaintext byte with the IV byte at the same index, in place.
// The index is size_t because sizeof yields an unsigned value; plaintext must
// be an array (not a pointer) here for sizeof to give its length.
for (size_t i = 0; i < sizeof(plaintext); i++)
{
    plaintext[i] ^= ivvectext[i];
}

and let the compiler optimize this.

For example, x86 msvc v19.latest with option -O2 creates SSE2 instructions from this loop resulting in a single 128-bit operation.

; Excerpt of the optimized main(): both 16-byte arrays are written to the stack
; from immediate constants and then combined with a single pxor.
_main   PROC                                      ; COMDAT
        ; Reserve 36 bytes of stack for the locals.
        sub     esp, 36                             ; 00000024H
        ; Store ___security_cookie XORed with esp in the __$ArrayPad$ slot
        ; (looks like the stack-buffer overrun guard -- confirm).
        mov     eax, DWORD PTR ___security_cookie
        xor     eax, esp
        mov     DWORD PTR __$ArrayPad$[esp+36], eax
        ; plaintext[16] as four little-endian dwords: "theq" "uick" "brow" "nfox".
        mov     DWORD PTR _plaintext$[esp+36], 1902471284 ; 71656874H
        mov     DWORD PTR _plaintext$[esp+40], 1801677173 ; 6b636975H
        mov     DWORD PTR _plaintext$[esp+44], 2003792482 ; 776f7262H
        mov     DWORD PTR _plaintext$[esp+48], 2020566638 ; 786f666eH
        ; xmm1 = all 16 plaintext bytes (unaligned 128-bit load).
        movups  xmm1, XMMWORD PTR _plaintext$[esp+36]
        ; ivvectext[16] as four little-endian dwords: "wa12" "obsq" "mvcs" "squw".
        mov     DWORD PTR _ivvectext$[esp+36], 842097015 ; 32316177H
        mov     DWORD PTR _ivvectext$[esp+40], 1903387247 ; 7173626fH
        mov     DWORD PTR _ivvectext$[esp+44], 1935898221 ; 7363766dH
        mov     DWORD PTR _ivvectext$[esp+48], 2004185459 ; 77757173H
        ; xmm0 = all 16 IV bytes (unaligned 128-bit load).
        movups  xmm0, XMMWORD PTR _ivvectext$[esp+36]
        ; Save esi and zero it; its later use is outside this excerpt.
        push    esi
        xor     esi, esi
        ; The whole byte loop collapsed into one 128-bit XOR.
        pxor    xmm1, xmm0
        ; Write the result back to plaintext. The offset is +40 rather than +36,
        ; presumably because the push above lowered esp by 4.
        movups  XMMWORD PTR _plaintext$[esp+40], xmm1
...

See https://godbolt.org/z/afTPK5von

Additional hints from comments:

Even if you determine that you need to hand-optimize the code and use the intrinsic functions explicitly (e.g., the optimizer doesn't use them for some reason, sad panda), I recommend also keeping the straightforward implementation as a reference implementation for development & debugging purposes. (Eljay's comment)

Sometimes the MS compiler won't optimize what looks like a simple loop. In this case, you can enable the vectorizer and parallelizer messages, which can give you hints as to why it didn't. (user20716902's comment)

0
user20716902 On

As mentioned in my comment, I would use intrinsics. Here is how to do it in MSVC:

#include <stdint.h>
#include <immintrin.h>
// Demo: XOR the 16-byte plaintext block with the 16-byte IV using SSE2
// intrinsics, leaving the result in plaintext (the equivalent of `^=`).
int main() {

    uint8_t plaintext[16] = { 't','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x' };
    uint8_t ivvectext[16] = { 'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w' };

    // Unaligned loads: the arrays are declared without any alignment specifier.
    const __m128i plain = _mm_loadu_si128(reinterpret_cast<const __m128i*>(plaintext));
    const __m128i ivvec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ivvectext));

    const __m128i xored = _mm_xor_si128(plain, ivvec);

    // Store the result back; without this the XOR value is computed and then
    // discarded, so plaintext would be left unchanged.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(plaintext), xored);

    return 0;
}