I'm porting an application that uses AES encryption and decryption instructions to randomize some data from x86 to POWER8. I hit a wall with the _mm_aesdec_si128 instruction: it seems to do something different from the equivalent IBM __builtin_crypto_vncipher. The documentation at https://link.springer.com/content/pdf/10.1007/978-3-642-03317-9_4.pdf, pages 52-54, mentions that it follows FIPS 197. The IBM documentation at https://ibm.ent.box.com/s/jd5w15gz301s5b5dt375mshpq9c3lh4u, page 305, also says that it follows FIPS 197; the only difference is that the order of InvMixColumns and the XOR with the round key is flipped, but does that change the result?

How can they both say they follow the specification if the results are different?

The following C program works fine on x86, but outputs the wrong result for aesdec on ppc64. Thankfully, aesenc on ppc64 works as expected.

For now I solved the problem by using a software implementation of aesdec, but I want to do everything in hardware.

C program:

//compile with "gcc -maes aestest.c -o aestest" in x86
//compile with "gcc -mcrypto -flax-vector-conversions aestest.c -o aestest" in power8

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#ifdef __x86_64__
#include <x86intrin.h>
// x86: apply one AES encryption round to d with round key k (AES-NI).
__m128i aesenc(__m128i d,__m128i k){
  __m128i round_out = _mm_aesenc_si128(d,k);
  return round_out;
}
// x86: apply one AES decryption round to d with round key k (AES-NI).
__m128i aesdec(__m128i d,__m128i k){
  __m128i round_out = _mm_aesdec_si128(d,k);
  return round_out;
}
#endif

#ifdef __PPC64__
#include <endian.h>
#include <altivec.h>
#undef vector
#undef pixel
#undef bool
typedef __vector uint8_t __m128i;

// Put the bytes of v into big-endian order: v is returned untouched on a
// big-endian build and byte-reversed on any other build.
__m128i vrev(__m128i v){
  #if __BYTE_ORDER != __BIG_ENDIAN
  __m128i zero = (__m128i){0};
  // permute selector listing the byte indices from 15 down to 0
  __m128i reversed = (__m128i){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
  v = vec_perm(v,zero,reversed);
  #endif
  return v;
}
// ppc64: one AES encryption round through the vcipher builtin. Both operands
// go through vrev() on the way in, and the result goes through it on the way out.
__m128i aesenc(__m128i d,__m128i k){
  __m128i state = vrev(d);
  __m128i round_key = vrev(k);
  return vrev(__builtin_crypto_vcipher(state,round_key));
}
// ppc64: one AES decryption round through the vncipher builtin, with both
// operands and the result passed through vrev().
// NOTE(review): this can give a different result from _mm_aesdec_si128 (see
// the "key test" case). The two differ in the order of InvMixColumns and the
// round-key XOR: vncipher XORs k in the middle of the round, while the x86
// instruction XORs it at the end. To match x86, call vncipher with an
// all-zero key and XOR k into the result afterwards.
__m128i aesdec(__m128i d,__m128i k){
  return vrev(__builtin_crypto_vncipher(vrev(d),vrev(k)));
}
#endif

// Print msg, then ": ", then the 16 bytes of v in memory order as two-digit
// lowercase hex, followed by a newline.
void print_m128(char* msg,  __m128i v){
   const uint8_t* bytes = (const uint8_t*)&v;
   printf("%s: %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
          msg,
          bytes[0], bytes[1], bytes[2], bytes[3],
          bytes[4], bytes[5], bytes[6], bytes[7],
          bytes[8], bytes[9], bytes[10],bytes[11],
          bytes[12],bytes[13],bytes[14],bytes[15]);
}


int main(int argc,char* argv[]){
  uint8_t msg[] = "0123456789abcde"; 
  uint8_t key1[] = {255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255};
  uint8_t key2[] = {0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff};
  uint8_t* c;
  __m128i xmm1 = (__m128i){0};
  __m128i xmm2 = (__m128i){0};
  __m128i encR = (__m128i){0};
  __m128i decR = (__m128i){0};
//zero test
  printf("zero test\n");
  print_m128("xmm1",xmm1);
  print_m128("xmm2",xmm2);
  encR = aesenc(xmm1,xmm2);
  decR = aesdec(xmm1,xmm2);
  print_m128("enc ",encR);
  print_m128("dec ",decR);
//zero key test
  printf("zero key test\n");
  c = (uint8_t*)&xmm1;
  memcpy(c,msg,16);
  print_m128("xmm1",xmm1);
  print_m128("xmm2",xmm2);
  encR = aesenc(xmm1,xmm2);
  decR = aesdec(xmm1,xmm2);
  print_m128("enc ",encR);
  print_m128("dec ",decR);
//ff key test
  printf("ff key test\n");
  c = (uint8_t*)&xmm1;
  memcpy(c,msg,16);
  c = (uint8_t*)&xmm2;
  memcpy(c,key1,16);
  print_m128("xmm1",xmm1);
  print_m128("xmm2",xmm2);
  encR = aesenc(xmm1,xmm2);
  decR = aesdec(xmm1,xmm2);
  print_m128("enc ",encR);
  print_m128("dec ",decR);
//key test
  printf("key test\n");
  c = (uint8_t*)&xmm1;
  memcpy(c,msg,16);
  c = (uint8_t*)&xmm2;
  memcpy(c,key2,16);
  print_m128("xmm1",xmm1);
  print_m128("xmm2",xmm2);
  encR = aesenc(xmm1,xmm2);
  decR = aesdec(xmm1,xmm2);
  print_m128("enc ",encR);
  print_m128("dec ",decR);
}
#Results in x86:

zero test
xmm1: 00000000000000000000000000000000
xmm2: 00000000000000000000000000000000
enc : 63636363636363636363636363636363
dec : 52525252525252525252525252525252
zero key test
xmm1: 30313233343536373839616263646500
xmm2: 00000000000000000000000000000000
enc : 257af2b38828ceea727eb74610cbd39b
dec : a903befadbaa6d0dc8b9a78af780e18f
ff key test
xmm1: 30313233343536373839616263646500
xmm2: ffffffffffffffffffffffffffffffff
enc : da850d4c77d731158d8148b9ef342c64
dec : 56fc4105245592f237465875087f1e70
key test
xmm1: 30313233343536373839616263646500
xmm2: 00112233445566778899aabbccddeeff
enc : 256bd080cc7da89dfae71dfddc163d64
dec : a9129cc99fff0b7a40200d313b5d0f70

#Results in ppc64:

zero test
xmm1: 00000000000000000000000000000000
xmm2: 00000000000000000000000000000000
enc : 63636363636363636363636363636363
dec : 52525252525252525252525252525252
zero key test
xmm1: 30313233343536373839616263646500
xmm2: 00000000000000000000000000000000
enc : 257af2b38828ceea727eb74610cbd39b
dec : a903befadbaa6d0dc8b9a78af780e18f
ff key test
xmm1: 30313233343536373839616263646500
xmm2: ffffffffffffffffffffffffffffffff
enc : da850d4c77d731158d8148b9ef342c64
dec : 56fc4105245592f237465875087f1e70
key test
xmm1: 30313233343536373839616263646500
xmm2: 00112233445566778899aabbccddeeff
enc : 256bd080cc7da89dfae71dfddc163d64
dec : 03fc36273511a194eacea7df91b3a59e

As seen, the last test fails.

1 Answer

2
Community On Best Solutions

The solution was to pass an all-zero key to vncipher, so that the XOR step in the middle of the round becomes the identity, and then to XOR the result with the real key at the end.

// ppc64 decryption round that XORs the round key at the end: vncipher is run
// with an all-zero key, so its own XOR step changes nothing, and k is XORed
// into the result afterwards.
__m128i aesd(__m128i d,__m128i k){
  __m128i zero_key = (__m128i){0};
  __m128i state = vrev(d);
  __m128i round_out = vrev(__builtin_crypto_vncipher(state,zero_key));
  return vec_xor(round_out,k);
}