27 #if (CRYPTOPP_AESNI_AVAILABLE)
29 # include <emmintrin.h>
30 # include <smmintrin.h>
31 # include <wmmintrin.h>
34 #if (CRYPTOPP_ARM_NEON_HEADER)
36 # include <arm_neon.h>
39 #if (CRYPTOPP_ARM_ACLE_HEADER)
42 # include <arm_acle.h>
49 #if defined(CRYPTOPP_POWER8_AES_AVAILABLE)
54 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
59 #ifndef EXCEPTION_EXECUTE_HANDLER
60 # define EXCEPTION_EXECUTE_HANDLER 1
64 #define M128_CAST(x) ((__m128i *)(void *)(x))
65 #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
// Squash MS LNK4221 and libtool warnings; also used by assertion messages to
// identify this translation unit.
extern const char RIJNDAEL_SIMD_FNAME[] = __FILE__;
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY

// SIGILL-based feature probing for compilers without MS-style __try/__except.
// The handler longjmp's out of the probe when an AES instruction faults with
// an illegal-instruction signal.
extern "C" {
    typedef void (*SigHandler)(int);

    static jmp_buf s_jmpSIGILL;
    static void SigIllHandler(int)
    {
        longjmp(s_jmpSIGILL, 1);
    }
}
#endif  // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)

// Probe for ARMv8 AES instruction availability by executing an AES round and
// trapping the illegal-instruction fault if the CPU lacks the crypto extension.
// Returns true only when the instructions executed successfully.
bool CPU_ProbeAES()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_ARM_AES_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
    volatile bool result = true;
    __try
    {
        uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0);
        uint8x16_t r1 = vaeseq_u8(data, key);
        uint8x16_t r2 = vaesdq_u8(data, key);
        r1 = vaesmcq_u8(r1);
        r2 = vaesimcq_u8(r2);

        // Use the results so the optimizer cannot elide the probe.
        result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7));
    }
    __except (EXCEPTION_EXECUTE_HANDLER)
    {
        return false;
    }
    return result;
# else

    // longjmp and clobber warnings. Volatile is required.
    volatile bool result = true;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
    {
        signal(SIGILL, oldHandler);
        return false;
    }

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0);
        uint8x16_t r1 = vaeseq_u8(data, key);
        uint8x16_t r2 = vaesdq_u8(data, key);
        r1 = vaesmcq_u8(r1);
        r2 = vaesimcq_u8(r2);

        // Use the results so the optimizer cannot elide the probe.
        result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7));
    }

    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
# endif
#else
    return false;
#endif  // CRYPTOPP_ARM_AES_AVAILABLE
}

#endif  // ARM32 or ARM64
153 #if (CRYPTOPP_ARM_AES_AVAILABLE)
155 ANONYMOUS_NAMESPACE_BEGIN
157 inline void ARMV8_Enc_Block(uint64x2_t &data,
const word32 *subkeys,
unsigned int rounds)
160 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
161 uint8x16_t block = vreinterpretq_u8_u64(data);
164 block = vaeseq_u8(block, vld1q_u8(keys+0*16));
166 block = vaesmcq_u8(block);
168 for (
unsigned int i=1; i<rounds-1; i+=2)
171 block = vaeseq_u8(block, vld1q_u8(keys+i*16));
173 block = vaesmcq_u8(block);
175 block = vaeseq_u8(block, vld1q_u8(keys+(i+1)*16));
177 block = vaesmcq_u8(block);
181 block = vaeseq_u8(block, vld1q_u8(keys+(rounds-1)*16));
183 block = veorq_u8(block, vld1q_u8(keys+rounds*16));
185 data = vreinterpretq_u64_u8(block);
188 inline void ARMV8_Enc_6_Blocks(uint64x2_t &data0, uint64x2_t &data1,
189 uint64x2_t &data2, uint64x2_t &data3, uint64x2_t &data4, uint64x2_t &data5,
190 const word32 *subkeys,
unsigned int rounds)
193 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
195 uint8x16_t block0 = vreinterpretq_u8_u64(data0);
196 uint8x16_t block1 = vreinterpretq_u8_u64(data1);
197 uint8x16_t block2 = vreinterpretq_u8_u64(data2);
198 uint8x16_t block3 = vreinterpretq_u8_u64(data3);
199 uint8x16_t block4 = vreinterpretq_u8_u64(data4);
200 uint8x16_t block5 = vreinterpretq_u8_u64(data5);
203 for (
unsigned int i=0; i<rounds-1; ++i)
205 key = vld1q_u8(keys+i*16);
207 block0 = vaeseq_u8(block0, key);
209 block0 = vaesmcq_u8(block0);
211 block1 = vaeseq_u8(block1, key);
213 block1 = vaesmcq_u8(block1);
215 block2 = vaeseq_u8(block2, key);
217 block2 = vaesmcq_u8(block2);
219 block3 = vaeseq_u8(block3, key);
221 block3 = vaesmcq_u8(block3);
223 block4 = vaeseq_u8(block4, key);
225 block4 = vaesmcq_u8(block4);
227 block5 = vaeseq_u8(block5, key);
229 block5 = vaesmcq_u8(block5);
233 key = vld1q_u8(keys+(rounds-1)*16);
234 block0 = vaeseq_u8(block0, key);
235 block1 = vaeseq_u8(block1, key);
236 block2 = vaeseq_u8(block2, key);
237 block3 = vaeseq_u8(block3, key);
238 block4 = vaeseq_u8(block4, key);
239 block5 = vaeseq_u8(block5, key);
242 key = vld1q_u8(keys+rounds*16);
243 data0 = vreinterpretq_u64_u8(veorq_u8(block0, key));
244 data1 = vreinterpretq_u64_u8(veorq_u8(block1, key));
245 data2 = vreinterpretq_u64_u8(veorq_u8(block2, key));
246 data3 = vreinterpretq_u64_u8(veorq_u8(block3, key));
247 data4 = vreinterpretq_u64_u8(veorq_u8(block4, key));
248 data5 = vreinterpretq_u64_u8(veorq_u8(block5, key));
251 inline void ARMV8_Dec_Block(uint64x2_t &data,
const word32 *subkeys,
unsigned int rounds)
254 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
255 uint8x16_t block = vreinterpretq_u8_u64(data);
258 block = vaesdq_u8(block, vld1q_u8(keys+0*16));
260 block = vaesimcq_u8(block);
262 for (
unsigned int i=1; i<rounds-1; i+=2)
265 block = vaesdq_u8(block, vld1q_u8(keys+i*16));
267 block = vaesimcq_u8(block);
269 block = vaesdq_u8(block, vld1q_u8(keys+(i+1)*16));
271 block = vaesimcq_u8(block);
275 block = vaesdq_u8(block, vld1q_u8(keys+(rounds-1)*16));
277 block = veorq_u8(block, vld1q_u8(keys+rounds*16));
279 data = vreinterpretq_u64_u8(block);
282 inline void ARMV8_Dec_6_Blocks(uint64x2_t &data0, uint64x2_t &data1,
283 uint64x2_t &data2, uint64x2_t &data3, uint64x2_t &data4, uint64x2_t &data5,
284 const word32 *subkeys,
unsigned int rounds)
287 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
289 uint8x16_t block0 = vreinterpretq_u8_u64(data0);
290 uint8x16_t block1 = vreinterpretq_u8_u64(data1);
291 uint8x16_t block2 = vreinterpretq_u8_u64(data2);
292 uint8x16_t block3 = vreinterpretq_u8_u64(data3);
293 uint8x16_t block4 = vreinterpretq_u8_u64(data4);
294 uint8x16_t block5 = vreinterpretq_u8_u64(data5);
297 for (
unsigned int i=0; i<rounds-1; ++i)
299 key = vld1q_u8(keys+i*16);
301 block0 = vaesdq_u8(block0, key);
303 block0 = vaesimcq_u8(block0);
305 block1 = vaesdq_u8(block1, key);
307 block1 = vaesimcq_u8(block1);
309 block2 = vaesdq_u8(block2, key);
311 block2 = vaesimcq_u8(block2);
313 block3 = vaesdq_u8(block3, key);
315 block3 = vaesimcq_u8(block3);
317 block4 = vaesdq_u8(block4, key);
319 block4 = vaesimcq_u8(block4);
321 block5 = vaesdq_u8(block5, key);
323 block5 = vaesimcq_u8(block5);
327 key = vld1q_u8(keys+(rounds-1)*16);
328 block0 = vaesdq_u8(block0, key);
329 block1 = vaesdq_u8(block1, key);
330 block2 = vaesdq_u8(block2, key);
331 block3 = vaesdq_u8(block3, key);
332 block4 = vaesdq_u8(block4, key);
333 block5 = vaesdq_u8(block5, key);
336 key = vld1q_u8(keys+rounds*16);
337 data0 = vreinterpretq_u64_u8(veorq_u8(block0, key));
338 data1 = vreinterpretq_u64_u8(veorq_u8(block1, key));
339 data2 = vreinterpretq_u64_u8(veorq_u8(block2, key));
340 data3 = vreinterpretq_u64_u8(veorq_u8(block3, key));
341 data4 = vreinterpretq_u64_u8(veorq_u8(block4, key));
342 data5 = vreinterpretq_u64_u8(veorq_u8(block5, key));
345 ANONYMOUS_NAMESPACE_END
347 size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(
const word32 *subKeys,
size_t rounds,
348 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length, word32 flags)
351 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
354 size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(
const word32 *subKeys,
size_t rounds,
355 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length, word32 flags)
358 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
361 #endif // CRYPTOPP_ARM_AES_AVAILABLE
365 #if (CRYPTOPP_AESNI_AVAILABLE)
367 ANONYMOUS_NAMESPACE_BEGIN
370 CRYPTOPP_ALIGN_DATA(16)
371 const word32 s_rconLE[] = {
372 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
375 inline void AESNI_Enc_Block(__m128i &block,
MAYBE_CONST word32 *subkeys,
unsigned int rounds)
377 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
379 block = _mm_xor_si128(block, skeys[0]);
380 for (
unsigned int i=1; i<rounds-1; i+=2)
382 block = _mm_aesenc_si128(block, skeys[i]);
383 block = _mm_aesenc_si128(block, skeys[i+1]);
385 block = _mm_aesenc_si128(block, skeys[rounds-1]);
386 block = _mm_aesenclast_si128(block, skeys[rounds]);
389 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
392 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
394 __m128i rk = skeys[0];
395 block0 = _mm_xor_si128(block0, rk);
396 block1 = _mm_xor_si128(block1, rk);
397 block2 = _mm_xor_si128(block2, rk);
398 block3 = _mm_xor_si128(block3, rk);
399 for (
unsigned int i=1; i<rounds; i++)
402 block0 = _mm_aesenc_si128(block0, rk);
403 block1 = _mm_aesenc_si128(block1, rk);
404 block2 = _mm_aesenc_si128(block2, rk);
405 block3 = _mm_aesenc_si128(block3, rk);
408 block0 = _mm_aesenclast_si128(block0, rk);
409 block1 = _mm_aesenclast_si128(block1, rk);
410 block2 = _mm_aesenclast_si128(block2, rk);
411 block3 = _mm_aesenclast_si128(block3, rk);
414 inline void AESNI_Dec_Block(__m128i &block,
MAYBE_CONST word32 *subkeys,
unsigned int rounds)
416 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
418 block = _mm_xor_si128(block, skeys[0]);
419 for (
unsigned int i=1; i<rounds-1; i+=2)
421 block = _mm_aesdec_si128(block, skeys[i]);
422 block = _mm_aesdec_si128(block, skeys[i+1]);
424 block = _mm_aesdec_si128(block, skeys[rounds-1]);
425 block = _mm_aesdeclast_si128(block, skeys[rounds]);
428 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
431 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
433 __m128i rk = skeys[0];
434 block0 = _mm_xor_si128(block0, rk);
435 block1 = _mm_xor_si128(block1, rk);
436 block2 = _mm_xor_si128(block2, rk);
437 block3 = _mm_xor_si128(block3, rk);
438 for (
unsigned int i=1; i<rounds; i++)
441 block0 = _mm_aesdec_si128(block0, rk);
442 block1 = _mm_aesdec_si128(block1, rk);
443 block2 = _mm_aesdec_si128(block2, rk);
444 block3 = _mm_aesdec_si128(block3, rk);
447 block0 = _mm_aesdeclast_si128(block0, rk);
448 block1 = _mm_aesdeclast_si128(block1, rk);
449 block2 = _mm_aesdeclast_si128(block2, rk);
450 block3 = _mm_aesdeclast_si128(block3, rk);
453 ANONYMOUS_NAMESPACE_END
455 void Rijndael_UncheckedSetKey_SSE4_AESNI(
const byte *userKey,
size_t keyLen, word32 *rk)
457 const size_t rounds = keyLen / 4 + 6;
458 const word32 *rc = s_rconLE;
460 __m128i temp = _mm_loadu_si128(
M128_CAST(userKey+keyLen-16));
461 std::memcpy(rk, userKey, keyLen);
464 const size_t keySize = 4*(rounds+1);
465 const word32* end = rk + keySize;
469 rk[keyLen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
470 rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
471 rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
472 rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
474 if (rk + keyLen/4 + 4 == end)
479 rk[10] = rk[ 4] ^ rk[ 9];
480 rk[11] = rk[ 5] ^ rk[10];
481 temp = _mm_insert_epi32(temp, rk[11], 3);
483 else if (keyLen == 32)
485 temp = _mm_insert_epi32(temp, rk[11], 3);
486 rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
487 rk[13] = rk[ 5] ^ rk[12];
488 rk[14] = rk[ 6] ^ rk[13];
489 rk[15] = rk[ 7] ^ rk[14];
490 temp = _mm_insert_epi32(temp, rk[15], 3);
494 temp = _mm_insert_epi32(temp, rk[7], 3);
501 void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key,
unsigned int rounds)
508 for (i = 4, j = 4*rounds-4; i < j; i += 4, j -= 4)
510 temp = _mm_aesimc_si128(*
M128_CAST(key+i));
518 size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(
const word32 *subKeys,
size_t rounds,
519 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length, word32 flags)
527 sk, rounds, ib, xb, outBlocks, length, flags);
530 size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(
const word32 *subKeys,
size_t rounds,
531 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length, word32 flags)
538 sk, rounds, ib, xb, outBlocks, length, flags);
541 #endif // CRYPTOPP_AESNI_AVAILABLE
545 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
547 ANONYMOUS_NAMESPACE_BEGIN
550 CRYPTOPP_ALIGN_DATA(16)
551 static const uint32_t s_rconBE[] = {
552 0x01000000, 0x02000000, 0x04000000, 0x08000000,
553 0x10000000, 0x20000000, 0x40000000, 0x80000000,
554 0x1B000000, 0x36000000
557 inline void POWER8_Enc_Block(
uint32x4_p &block,
const word32 *subkeys,
unsigned int rounds)
560 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
565 for (
size_t i=1; i<rounds-1; i+=2)
577 uint32x4_p &block5,
const word32 *subkeys,
unsigned int rounds)
580 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
583 block0 =
VecXor(block0, k);
584 block1 =
VecXor(block1, k);
585 block2 =
VecXor(block2, k);
586 block3 =
VecXor(block3, k);
587 block4 =
VecXor(block4, k);
588 block5 =
VecXor(block5, k);
590 for (
size_t i=1; i<rounds; ++i)
610 inline void POWER8_Dec_Block(
uint32x4_p &block,
const word32 *subkeys,
unsigned int rounds)
613 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
618 for (
size_t i=rounds-1; i>1; i-=2)
630 uint32x4_p &block5,
const word32 *subkeys,
unsigned int rounds)
633 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
636 block0 =
VecXor(block0, k);
637 block1 =
VecXor(block1, k);
638 block2 =
VecXor(block2, k);
639 block3 =
VecXor(block3, k);
640 block4 =
VecXor(block4, k);
641 block5 =
VecXor(block5, k);
643 for (
size_t i=rounds-1; i>0; --i)
663 ANONYMOUS_NAMESPACE_END
665 void Rijndael_UncheckedSetKey_POWER8(
const byte* userKey,
size_t keyLen, word32* rk,
const byte* Se)
667 const size_t rounds = keyLen / 4 + 6;
668 const word32 *rc = s_rconBE;
669 word32 *rkey = rk, temp;
674 const size_t keySize = 4*(rounds+1);
675 const word32* end = rkey + keySize;
679 temp = rkey[keyLen/4-1];
680 word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
681 (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
682 rkey[keyLen/4] = rkey[0] ^ x ^ *(rc++);
683 rkey[keyLen/4+1] = rkey[1] ^ rkey[keyLen/4];
684 rkey[keyLen/4+2] = rkey[2] ^ rkey[keyLen/4+1];
685 rkey[keyLen/4+3] = rkey[3] ^ rkey[keyLen/4+2];
687 if (rkey + keyLen/4 + 4 == end)
692 rkey[10] = rkey[ 4] ^ rkey[ 9];
693 rkey[11] = rkey[ 5] ^ rkey[10];
695 else if (keyLen == 32)
698 rkey[12] = rkey[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
699 rkey[13] = rkey[ 5] ^ rkey[12];
700 rkey[14] = rkey[ 6] ^ rkey[13];
701 rkey[15] = rkey[ 7] ^ rkey[14];
706 #if (CRYPTOPP_LITTLE_ENDIAN)
708 const uint8x16_p mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
711 for (i=0; i<rounds; i+=2, rkey+=8)
717 for ( ; i<rounds+1; i++, rkey+=4)
722 size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(
const word32 *subKeys,
size_t rounds,
723 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length, word32 flags)
726 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
729 size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(
const word32 *subKeys,
size_t rounds,
730 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length, word32 flags)
733 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
736 #endif // CRYPTOPP_POWER8_AES_AVAILABLE