Diffstat (limited to 'security/nss/lib/freebl/aes-armv8.c')
-rw-r--r--   security/nss/lib/freebl/aes-armv8.c   1169
1 file changed, 1169 insertions, 0 deletions
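For orientation before the diff itself: every function in the new file repeats the same ARMv8 Crypto Extension round sequence, AESE (AddRoundKey, SubBytes, ShiftRows) followed by AESMC (MixColumns) for all rounds but the last, then a final AESE and a plain EOR with the last round key (AESD/AESIMC in reverse order for decryption). The sketch below shows that pattern for a single AES-128 block in isolation; the helper name and the assumption that the schedule is laid out as 11 consecutive 16-byte round keys (the cx->k.expandedKey layout the patch indexes at key, key + 16, ..., key + 160) are for illustration only, and the translation unit must be built with the ARM crypto extensions enabled, as the patch's feature guard requires.

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative sketch only: encrypt one 16-byte block with an expanded
 * AES-128 schedule laid out as 11 consecutive 16-byte round keys,
 * mirroring the round sequence used throughout aes-armv8.c. */
static void
aes128_encrypt_block_sketch(const uint8_t *rk, const uint8_t in[16], uint8_t out[16])
{
    uint8x16_t st = vld1q_u8(in);
    int i;

    /* Rounds 1-9: AESE (AddRoundKey + SubBytes + ShiftRows), then AESMC. */
    for (i = 0; i < 9; i++) {
        st = vaeseq_u8(st, vld1q_u8(rk + 16 * i));
        st = vaesmcq_u8(st);
    }
    /* Round 10: AESE with the tenth round key, no MixColumns ... */
    st = vaeseq_u8(st, vld1q_u8(rk + 144));
    /* ... and a final AddRoundKey (EOR) with the eleventh. */
    st = veorq_u8(st, vld1q_u8(rk + 160));

    vst1q_u8(out, st);
}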
diff --git a/security/nss/lib/freebl/aes-armv8.c b/security/nss/lib/freebl/aes-armv8.c new file mode 100644 index 000000000..8213272f5 --- /dev/null +++ b/security/nss/lib/freebl/aes-armv8.c @@ -0,0 +1,1169 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "secerr.h" +#include "rijndael.h" + +#if ((defined(__clang__) || \ + (defined(__GNUC__) && defined(__GNUC_MINOR__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)))) && \ + (defined(__ARM_NEON) || defined(__ARM_NEON__))) + +#ifndef __ARM_FEATURE_CRYPTO +#error "Compiler option is invalid" +#endif + +#include <arm_neon.h> + +SECStatus +arm_aes_encrypt_ecb_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11; + const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + /* AddRoundKey */ + state = veorq_u8(state, key11); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_ecb_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11; + const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey; + + if (inputLen == 0) { + return SECSuccess; + } + + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = 
vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + + return SECSuccess; +} + +SECStatus +arm_aes_encrypt_cbc_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11; + uint8x16_t iv; + const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(cx->iv); + + /* expanedKey */ + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + state = veorq_u8(state, iv); + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + /* AddRoundKey */ + state = veorq_u8(state, key11); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); 
+#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + iv = state; + } + vst1q_u8(cx->iv, iv); + + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_cbc_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t iv; + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11; + const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(cx->iv); + + /* expanedKey */ + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + + while (inputLen > 0) { + uint8x16_t state, old_state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + old_state = state; + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + + state = veorq_u8(state, iv); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + + iv = old_state; + } + vst1q_u8(cx->iv, iv); + + return SECSuccess; +} + +SECStatus +arm_aes_encrypt_ecb_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13; + PRUint8 *key = (PRUint8 *)cx->k.expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + key12 = vld1q_u8(key + 176); + key13 = vld1q_u8(key + 192); + + while (inputLen > 0) { + 
uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key11); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key12); + /* AddRoundKey */ + state = veorq_u8(state, key13); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_ecb_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13; + const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + key12 = vld1q_u8(key + 176); + key13 = vld1q_u8(key + 192); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key13); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key12); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); 
+#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + + return SECSuccess; +} + +SECStatus +arm_aes_encrypt_cbc_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13; + uint8x16_t iv; + PRUint8 *key = (PRUint8 *)cx->k.expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(cx->iv); + + /* expanedKey */ + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + key12 = vld1q_u8(key + 176); + key13 = vld1q_u8(key + 192); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + state = veorq_u8(state, iv); + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key11); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key12); + state = veorq_u8(state, key13); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + iv = state; + } + vst1q_u8(cx->iv, iv); + + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_cbc_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t iv; + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13; + const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(cx->iv); + + /* expanedKey */ + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = 
vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + key12 = vld1q_u8(key + 176); + key13 = vld1q_u8(key + 192); + + while (inputLen > 0) { + uint8x16_t state, old_state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + old_state = state; + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key13); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key12); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + + state = veorq_u8(state, iv); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + + iv = old_state; + } + vst1q_u8(cx->iv, iv); + + return SECSuccess; +} + +SECStatus +arm_aes_encrypt_ecb_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13, key14, key15; + PRUint8 *key = (PRUint8 *)cx->k.expandedKey; + + if (inputLen == 0) { + return SECSuccess; + } + + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + key12 = vld1q_u8(key + 176); + key13 = vld1q_u8(key + 192); + key14 = vld1q_u8(key + 208); + key15 = vld1q_u8(key + 224); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); 
+ state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key11); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key12); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key13); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key14); + /* AddRoundKey */ + state = veorq_u8(state, key15); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_ecb_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13, key14, key15; + const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + key12 = vld1q_u8(key + 176); + key13 = vld1q_u8(key + 192); + key14 = vld1q_u8(key + 208); + key15 = vld1q_u8(key + 224); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key15); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key14); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key13); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key12); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + + return SECSuccess; +} + +SECStatus +arm_aes_encrypt_cbc_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + 
pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13, key14, key15; + uint8x16_t iv; + const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(cx->iv); + + /* expanedKey */ + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + key12 = vld1q_u8(key + 176); + key13 = vld1q_u8(key + 192); + key14 = vld1q_u8(key + 208); + key15 = vld1q_u8(key + 224); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + state = veorq_u8(state, iv); + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key11); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key12); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key13); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key14); + /* AddRoundKey */ + state = veorq_u8(state, key15); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + iv = state; + } + vst1q_u8(cx->iv, iv); + + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_cbc_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t iv; + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13, key14, key15; + const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(cx->iv); + + /* expanedKey */ + key1 = vld1q_u8(key); + key2 = vld1q_u8(key + 16); + key3 = vld1q_u8(key + 32); + key4 = vld1q_u8(key + 48); + key5 = vld1q_u8(key + 64); + key6 = vld1q_u8(key + 80); + key7 = vld1q_u8(key + 96); + key8 = vld1q_u8(key + 112); + key9 = vld1q_u8(key + 128); + key10 = vld1q_u8(key + 144); + key11 = vld1q_u8(key + 160); + key12 = vld1q_u8(key + 176); + key13 = vld1q_u8(key + 192); + key14 = vld1q_u8(key + 208); + key15 = vld1q_u8(key + 224); + + while (inputLen > 0) { + uint8x16_t state, 
old_state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + old_state = state; + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key15); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key14); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key13); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key12); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + + state = veorq_u8(state, iv); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + + iv = old_state; + } + vst1q_u8(cx->iv, iv); + + return SECSuccess; +} + +#endif |
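A second detail that recurs in every function above: when HAVE_UNALIGNED_ACCESS is not defined, loads and stores are staged through a 16-byte stack buffer (declared with NSS's pre_align/post_align macros) whenever the pointer is not 8-byte aligned. A minimal sketch of the load side, assuming a GCC/Clang aligned attribute in place of those macros and a hypothetical helper name:

#include <arm_neon.h>
#include <stdint.h>
#include <string.h>

/* Illustrative sketch of the #else branches above: unaligned input is
 * copied into an aligned buffer before the NEON load; sufficiently
 * aligned input is loaded directly. */
static uint8x16_t
load_block_sketch(const unsigned char *p)
{
    if ((uintptr_t)p & 0x7) {
        unsigned char buf[16] __attribute__((aligned(16)));
        memcpy(buf, p, 16);
        return vld1q_u8(buf);
    }
    return vld1q_u8(p);
}

The store side in the patch mirrors this: vst1q_u8 into the aligned buffer, then memcpy out to the unaligned destination.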