Diffstat (limited to 'security/nss/lib/freebl/intel-gcm.s')
-rw-r--r-- | security/nss/lib/freebl/intel-gcm.s | 1340
1 file changed, 1340 insertions, 0 deletions
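For orientation before the diff body: the new file exports five AES-GCM entry points plus the GFMUL helper. The prototypes below are transcribed from the section comments inside intel-gcm.s itself and are collected here only as a reading aid, a sketch rather than the authoritative declarations (those live in the NSS headers; note also that the file's own comments for intel_aes_gcmDEC describe parameter 1 as the ciphertext input even though the prototype comment names it PT, so argument naming and const-ness should be checked against the headers).

/* Prototype sketch, copied from the comment headers in intel-gcm.s below.
 * For orientation only; the NSS headers hold the authoritative versions. */
#include <stdint.h>

/* Generates the H table: powers of the hash key H plus their Karatsuba halves. */
void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);

/* Authenticate only: folds the AAD into the running GHASH value at Tp. */
void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);

/* Encrypt/decrypt len bytes while updating the GHASH state and counter kept in Gctx. */
void intel_aes_gcmENC(uint8_t *PT, uint8_t *CT, void *Gctx, uint64_t len);
void intel_aes_gcmDEC(uint8_t *PT, uint8_t *CT, void *Gctx, uint64_t len);

/* Generates the final GCM tag from the accumulated GHASH value. */
void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen,
                      uint8_t *X0, uint8_t *TAG);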
diff --git a/security/nss/lib/freebl/intel-gcm.s b/security/nss/lib/freebl/intel-gcm.s new file mode 100644 index 000000000..1a3106091 --- /dev/null +++ b/security/nss/lib/freebl/intel-gcm.s @@ -0,0 +1,1340 @@ +# LICENSE: +# This submission to NSS is to be made available under the terms of the +# Mozilla Public License, v. 2.0. You can obtain one at http: +# //mozilla.org/MPL/2.0/. +################################################################################ +# Copyright(c) 2012, Intel Corp. + +.align 16 +.Lone: +.quad 1,0 +.Ltwo: +.quad 2,0 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lshuff_mask: +.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +.Lpoly: +.quad 0x1, 0xc200000000000000 + + +################################################################################ +# Generates the final GCM tag +# void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG); +.type intel_aes_gcmTAG,@function +.globl intel_aes_gcmTAG +.align 16 +intel_aes_gcmTAG: + +.set Htbl, %rdi +.set Tp, %rsi +.set Mlen, %rdx +.set Alen, %rcx +.set X0, %r8 +.set TAG, %r9 + +.set T,%xmm0 +.set TMP0,%xmm1 + + vmovdqu (Tp), T + vpshufb .Lbswap_mask(%rip), T, T + vpxor TMP0, TMP0, TMP0 + shl $3, Mlen + shl $3, Alen + vpinsrq $0, Mlen, TMP0, TMP0 + vpinsrq $1, Alen, TMP0, TMP0 + vpxor TMP0, T, T + vmovdqu (Htbl), TMP0 + call GFMUL + vpshufb .Lbswap_mask(%rip), T, T + vpxor (X0), T, T + vmovdqu T, (TAG) + +ret +.size intel_aes_gcmTAG, .-intel_aes_gcmTAG +################################################################################ +# Generates the H table +# void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR); +.type intel_aes_gcmINIT,@function +.globl intel_aes_gcmINIT +.align 16 +intel_aes_gcmINIT: + +.set Htbl, %rdi +.set KS, %rsi +.set NR, %edx + +.set T,%xmm0 +.set TMP0,%xmm1 + +CALCULATE_POWERS_OF_H: + vmovdqu 16*0(KS), T + vaesenc 16*1(KS), T, T + vaesenc 16*2(KS), T, T + vaesenc 16*3(KS), T, T + vaesenc 16*4(KS), T, T + vaesenc 16*5(KS), T, T + vaesenc 16*6(KS), T, T + vaesenc 16*7(KS), T, T + vaesenc 16*8(KS), T, T + vaesenc 16*9(KS), T, T + vmovdqu 16*10(KS), TMP0 + cmp $10, NR + je .LH0done + vaesenc 16*10(KS), T, T + vaesenc 16*11(KS), T, T + vmovdqu 16*12(KS), TMP0 + cmp $12, NR + je .LH0done + vaesenc 16*12(KS), T, T + vaesenc 16*13(KS), T, T + vmovdqu 16*14(KS), TMP0 + +.LH0done: + vaesenclast TMP0, T, T + + vpshufb .Lbswap_mask(%rip), T, T + + vmovdqu T, TMP0 + # Calculate H` = GFMUL(H, 2) + vpsrld $7 , T , %xmm3 + vmovdqu .Lshuff_mask(%rip), %xmm4 + vpshufb %xmm4, %xmm3 , %xmm3 + movq $0xff00 , %rax + vmovq %rax, %xmm4 + vpshufb %xmm3, %xmm4 , %xmm4 + vmovdqu .Lpoly(%rip), %xmm5 + vpand %xmm4, %xmm5, %xmm5 + vpsrld $31, T, %xmm3 + vpslld $1, T, %xmm4 + vpslldq $4, %xmm3, %xmm3 + vpxor %xmm3, %xmm4, T #xmm1 holds now p(x)<<1 + + #adding p(x)<<1 to xmm5 + vpxor %xmm5, T , T + vmovdqu T, TMP0 + vmovdqu T, (Htbl) # H * 2 + call GFMUL + vmovdqu T, 16(Htbl) # H^2 * 2 + call GFMUL + vmovdqu T, 32(Htbl) # H^3 * 2 + call GFMUL + vmovdqu T, 48(Htbl) # H^4 * 2 + call GFMUL + vmovdqu T, 64(Htbl) # H^5 * 2 + call GFMUL + vmovdqu T, 80(Htbl) # H^6 * 2 + call GFMUL + vmovdqu T, 96(Htbl) # H^7 * 2 + call GFMUL + vmovdqu T, 112(Htbl) # H^8 * 2 + + # Precalculations for the reduce 4 step + vpshufd $78, (Htbl), %xmm8 + vpshufd $78, 16(Htbl), %xmm9 + vpshufd $78, 32(Htbl), %xmm10 + vpshufd $78, 48(Htbl), %xmm11 + vpshufd $78, 64(Htbl), %xmm12 + vpshufd $78, 80(Htbl), %xmm13 + vpshufd $78, 96(Htbl), %xmm14 + vpshufd $78, 112(Htbl), %xmm15 
+ + vpxor (Htbl), %xmm8, %xmm8 + vpxor 16(Htbl), %xmm9, %xmm9 + vpxor 32(Htbl), %xmm10, %xmm10 + vpxor 48(Htbl), %xmm11, %xmm11 + vpxor 64(Htbl), %xmm12, %xmm12 + vpxor 80(Htbl), %xmm13, %xmm13 + vpxor 96(Htbl), %xmm14, %xmm14 + vpxor 112(Htbl), %xmm15, %xmm15 + + vmovdqu %xmm8, 128(Htbl) + vmovdqu %xmm9, 144(Htbl) + vmovdqu %xmm10, 160(Htbl) + vmovdqu %xmm11, 176(Htbl) + vmovdqu %xmm12, 192(Htbl) + vmovdqu %xmm13, 208(Htbl) + vmovdqu %xmm14, 224(Htbl) + vmovdqu %xmm15, 240(Htbl) + + ret +.size intel_aes_gcmINIT, .-intel_aes_gcmINIT +################################################################################ +# Authenticate only +# void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp); + +.globl intel_aes_gcmAAD +.type intel_aes_gcmAAD,@function +.align 16 +intel_aes_gcmAAD: + +.set DATA, %xmm0 +.set T, %xmm1 +.set BSWAP_MASK, %xmm2 +.set TMP0, %xmm3 +.set TMP1, %xmm4 +.set TMP2, %xmm5 +.set TMP3, %xmm6 +.set TMP4, %xmm7 +.set Xhi, %xmm9 + +.set Htbl, %rdi +.set inp, %rsi +.set len, %rdx +.set Tp, %rcx + +.set hlp0, %r11 + +.macro KARATSUBA_AAD i + vpclmulqdq $0x00, 16*\i(Htbl), DATA, TMP3 + vpxor TMP3, TMP0, TMP0 + vpclmulqdq $0x11, 16*\i(Htbl), DATA, TMP3 + vpxor TMP3, TMP1, TMP1 + vpshufd $78, DATA, TMP3 + vpxor DATA, TMP3, TMP3 + vpclmulqdq $0x00, 16*(\i+8)(Htbl), TMP3, TMP3 + vpxor TMP3, TMP2, TMP2 +.endm + + test len, len + jnz .LbeginAAD + ret + +.LbeginAAD: + + push hlp0 + vzeroupper + + vmovdqa .Lbswap_mask(%rip), BSWAP_MASK + + vpxor Xhi, Xhi, Xhi + + vmovdqu (Tp),T + vpshufb BSWAP_MASK,T,T + + # we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first + mov len, hlp0 + and $~-128, hlp0 + + jz .Lmod_loop + + sub hlp0, len + sub $16, hlp0 + + #hash first prefix block + vmovdqu (inp), DATA + vpshufb BSWAP_MASK, DATA, DATA + vpxor T, DATA, DATA + + vpclmulqdq $0x00, (Htbl, hlp0), DATA, TMP0 + vpclmulqdq $0x11, (Htbl, hlp0), DATA, TMP1 + vpshufd $78, DATA, TMP2 + vpxor DATA, TMP2, TMP2 + vpclmulqdq $0x00, 16*8(Htbl, hlp0), TMP2, TMP2 + + lea 16(inp), inp + test hlp0, hlp0 + jnz .Lpre_loop + jmp .Lred1 + + #hash remaining prefix bocks (up to 7 total prefix blocks) +.align 64 +.Lpre_loop: + + sub $16, hlp0 + + vmovdqu (inp),DATA # next data block + vpshufb BSWAP_MASK,DATA,DATA + + vpclmulqdq $0x00, (Htbl,hlp0), DATA, TMP3 + vpxor TMP3, TMP0, TMP0 + vpclmulqdq $0x11, (Htbl,hlp0), DATA, TMP3 + vpxor TMP3, TMP1, TMP1 + vpshufd $78, DATA, TMP3 + vpxor DATA, TMP3, TMP3 + vpclmulqdq $0x00, 16*8(Htbl,hlp0), TMP3, TMP3 + vpxor TMP3, TMP2, TMP2 + + test hlp0, hlp0 + + lea 16(inp), inp + + jnz .Lpre_loop + +.Lred1: + vpxor TMP0, TMP2, TMP2 + vpxor TMP1, TMP2, TMP2 + vpsrldq $8, TMP2, TMP3 + vpslldq $8, TMP2, TMP2 + + vpxor TMP3, TMP1, Xhi + vpxor TMP2, TMP0, T + +.align 64 +.Lmod_loop: + sub $0x80, len + jb .Ldone + + vmovdqu 16*7(inp),DATA # Ii + vpshufb BSWAP_MASK,DATA,DATA + + vpclmulqdq $0x00, (Htbl), DATA, TMP0 + vpclmulqdq $0x11, (Htbl), DATA, TMP1 + vpshufd $78, DATA, TMP2 + vpxor DATA, TMP2, TMP2 + vpclmulqdq $0x00, 16*8(Htbl), TMP2, TMP2 + ######################################################### + vmovdqu 16*6(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + KARATSUBA_AAD 1 + ######################################################### + vmovdqu 16*5(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + + vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 1a + vpalignr $8, T, T, T + + KARATSUBA_AAD 2 + + vpxor TMP4, T, T #reduction stage 1b + ######################################################### + 
vmovdqu 16*4(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + + KARATSUBA_AAD 3 + ######################################################### + vmovdqu 16*3(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + + vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 2a + vpalignr $8, T, T, T + + KARATSUBA_AAD 4 + + vpxor TMP4, T, T #reduction stage 2b + ######################################################### + vmovdqu 16*2(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + + KARATSUBA_AAD 5 + + vpxor Xhi, T, T #reduction finalize + ######################################################### + vmovdqu 16*1(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + + KARATSUBA_AAD 6 + ######################################################### + vmovdqu 16*0(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + vpxor T,DATA,DATA + + KARATSUBA_AAD 7 + ######################################################### + vpxor TMP0, TMP2, TMP2 # karatsuba fixup + vpxor TMP1, TMP2, TMP2 + vpsrldq $8, TMP2, TMP3 + vpslldq $8, TMP2, TMP2 + + vpxor TMP3, TMP1, Xhi + vpxor TMP2, TMP0, T + + lea 16*8(inp), inp + jmp .Lmod_loop + ######################################################### + +.Ldone: + vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3 + vpalignr $8, T, T, T + vpxor TMP3, T, T + + vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3 + vpalignr $8, T, T, T + vpxor TMP3, T, T + + vpxor Xhi, T, T + +.Lsave: + vpshufb BSWAP_MASK,T, T + vmovdqu T,(Tp) + vzeroupper + + pop hlp0 + ret +.size intel_aes_gcmAAD,.-intel_aes_gcmAAD + +################################################################################ +# Encrypt and Authenticate +# void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len); +.type intel_aes_gcmENC,@function +.globl intel_aes_gcmENC +.align 16 +intel_aes_gcmENC: + +.set PT,%rdi +.set CT,%rsi +.set Htbl, %rdx +.set len, %rcx +.set KS,%r9 +.set NR,%r10d + +.set Gctx, %rdx + +.set T,%xmm0 +.set TMP0,%xmm1 +.set TMP1,%xmm2 +.set TMP2,%xmm3 +.set TMP3,%xmm4 +.set TMP4,%xmm5 +.set TMP5,%xmm6 +.set CTR0,%xmm7 +.set CTR1,%xmm8 +.set CTR2,%xmm9 +.set CTR3,%xmm10 +.set CTR4,%xmm11 +.set CTR5,%xmm12 +.set CTR6,%xmm13 +.set CTR7,%xmm14 +.set CTR,%xmm15 + +.macro ROUND i + vmovdqu \i*16(KS), TMP3 + vaesenc TMP3, CTR0, CTR0 + vaesenc TMP3, CTR1, CTR1 + vaesenc TMP3, CTR2, CTR2 + vaesenc TMP3, CTR3, CTR3 + vaesenc TMP3, CTR4, CTR4 + vaesenc TMP3, CTR5, CTR5 + vaesenc TMP3, CTR6, CTR6 + vaesenc TMP3, CTR7, CTR7 +.endm + +.macro ROUNDMUL i + + vmovdqu \i*16(%rsp), TMP5 + vmovdqu \i*16(KS), TMP3 + + vaesenc TMP3, CTR0, CTR0 + vaesenc TMP3, CTR1, CTR1 + vaesenc TMP3, CTR2, CTR2 + vaesenc TMP3, CTR3, CTR3 + + vpshufd $78, TMP5, TMP4 + vpxor TMP5, TMP4, TMP4 + + vaesenc TMP3, CTR4, CTR4 + vaesenc TMP3, CTR5, CTR5 + vaesenc TMP3, CTR6, CTR6 + vaesenc TMP3, CTR7, CTR7 + + vpclmulqdq $0x00, 128+\i*16(Htbl), TMP4, TMP3 + vpxor TMP3, TMP0, TMP0 + vmovdqa \i*16(Htbl), TMP4 + vpclmulqdq $0x11, TMP4, TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, TMP4, TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + +.endm + +.macro KARATSUBA i + vmovdqu \i*16(%rsp), TMP5 + + vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3 + vpxor TMP3, TMP0, TMP0 +.endm + + test len, len + jnz .Lbegin + ret + +.Lbegin: + + vzeroupper + push %rbp + push %rbx + + movq %rsp, %rbp + sub $128, %rsp + andq $-16, %rsp + + vmovdqu 288(Gctx), CTR + vmovdqu 272(Gctx), T + mov 304(Gctx), KS + mov 4(KS), NR + lea 48(KS), KS + + 
vpshufb .Lbswap_mask(%rip), CTR, CTR + vpshufb .Lbswap_mask(%rip), T, T + + cmp $128, len + jb .LDataSingles + +# Encrypt the first eight blocks + sub $128, len + vmovdqa CTR, CTR0 + vpaddd .Lone(%rip), CTR0, CTR1 + vpaddd .Ltwo(%rip), CTR0, CTR2 + vpaddd .Lone(%rip), CTR2, CTR3 + vpaddd .Ltwo(%rip), CTR2, CTR4 + vpaddd .Lone(%rip), CTR4, CTR5 + vpaddd .Ltwo(%rip), CTR4, CTR6 + vpaddd .Lone(%rip), CTR6, CTR7 + vpaddd .Ltwo(%rip), CTR6, CTR + + vpshufb .Lbswap_mask(%rip), CTR0, CTR0 + vpshufb .Lbswap_mask(%rip), CTR1, CTR1 + vpshufb .Lbswap_mask(%rip), CTR2, CTR2 + vpshufb .Lbswap_mask(%rip), CTR3, CTR3 + vpshufb .Lbswap_mask(%rip), CTR4, CTR4 + vpshufb .Lbswap_mask(%rip), CTR5, CTR5 + vpshufb .Lbswap_mask(%rip), CTR6, CTR6 + vpshufb .Lbswap_mask(%rip), CTR7, CTR7 + + vpxor (KS), CTR0, CTR0 + vpxor (KS), CTR1, CTR1 + vpxor (KS), CTR2, CTR2 + vpxor (KS), CTR3, CTR3 + vpxor (KS), CTR4, CTR4 + vpxor (KS), CTR5, CTR5 + vpxor (KS), CTR6, CTR6 + vpxor (KS), CTR7, CTR7 + + ROUND 1 + ROUND 2 + ROUND 3 + ROUND 4 + ROUND 5 + ROUND 6 + ROUND 7 + ROUND 8 + ROUND 9 + + vmovdqu 160(KS), TMP5 + cmp $12, NR + jb .LLast1 + + ROUND 10 + ROUND 11 + + vmovdqu 192(KS), TMP5 + cmp $14, NR + jb .LLast1 + + ROUND 12 + ROUND 13 + + vmovdqu 224(KS), TMP5 + +.LLast1: + + vpxor (PT), TMP5, TMP3 + vaesenclast TMP3, CTR0, CTR0 + vpxor 16(PT), TMP5, TMP3 + vaesenclast TMP3, CTR1, CTR1 + vpxor 32(PT), TMP5, TMP3 + vaesenclast TMP3, CTR2, CTR2 + vpxor 48(PT), TMP5, TMP3 + vaesenclast TMP3, CTR3, CTR3 + vpxor 64(PT), TMP5, TMP3 + vaesenclast TMP3, CTR4, CTR4 + vpxor 80(PT), TMP5, TMP3 + vaesenclast TMP3, CTR5, CTR5 + vpxor 96(PT), TMP5, TMP3 + vaesenclast TMP3, CTR6, CTR6 + vpxor 112(PT), TMP5, TMP3 + vaesenclast TMP3, CTR7, CTR7 + + vmovdqu .Lbswap_mask(%rip), TMP3 + + vmovdqu CTR0, (CT) + vpshufb TMP3, CTR0, CTR0 + vmovdqu CTR1, 16(CT) + vpshufb TMP3, CTR1, CTR1 + vmovdqu CTR2, 32(CT) + vpshufb TMP3, CTR2, CTR2 + vmovdqu CTR3, 48(CT) + vpshufb TMP3, CTR3, CTR3 + vmovdqu CTR4, 64(CT) + vpshufb TMP3, CTR4, CTR4 + vmovdqu CTR5, 80(CT) + vpshufb TMP3, CTR5, CTR5 + vmovdqu CTR6, 96(CT) + vpshufb TMP3, CTR6, CTR6 + vmovdqu CTR7, 112(CT) + vpshufb TMP3, CTR7, CTR7 + + lea 128(CT), CT + lea 128(PT), PT + jmp .LDataOctets + +# Encrypt 8 blocks each time while hashing previous 8 blocks +.align 64 +.LDataOctets: + cmp $128, len + jb .LEndOctets + sub $128, len + + vmovdqa CTR7, TMP5 + vmovdqa CTR6, 1*16(%rsp) + vmovdqa CTR5, 2*16(%rsp) + vmovdqa CTR4, 3*16(%rsp) + vmovdqa CTR3, 4*16(%rsp) + vmovdqa CTR2, 5*16(%rsp) + vmovdqa CTR1, 6*16(%rsp) + vmovdqa CTR0, 7*16(%rsp) + + vmovdqa CTR, CTR0 + vpaddd .Lone(%rip), CTR0, CTR1 + vpaddd .Ltwo(%rip), CTR0, CTR2 + vpaddd .Lone(%rip), CTR2, CTR3 + vpaddd .Ltwo(%rip), CTR2, CTR4 + vpaddd .Lone(%rip), CTR4, CTR5 + vpaddd .Ltwo(%rip), CTR4, CTR6 + vpaddd .Lone(%rip), CTR6, CTR7 + vpaddd .Ltwo(%rip), CTR6, CTR + + vmovdqu (KS), TMP4 + vpshufb TMP3, CTR0, CTR0 + vpxor TMP4, CTR0, CTR0 + vpshufb TMP3, CTR1, CTR1 + vpxor TMP4, CTR1, CTR1 + vpshufb TMP3, CTR2, CTR2 + vpxor TMP4, CTR2, CTR2 + vpshufb TMP3, CTR3, CTR3 + vpxor TMP4, CTR3, CTR3 + vpshufb TMP3, CTR4, CTR4 + vpxor TMP4, CTR4, CTR4 + vpshufb TMP3, CTR5, CTR5 + vpxor TMP4, CTR5, CTR5 + vpshufb TMP3, CTR6, CTR6 + vpxor TMP4, CTR6, CTR6 + vpshufb TMP3, CTR7, CTR7 + vpxor TMP4, CTR7, CTR7 + + vmovdqu 16*0(Htbl), TMP3 + vpclmulqdq $0x11, TMP3, TMP5, TMP1 + vpclmulqdq $0x00, TMP3, TMP5, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vmovdqu 128+0*16(Htbl), TMP3 + vpclmulqdq $0x00, TMP3, TMP5, TMP0 + + ROUNDMUL 1 + + ROUNDMUL 2 + + 
ROUNDMUL 3 + + ROUNDMUL 4 + + ROUNDMUL 5 + + ROUNDMUL 6 + + vpxor 7*16(%rsp), T, TMP5 + vmovdqu 7*16(KS), TMP3 + + vaesenc TMP3, CTR0, CTR0 + vaesenc TMP3, CTR1, CTR1 + vaesenc TMP3, CTR2, CTR2 + vaesenc TMP3, CTR3, CTR3 + + vpshufd $78, TMP5, TMP4 + vpxor TMP5, TMP4, TMP4 + + vaesenc TMP3, CTR4, CTR4 + vaesenc TMP3, CTR5, CTR5 + vaesenc TMP3, CTR6, CTR6 + vaesenc TMP3, CTR7, CTR7 + + vpclmulqdq $0x11, 7*16(Htbl), TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, 7*16(Htbl), TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + vpclmulqdq $0x00, 128+7*16(Htbl), TMP4, TMP3 + vpxor TMP3, TMP0, TMP0 + + ROUND 8 + vmovdqa .Lpoly(%rip), TMP5 + + vpxor TMP1, TMP0, TMP0 + vpxor TMP2, TMP0, TMP0 + vpsrldq $8, TMP0, TMP3 + vpxor TMP3, TMP1, TMP4 + vpslldq $8, TMP0, TMP3 + vpxor TMP3, TMP2, T + + vpclmulqdq $0x10, TMP5, T, TMP1 + vpalignr $8, T, T, T + vpxor T, TMP1, T + + ROUND 9 + + vpclmulqdq $0x10, TMP5, T, TMP1 + vpalignr $8, T, T, T + vpxor T, TMP1, T + + vmovdqu 160(KS), TMP5 + cmp $10, NR + jbe .LLast2 + + ROUND 10 + ROUND 11 + + vmovdqu 192(KS), TMP5 + cmp $12, NR + jbe .LLast2 + + ROUND 12 + ROUND 13 + + vmovdqu 224(KS), TMP5 + +.LLast2: + + vpxor (PT), TMP5, TMP3 + vaesenclast TMP3, CTR0, CTR0 + vpxor 16(PT), TMP5, TMP3 + vaesenclast TMP3, CTR1, CTR1 + vpxor 32(PT), TMP5, TMP3 + vaesenclast TMP3, CTR2, CTR2 + vpxor 48(PT), TMP5, TMP3 + vaesenclast TMP3, CTR3, CTR3 + vpxor 64(PT), TMP5, TMP3 + vaesenclast TMP3, CTR4, CTR4 + vpxor 80(PT), TMP5, TMP3 + vaesenclast TMP3, CTR5, CTR5 + vpxor 96(PT), TMP5, TMP3 + vaesenclast TMP3, CTR6, CTR6 + vpxor 112(PT), TMP5, TMP3 + vaesenclast TMP3, CTR7, CTR7 + + vmovdqu .Lbswap_mask(%rip), TMP3 + + vmovdqu CTR0, (CT) + vpshufb TMP3, CTR0, CTR0 + vmovdqu CTR1, 16(CT) + vpshufb TMP3, CTR1, CTR1 + vmovdqu CTR2, 32(CT) + vpshufb TMP3, CTR2, CTR2 + vmovdqu CTR3, 48(CT) + vpshufb TMP3, CTR3, CTR3 + vmovdqu CTR4, 64(CT) + vpshufb TMP3, CTR4, CTR4 + vmovdqu CTR5, 80(CT) + vpshufb TMP3, CTR5, CTR5 + vmovdqu CTR6, 96(CT) + vpshufb TMP3, CTR6, CTR6 + vmovdqu CTR7,112(CT) + vpshufb TMP3, CTR7, CTR7 + + vpxor TMP4, T, T + + lea 128(CT), CT + lea 128(PT), PT + jmp .LDataOctets + +.LEndOctets: + + vmovdqa CTR7, TMP5 + vmovdqa CTR6, 1*16(%rsp) + vmovdqa CTR5, 2*16(%rsp) + vmovdqa CTR4, 3*16(%rsp) + vmovdqa CTR3, 4*16(%rsp) + vmovdqa CTR2, 5*16(%rsp) + vmovdqa CTR1, 6*16(%rsp) + vmovdqa CTR0, 7*16(%rsp) + + vmovdqu 16*0(Htbl), TMP3 + vpclmulqdq $0x11, TMP3, TMP5, TMP1 + vpclmulqdq $0x00, TMP3, TMP5, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vmovdqu 128+0*16(Htbl), TMP3 + vpclmulqdq $0x00, TMP3, TMP5, TMP0 + + KARATSUBA 1 + KARATSUBA 2 + KARATSUBA 3 + KARATSUBA 4 + KARATSUBA 5 + KARATSUBA 6 + + vmovdqu 7*16(%rsp), TMP5 + vpxor T, TMP5, TMP5 + vmovdqu 16*7(Htbl), TMP4 + vpclmulqdq $0x11, TMP4, TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, TMP4, TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vmovdqu 128+7*16(Htbl), TMP4 + vpclmulqdq $0x00, TMP4, TMP5, TMP3 + vpxor TMP3, TMP0, TMP0 + + vpxor TMP1, TMP0, TMP0 + vpxor TMP2, TMP0, TMP0 + + vpsrldq $8, TMP0, TMP3 + vpxor TMP3, TMP1, TMP4 + vpslldq $8, TMP0, TMP3 + vpxor TMP3, TMP2, T + + vmovdqa .Lpoly(%rip), TMP2 + + vpalignr $8, T, T, TMP1 + vpclmulqdq $0x10, TMP2, T, T + vpxor T, TMP1, T + + vpalignr $8, T, T, TMP1 + vpclmulqdq $0x10, TMP2, T, T + vpxor T, TMP1, T + + vpxor TMP4, T, T + +#Here we encrypt any remaining whole block +.LDataSingles: + + cmp $16, len + jb .LDataTail + sub $16, len + + vpshufb .Lbswap_mask(%rip), CTR, TMP1 + vpaddd .Lone(%rip), CTR, CTR + + 
vpxor (KS), TMP1, TMP1 + vaesenc 16*1(KS), TMP1, TMP1 + vaesenc 16*2(KS), TMP1, TMP1 + vaesenc 16*3(KS), TMP1, TMP1 + vaesenc 16*4(KS), TMP1, TMP1 + vaesenc 16*5(KS), TMP1, TMP1 + vaesenc 16*6(KS), TMP1, TMP1 + vaesenc 16*7(KS), TMP1, TMP1 + vaesenc 16*8(KS), TMP1, TMP1 + vaesenc 16*9(KS), TMP1, TMP1 + vmovdqu 16*10(KS), TMP2 + cmp $10, NR + je .LLast3 + vaesenc 16*10(KS), TMP1, TMP1 + vaesenc 16*11(KS), TMP1, TMP1 + vmovdqu 16*12(KS), TMP2 + cmp $12, NR + je .LLast3 + vaesenc 16*12(KS), TMP1, TMP1 + vaesenc 16*13(KS), TMP1, TMP1 + vmovdqu 16*14(KS), TMP2 + +.LLast3: + vaesenclast TMP2, TMP1, TMP1 + + vpxor (PT), TMP1, TMP1 + vmovdqu TMP1, (CT) + addq $16, CT + addq $16, PT + + vpshufb .Lbswap_mask(%rip), TMP1, TMP1 + vpxor TMP1, T, T + vmovdqu (Htbl), TMP0 + call GFMUL + + jmp .LDataSingles + +#Here we encypt the final partial block, if there is one +.LDataTail: + + test len, len + jz DATA_END +# First prepare the counter block + vpshufb .Lbswap_mask(%rip), CTR, TMP1 + vpaddd .Lone(%rip), CTR, CTR + + vpxor (KS), TMP1, TMP1 + vaesenc 16*1(KS), TMP1, TMP1 + vaesenc 16*2(KS), TMP1, TMP1 + vaesenc 16*3(KS), TMP1, TMP1 + vaesenc 16*4(KS), TMP1, TMP1 + vaesenc 16*5(KS), TMP1, TMP1 + vaesenc 16*6(KS), TMP1, TMP1 + vaesenc 16*7(KS), TMP1, TMP1 + vaesenc 16*8(KS), TMP1, TMP1 + vaesenc 16*9(KS), TMP1, TMP1 + vmovdqu 16*10(KS), TMP2 + cmp $10, NR + je .LLast4 + vaesenc 16*10(KS), TMP1, TMP1 + vaesenc 16*11(KS), TMP1, TMP1 + vmovdqu 16*12(KS), TMP2 + cmp $12, NR + je .LLast4 + vaesenc 16*12(KS), TMP1, TMP1 + vaesenc 16*13(KS), TMP1, TMP1 + vmovdqu 16*14(KS), TMP2 + +.LLast4: + vaesenclast TMP2, TMP1, TMP1 +#Zero a temp location + vpxor TMP2, TMP2, TMP2 + vmovdqa TMP2, (%rsp) + +# Copy the required bytes only (could probably use rep movsb) + xor KS, KS +.LEncCpy: + cmp KS, len + je .LEncCpyEnd + movb (PT, KS, 1), %r8b + movb %r8b, (%rsp, KS, 1) + inc KS + jmp .LEncCpy +.LEncCpyEnd: +# Xor with the counter block + vpxor (%rsp), TMP1, TMP0 +# Again, store at temp location + vmovdqa TMP0, (%rsp) +# Copy only the required bytes to CT, and zero the rest for the hash + xor KS, KS +.LEncCpy2: + cmp KS, len + je .LEncCpy3 + movb (%rsp, KS, 1), %r8b + movb %r8b, (CT, KS, 1) + inc KS + jmp .LEncCpy2 +.LEncCpy3: + cmp $16, KS + je .LEndCpy3 + movb $0, (%rsp, KS, 1) + inc KS + jmp .LEncCpy3 +.LEndCpy3: + vmovdqa (%rsp), TMP0 + + vpshufb .Lbswap_mask(%rip), TMP0, TMP0 + vpxor TMP0, T, T + vmovdqu (Htbl), TMP0 + call GFMUL + +DATA_END: + + vpshufb .Lbswap_mask(%rip), T, T + vpshufb .Lbswap_mask(%rip), CTR, CTR + vmovdqu T, 272(Gctx) + vmovdqu CTR, 288(Gctx) + + movq %rbp, %rsp + + popq %rbx + popq %rbp + ret + .size intel_aes_gcmENC, .-intel_aes_gcmENC + +######################### +# Decrypt and Authenticate +# void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len); +.type intel_aes_gcmDEC,@function +.globl intel_aes_gcmDEC +.align 16 +intel_aes_gcmDEC: +# parameter 1: CT # input +# parameter 2: PT # output +# parameter 3: %rdx # Gctx +# parameter 4: %rcx # len + +.macro DEC_KARATSUBA i + vmovdqu (7-\i)*16(CT), TMP5 + vpshufb .Lbswap_mask(%rip), TMP5, TMP5 + + vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3 + vpxor TMP3, TMP0, TMP0 +.endm + +.set PT,%rsi +.set CT,%rdi +.set Htbl, %rdx +.set len, %rcx +.set KS,%r9 +.set NR,%r10d + +.set Gctx, %rdx + +.set T,%xmm0 +.set TMP0,%xmm1 +.set TMP1,%xmm2 +.set TMP2,%xmm3 
+.set TMP3,%xmm4 +.set TMP4,%xmm5 +.set TMP5,%xmm6 +.set CTR0,%xmm7 +.set CTR1,%xmm8 +.set CTR2,%xmm9 +.set CTR3,%xmm10 +.set CTR4,%xmm11 +.set CTR5,%xmm12 +.set CTR6,%xmm13 +.set CTR7,%xmm14 +.set CTR,%xmm15 + + test len, len + jnz .LbeginDec + ret + +.LbeginDec: + + pushq %rbp + pushq %rbx + movq %rsp, %rbp + sub $128, %rsp + andq $-16, %rsp + vmovdqu 288(Gctx), CTR + vmovdqu 272(Gctx), T + mov 304(Gctx), KS + mov 4(KS), NR + lea 48(KS), KS + + vpshufb .Lbswap_mask(%rip), CTR, CTR + vpshufb .Lbswap_mask(%rip), T, T + + vmovdqu .Lbswap_mask(%rip), TMP3 + jmp .LDECOctets + +# Decrypt 8 blocks each time while hashing them at the same time +.align 64 +.LDECOctets: + + cmp $128, len + jb .LDECSingles + sub $128, len + + vmovdqa CTR, CTR0 + vpaddd .Lone(%rip), CTR0, CTR1 + vpaddd .Ltwo(%rip), CTR0, CTR2 + vpaddd .Lone(%rip), CTR2, CTR3 + vpaddd .Ltwo(%rip), CTR2, CTR4 + vpaddd .Lone(%rip), CTR4, CTR5 + vpaddd .Ltwo(%rip), CTR4, CTR6 + vpaddd .Lone(%rip), CTR6, CTR7 + vpaddd .Ltwo(%rip), CTR6, CTR + + vpshufb TMP3, CTR0, CTR0 + vpshufb TMP3, CTR1, CTR1 + vpshufb TMP3, CTR2, CTR2 + vpshufb TMP3, CTR3, CTR3 + vpshufb TMP3, CTR4, CTR4 + vpshufb TMP3, CTR5, CTR5 + vpshufb TMP3, CTR6, CTR6 + vpshufb TMP3, CTR7, CTR7 + + vmovdqu (KS), TMP3 + vpxor TMP3, CTR0, CTR0 + vpxor TMP3, CTR1, CTR1 + vpxor TMP3, CTR2, CTR2 + vpxor TMP3, CTR3, CTR3 + vpxor TMP3, CTR4, CTR4 + vpxor TMP3, CTR5, CTR5 + vpxor TMP3, CTR6, CTR6 + vpxor TMP3, CTR7, CTR7 + + vmovdqu 7*16(CT), TMP5 + vpshufb .Lbswap_mask(%rip), TMP5, TMP5 + vmovdqu 16*0(Htbl), TMP3 + vpclmulqdq $0x11, TMP3, TMP5, TMP1 + vpclmulqdq $0x00, TMP3, TMP5, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vmovdqu 128+0*16(Htbl), TMP3 + vpclmulqdq $0x00, TMP3, TMP5, TMP0 + + ROUND 1 + DEC_KARATSUBA 1 + + ROUND 2 + DEC_KARATSUBA 2 + + ROUND 3 + DEC_KARATSUBA 3 + + ROUND 4 + DEC_KARATSUBA 4 + + ROUND 5 + DEC_KARATSUBA 5 + + ROUND 6 + DEC_KARATSUBA 6 + + ROUND 7 + + vmovdqu 0*16(CT), TMP5 + vpshufb .Lbswap_mask(%rip), TMP5, TMP5 + vpxor T, TMP5, TMP5 + vmovdqu 16*7(Htbl), TMP4 + + vpclmulqdq $0x11, TMP4, TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, TMP4, TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vmovdqu 128+7*16(Htbl), TMP4 + + vpclmulqdq $0x00, TMP4, TMP5, TMP3 + vpxor TMP3, TMP0, TMP0 + + ROUND 8 + + vpxor TMP1, TMP0, TMP0 + vpxor TMP2, TMP0, TMP0 + + vpsrldq $8, TMP0, TMP3 + vpxor TMP3, TMP1, TMP4 + vpslldq $8, TMP0, TMP3 + vpxor TMP3, TMP2, T + vmovdqa .Lpoly(%rip), TMP2 + + vpalignr $8, T, T, TMP1 + vpclmulqdq $0x10, TMP2, T, T + vpxor T, TMP1, T + + ROUND 9 + + vpalignr $8, T, T, TMP1 + vpclmulqdq $0x10, TMP2, T, T + vpxor T, TMP1, T + + vmovdqu 160(KS), TMP5 + cmp $10, NR + + jbe .LDECLast1 + + ROUND 10 + ROUND 11 + + vmovdqu 192(KS), TMP5 + cmp $12, NR + + jbe .LDECLast1 + + ROUND 12 + ROUND 13 + + vmovdqu 224(KS), TMP5 + +.LDECLast1: + + vpxor (CT), TMP5, TMP3 + vaesenclast TMP3, CTR0, CTR0 + vpxor 16(CT), TMP5, TMP3 + vaesenclast TMP3, CTR1, CTR1 + vpxor 32(CT), TMP5, TMP3 + vaesenclast TMP3, CTR2, CTR2 + vpxor 48(CT), TMP5, TMP3 + vaesenclast TMP3, CTR3, CTR3 + vpxor 64(CT), TMP5, TMP3 + vaesenclast TMP3, CTR4, CTR4 + vpxor 80(CT), TMP5, TMP3 + vaesenclast TMP3, CTR5, CTR5 + vpxor 96(CT), TMP5, TMP3 + vaesenclast TMP3, CTR6, CTR6 + vpxor 112(CT), TMP5, TMP3 + vaesenclast TMP3, CTR7, CTR7 + + vmovdqu .Lbswap_mask(%rip), TMP3 + + vmovdqu CTR0, (PT) + vmovdqu CTR1, 16(PT) + vmovdqu CTR2, 32(PT) + vmovdqu CTR3, 48(PT) + vmovdqu CTR4, 64(PT) + vmovdqu CTR5, 80(PT) + vmovdqu CTR6, 96(PT) + 
vmovdqu CTR7,112(PT) + + vpxor TMP4, T, T + + lea 128(CT), CT + lea 128(PT), PT + jmp .LDECOctets + +#Here we decrypt and hash any remaining whole block +.LDECSingles: + + cmp $16, len + jb .LDECTail + sub $16, len + + vmovdqu (CT), TMP1 + vpshufb .Lbswap_mask(%rip), TMP1, TMP1 + vpxor TMP1, T, T + vmovdqu (Htbl), TMP0 + call GFMUL + + + vpshufb .Lbswap_mask(%rip), CTR, TMP1 + vpaddd .Lone(%rip), CTR, CTR + + vpxor (KS), TMP1, TMP1 + vaesenc 16*1(KS), TMP1, TMP1 + vaesenc 16*2(KS), TMP1, TMP1 + vaesenc 16*3(KS), TMP1, TMP1 + vaesenc 16*4(KS), TMP1, TMP1 + vaesenc 16*5(KS), TMP1, TMP1 + vaesenc 16*6(KS), TMP1, TMP1 + vaesenc 16*7(KS), TMP1, TMP1 + vaesenc 16*8(KS), TMP1, TMP1 + vaesenc 16*9(KS), TMP1, TMP1 + vmovdqu 16*10(KS), TMP2 + cmp $10, NR + je .LDECLast2 + vaesenc 16*10(KS), TMP1, TMP1 + vaesenc 16*11(KS), TMP1, TMP1 + vmovdqu 16*12(KS), TMP2 + cmp $12, NR + je .LDECLast2 + vaesenc 16*12(KS), TMP1, TMP1 + vaesenc 16*13(KS), TMP1, TMP1 + vmovdqu 16*14(KS), TMP2 +.LDECLast2: + vaesenclast TMP2, TMP1, TMP1 + + vpxor (CT), TMP1, TMP1 + vmovdqu TMP1, (PT) + addq $16, CT + addq $16, PT + jmp .LDECSingles + +#Here we decrypt the final partial block, if there is one +.LDECTail: + test len, len + jz .LDEC_END + + vpshufb .Lbswap_mask(%rip), CTR, TMP1 + vpaddd .Lone(%rip), CTR, CTR + + vpxor (KS), TMP1, TMP1 + vaesenc 16*1(KS), TMP1, TMP1 + vaesenc 16*2(KS), TMP1, TMP1 + vaesenc 16*3(KS), TMP1, TMP1 + vaesenc 16*4(KS), TMP1, TMP1 + vaesenc 16*5(KS), TMP1, TMP1 + vaesenc 16*6(KS), TMP1, TMP1 + vaesenc 16*7(KS), TMP1, TMP1 + vaesenc 16*8(KS), TMP1, TMP1 + vaesenc 16*9(KS), TMP1, TMP1 + vmovdqu 16*10(KS), TMP2 + cmp $10, NR + je .LDECLast3 + vaesenc 16*10(KS), TMP1, TMP1 + vaesenc 16*11(KS), TMP1, TMP1 + vmovdqu 16*12(KS), TMP2 + cmp $12, NR + je .LDECLast3 + vaesenc 16*12(KS), TMP1, TMP1 + vaesenc 16*13(KS), TMP1, TMP1 + vmovdqu 16*14(KS), TMP2 + +.LDECLast3: + vaesenclast TMP2, TMP1, TMP1 + + vpxor TMP2, TMP2, TMP2 + vmovdqa TMP2, (%rsp) +# Copy the required bytes only (could probably use rep movsb) + xor KS, KS +.LDecCpy: + cmp KS, len + je .LDecCpy2 + movb (CT, KS, 1), %r8b + movb %r8b, (%rsp, KS, 1) + inc KS + jmp .LDecCpy +.LDecCpy2: + cmp $16, KS + je .LDecCpyEnd + movb $0, (%rsp, KS, 1) + inc KS + jmp .LDecCpy2 +.LDecCpyEnd: +# Xor with the counter block + vmovdqa (%rsp), TMP0 + vpxor TMP0, TMP1, TMP1 +# Again, store at temp location + vmovdqa TMP1, (%rsp) +# Copy only the required bytes to PT, and zero the rest for the hash + xor KS, KS +.LDecCpy3: + cmp KS, len + je .LDecCpyEnd3 + movb (%rsp, KS, 1), %r8b + movb %r8b, (PT, KS, 1) + inc KS + jmp .LDecCpy3 +.LDecCpyEnd3: + vpshufb .Lbswap_mask(%rip), TMP0, TMP0 + vpxor TMP0, T, T + vmovdqu (Htbl), TMP0 + call GFMUL +.LDEC_END: + + vpshufb .Lbswap_mask(%rip), T, T + vpshufb .Lbswap_mask(%rip), CTR, CTR + vmovdqu T, 272(Gctx) + vmovdqu CTR, 288(Gctx) + + movq %rbp, %rsp + + popq %rbx + popq %rbp + ret + .size intel_aes_gcmDEC, .-intel_aes_gcmDEC +######################### +# a = T +# b = TMP0 - remains unchanged +# res = T +# uses also TMP1,TMP2,TMP3,TMP4 +# __m128i GFMUL(__m128i A, __m128i B); +.type GFMUL,@function +.globl GFMUL +GFMUL: + vpclmulqdq $0x00, TMP0, T, TMP1 + vpclmulqdq $0x11, TMP0, T, TMP4 + + vpshufd $78, T, TMP2 + vpshufd $78, TMP0, TMP3 + vpxor T, TMP2, TMP2 + vpxor TMP0, TMP3, TMP3 + + vpclmulqdq $0x00, TMP3, TMP2, TMP2 + vpxor TMP1, TMP2, TMP2 + vpxor TMP4, TMP2, TMP2 + + vpslldq $8, TMP2, TMP3 + vpsrldq $8, TMP2, TMP2 + + vpxor TMP3, TMP1, TMP1 + vpxor TMP2, TMP4, TMP4 + + vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2 + 
vpshufd $78, TMP1, TMP3 + vpxor TMP3, TMP2, TMP1 + + vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2 + vpshufd $78, TMP1, TMP3 + vpxor TMP3, TMP2, TMP1 + + vpxor TMP4, TMP1, T + ret +.size GFMUL, .-GFMUL +
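A closing note on GFMUL, and on the KARATSUBA / ROUNDMUL / DEC_KARATSUBA macros that batch eight such multiplications before a single two-step reduction: the operation implemented is the GHASH multiplication in GF(2^128) from the GCM specification, with each 128-bit product split Karatsuba-style into three PCLMULQDQ multiplications. In conventional notation, and reading the .Lpoly constant 0xc200000000000000 as the usual bit-reflected encoding of the GCM polynomial (my interpretation; the file does not state it):

    X_0 = 0, \quad X_i = (X_{i-1} \oplus B_i) \cdot H \bmod P(x), \quad P(x) = x^{128} + x^7 + x^2 + x + 1

    a \cdot b = a_1 b_1 x^{128} \oplus \big[(a_1 \oplus a_0)(b_1 \oplus b_0) \oplus a_1 b_1 \oplus a_0 b_0\big] x^{64} \oplus a_0 b_0

The three terms above are the three vpclmulqdq results per block; the "karatsuba fixup" xors recover the middle term before the two folds against .Lpoly. The final tag produced by intel_aes_gcmTAG then follows the spec's last GHASH step, with the bit lengths supplied by the two shl $3 / vpinsrq instructions:

    \mathrm{TAG} = E_K(Y_0) \oplus \big( (X_n \oplus (\mathrm{len}(A) \,\|\, \mathrm{len}(C))) \cdot H \big)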