/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

	.text

#define IV_OFFSET 256

/*
 * Warning: the length values used in this module are "unsigned int"
 * in C, which is 32-bit.  When they're passed in registers, use only
 * the low 32 bits, because the top half is unspecified.
 *
 * This is called from C code, so the contents of those bits can
 * depend on the C compiler's optimization decisions.  This means that
 * mistakes might not be obvious in testing if those bits happen to be
 * zero in your build.
 *
 * Exception: 32-bit lea instructions use a 64-bit address because the
 * address size doesn't affect the result, and that form is more
 * compactly encoded and preferred by compilers over a 32-bit address.
 */

/* in %rdi : the key
   in %rsi : buffer for expanded key
*/
	.type intel_aes_encrypt_init_128,@function
	.globl intel_aes_encrypt_init_128
	.align	16
intel_aes_encrypt_init_128:
	movups	(%rdi), %xmm1
	movups	%xmm1, (%rsi)
	leaq	16(%rsi), %rsi
	xorl	%eax, %eax

	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01	/* aeskeygenassist $0x01, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02	/* aeskeygenassist $0x02, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04	/* aeskeygenassist $0x04, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08	/* aeskeygenassist $0x08, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10	/* aeskeygenassist $0x10, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20	/* aeskeygenassist $0x20, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40	/* aeskeygenassist $0x40, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80	/* aeskeygenassist $0x80, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b	/* aeskeygenassist $0x1b, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36	/* aeskeygenassist $0x36, %xmm1, %xmm2 */
	call key_expansion128

	ret
	.size intel_aes_encrypt_init_128, .-intel_aes_encrypt_init_128


/* in %rdi : the key
   in %rsi : buffer for expanded key
*/
	.type intel_aes_decrypt_init_128,@function
	.globl intel_aes_decrypt_init_128
	.align	16
intel_aes_decrypt_init_128:
	movups	(%rdi), %xmm1
	movups	%xmm1, (%rsi)
	leaq	16(%rsi), %rsi
	xorl	%eax, %eax

	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01	/* aeskeygenassist $0x01, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02	/* aeskeygenassist $0x02, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04	/* aeskeygenassist $0x04, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08	/* aeskeygenassist $0x08, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10	/* aeskeygenassist $0x10, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20	/* aeskeygenassist $0x20, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40	/* aeskeygenassist $0x40, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80	/* aeskeygenassist $0x80, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b	/* aeskeygenassist $0x1b, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36	/* aeskeygenassist $0x36, %xmm1, %xmm2 */
	call key_expansion128

	ret
	.size intel_aes_decrypt_init_128, .-intel_aes_decrypt_init_128


	.type key_expansion128,@function
	.align	16
key_expansion128:
	movd	%eax, %xmm3
	pshufd	$0xff, %xmm2, %xmm2
	shufps	$0x10, %xmm1, %xmm3
	pxor	%xmm3, %xmm1
	shufps	$0x8c, %xmm1, %xmm3
	pxor	%xmm2, %xmm1
	pxor	%xmm3, %xmm1
	movdqu	%xmm1, (%rsi)
	addq	$16, %rsi
	ret
	.size key_expansion128, .-key_expansion128


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_encrypt_ecb_128,@function
	.globl intel_aes_encrypt_ecb_128
	.align	16
intel_aes_encrypt_ecb_128:
	movdqu	(%rdi), %xmm2
	movdqu	160(%rdi), %xmm12
	xor	%eax, %eax
//	cmpl	$8*16, %r9d
	cmpl	$128, %r9d
	jb	1f
//	leal	-8*16(%r9), %r11d
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm2, %xmm3
	pxor	%xmm2, %xmm4
	pxor	%xmm2, %xmm5
	pxor	%xmm2, %xmm6
	pxor	%xmm2, %xmm7
	pxor	%xmm2, %xmm8
	pxor	%xmm2, %xmm9
	pxor	%xmm2, %xmm10

// complete loop unrolling
	movdqu 16(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 144(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xdc 	/* aesenclast 	%xmm12, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe4 	/* aesenclast 	%xmm12, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xec 	/* aesenclast 	%xmm12, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf4 	/* aesenclast 	%xmm12, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xfc 	/* aesenclast 	%xmm12, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc4 	/* aesenclast 	%xmm12, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xcc 	/* aesenclast 	%xmm12, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd4 	/* aesenclast 	%xmm12, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
//	addl	$8*16, %eax
	addl	$128, %eax
	cmpl	%r11d, %eax
	jbe	2b
1:	cmpl	%eax, %r9d
	je	5f

	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm2, %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	xor	%eax, %eax
	ret
	.size intel_aes_encrypt_ecb_128, .-intel_aes_encrypt_ecb_128


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_decrypt_ecb_128,@function
	.globl intel_aes_decrypt_ecb_128
	.align	16
intel_aes_decrypt_ecb_128:
	movdqu	(%rdi), %xmm2
	movdqu	160(%rdi), %xmm12
	xorl	%eax, %eax
//	cmpl	$8*16, %r9d
	cmpl	$128, %r9d
	jb	1f
//	leal	-8*16(%r9), %r11d
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm12, %xmm3
	pxor	%xmm12, %xmm4
	pxor	%xmm12, %xmm5
	pxor	%xmm12, %xmm6
	pxor	%xmm12, %xmm7
	pxor	%xmm12, %xmm8
	pxor	%xmm12, %xmm9
	pxor	%xmm12, %xmm10

// complete loop unrolling
	movdqu 144(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 16(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
//	addl	$8*16, %eax
	addl	$128, %eax
	cmpl	%r11d, %eax
	jbe	2b
1:	cmpl	%eax, %r9d
	je	5f

	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm12, %xmm1
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	xor	%eax, %eax
	ret
	.size intel_aes_decrypt_ecb_128, .-intel_aes_decrypt_ecb_128


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_encrypt_cbc_128,@function
	.globl intel_aes_encrypt_cbc_128
	.align	16
intel_aes_encrypt_cbc_128:
	testl	%r9d, %r9d
	je	2f

//	leaq	IV_OFFSET(%rdi), %rdx
	leaq	256(%rdi), %rdx

	movdqu	(%rdx), %xmm0
	movdqu	(%rdi), %xmm2
	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11
	movdqu	160(%rdi), %xmm12

	xorl	%eax, %eax
1:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm0, %xmm1
	pxor	%xmm2, %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmma, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmmb, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	movdqa	%xmm1, %xmm0
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	1b

	movdqu	%xmm0, (%rdx)

2:	xor	%eax, %eax
	ret
	.size intel_aes_encrypt_cbc_128, .-intel_aes_encrypt_cbc_128


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_decrypt_cbc_128,@function
	.globl intel_aes_decrypt_cbc_128
	.align	16
intel_aes_decrypt_cbc_128:
//	leaq	IV_OFFSET(%rdi), %rdx
	leaq	256(%rdi), %rdx

	movdqu	(%rdx), %xmm0   /* iv */
	movdqu	(%rdi), %xmm2   /* first key block */
	movdqu	160(%rdi), %xmm12 /* last key block */
	xorl	%eax, %eax
	cmpl	$128, %r9d
	jb	1f
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3 /* 1st data block */
	movdqu	16(%r8, %rax), %xmm4 /* 2d data block */
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm12, %xmm3
	pxor	%xmm12, %xmm4
	pxor	%xmm12, %xmm5
	pxor	%xmm12, %xmm6
	pxor	%xmm12, %xmm7
	pxor	%xmm12, %xmm8
	pxor	%xmm12, %xmm9
	pxor	%xmm12, %xmm10

// complete loop unrolling
	movdqu 144(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 16(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */

 	pxor	%xmm0, %xmm3
	movdqu	(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm4
	movdqu	16(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm5
	movdqu	32(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm6
	movdqu	48(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm7
	movdqu	64(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm8
	movdqu	80(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm9
	movdqu	96(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm10
	movdqu	112(%r8, %rax), %xmm0
	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
	addl	$128, %eax
	cmpl	%r11d, %eax
	jbe	2b
1:	cmpl	%eax, %r9d
	je	5f

	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11

4:	movdqu	(%r8, %rax), %xmm1
	movdqa	%xmm1, %xmm13
	pxor	%xmm12, %xmm1
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
	pxor	%xmm0, %xmm1
	movdqu	%xmm1, (%rsi, %rax)
	movdqa	%xmm13, %xmm0
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	movdqu	%xmm0, (%rdx)

	xor	%eax, %eax
	ret
	.size intel_aes_decrypt_cbc_128, .-intel_aes_decrypt_cbc_128
        
/* in %rdi : the key
   in %rsi : buffer for expanded key
*/
	.type intel_aes_encrypt_init_192,@function
	.globl intel_aes_encrypt_init_192
	.align	16
intel_aes_encrypt_init_192:
	movdqu	(%rdi), %xmm1
	movq	16(%rdi), %xmm3
	movdqu	%xmm1, (%rsi)
	movq	%xmm3, 16(%rsi)
	leaq	24(%rsi), %rsi

	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x80	/* aeskeygenassist $0x80, %xmm3, %xmm2 */
	call key_expansion192

	ret
	.size intel_aes_encrypt_init_192, .-intel_aes_encrypt_init_192


/* in %rdi : the key
   in %rsi : buffer for expanded key
*/
	.type intel_aes_decrypt_init_192,@function
	.globl intel_aes_decrypt_init_192
	.align	16
intel_aes_decrypt_init_192:
	movdqu	(%rdi), %xmm1
	movq	16(%rdi), %xmm3
	movdqu	%xmm1, (%rsi)
	movq	%xmm3, 16(%rsi)
	leaq	24(%rsi), %rsi

	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
	call key_expansion192
	movups	-32(%rsi), %xmm2
	movups	-16(%rsi), %xmm4
	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
	movups	%xmm2, -32(%rsi)
	movups	%xmm4, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -24(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
	call key_expansion192
	movups	-32(%rsi), %xmm2
	movups	-16(%rsi), %xmm4
	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
	movups	%xmm2, -32(%rsi)
	movups	%xmm4, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -24(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
	call key_expansion192
	movups	-32(%rsi), %xmm2
	movups	-16(%rsi), %xmm4
	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
	movups	%xmm2, -32(%rsi)
	movups	%xmm4, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -24(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
	call key_expansion192
	movups	-32(%rsi), %xmm2
	movups	-16(%rsi), %xmm4
	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
	movups	%xmm2, -32(%rsi)
	movups	%xmm4, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x80	/* aeskeygenassist $0x80, %xmm3, %xmm2 */
	call key_expansion192

	ret
	.size intel_aes_decrypt_init_192, .-intel_aes_decrypt_init_192


	.type key_expansion192,@function
	.align	16
key_expansion192:
	pshufd	$0x55, %xmm2, %xmm2
	xor	%eax, %eax
	movd	%eax, %xmm4
	shufps	$0x10, %xmm1, %xmm4
	pxor	%xmm4, %xmm1
	shufps	$0x8c, %xmm1, %xmm4
	pxor	%xmm2, %xmm1
	pxor	%xmm4, %xmm1
	movdqu	%xmm1, (%rsi)
	addq	$16, %rsi

	pshufd	$0xff, %xmm1, %xmm4
	movd	%eax, %xmm5
	shufps	$0x00, %xmm3, %xmm5
	shufps	$0x08, %xmm3, %xmm5
	pxor	%xmm4, %xmm3
	pxor	%xmm5, %xmm3
	movq	%xmm3, (%rsi)
	addq	$8, %rsi
	ret
	.size key_expansion192, .-key_expansion192


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_encrypt_ecb_192,@function
	.globl intel_aes_encrypt_ecb_192
	.align	16
intel_aes_encrypt_ecb_192:
	movdqu	(%rdi), %xmm2
	movdqu	192(%rdi), %xmm14
	xorl	%eax, %eax
//	cmpl	$8*16, %r9d
	cmpl	$128, %r9d
	jb	1f
//	leal	-8*16(%r9), %r11d
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm2, %xmm3
	pxor	%xmm2, %xmm4
	pxor	%xmm2, %xmm5
	pxor	%xmm2, %xmm6
	pxor	%xmm2, %xmm7
	pxor	%xmm2, %xmm8
	pxor	%xmm2, %xmm9
	pxor	%xmm2, %xmm10

// complete loop unrolling
	movdqu 16(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 144(%rdi), %xmm1
	movdqu 160(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 176(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xde	/* aesenclast 	%xmm14, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe6	/* aesenclast 	%xmm14, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xee	/* aesenclast 	%xmm14, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf6	/* aesenclast 	%xmm14, %xmm7 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xfe	/* aesenclast 	%xmm14, %xmm3 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc6	/* aesenclast 	%xmm14, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xce	/* aesenclast 	%xmm14, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd6	/* aesenclast 	%xmm14, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
//	addl	$8*16, %eax
	addl	$128, %eax
	cmpl	%r11d, %eax
	jbe	2b
1:	cmpl	%eax, %r9d
	je	5f

	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11
	movdqu	160(%rdi), %xmm12
	movdqu	176(%rdi), %xmm13

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm2, %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xce	/* aesenclast %xmm14, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	xor	%eax, %eax
	ret
	.size intel_aes_encrypt_ecb_192, .-intel_aes_encrypt_ecb_192


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_decrypt_ecb_192,@function
	.globl intel_aes_decrypt_ecb_192
	.align	16
intel_aes_decrypt_ecb_192:
	movdqu	(%rdi), %xmm2
	movdqu	192(%rdi), %xmm14
	xorl	%eax, %eax
//	cmpl	$8*16, %r9d
	cmpl	$128, %r9d
	jb	1f
//	leal	-8*16(%r9), %r11d
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm14, %xmm3
	pxor	%xmm14, %xmm4
	pxor	%xmm14, %xmm5
	pxor	%xmm14, %xmm6
	pxor	%xmm14, %xmm7
	pxor	%xmm14, %xmm8
	pxor	%xmm14, %xmm9
	pxor	%xmm14, %xmm10

// complete loop unrolling
	movdqu 176(%rdi), %xmm1
	movdqu 160(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 144(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 16(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
//	addl	$8*16, %eax
	addl	$128, %eax
	cmpl	%r11d, %eax
	jbe	2b
1:	cmpl	%eax, %r9d
	je	5f

	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11
	movdqu	160(%rdi), %xmm12
	movdqu	176(%rdi), %xmm13

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm14, %xmm1
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	xor	%eax, %eax
	ret
	.size intel_aes_decrypt_ecb_192, .-intel_aes_decrypt_ecb_192


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_encrypt_cbc_192,@function
	.globl intel_aes_encrypt_cbc_192
	.align	16
intel_aes_encrypt_cbc_192:
	testl	%r9d, %r9d
	je	2f

//	leaq	IV_OFFSET(%rdi), %rdx
	leaq	256(%rdi), %rdx

	movdqu	(%rdx), %xmm0
	movdqu	(%rdi), %xmm2
	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11
	movdqu	160(%rdi), %xmm12
	movdqu	176(%rdi), %xmm13
	movdqu	192(%rdi), %xmm14

	xorl	%eax, %eax
1:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm0, %xmm1
	pxor	%xmm2, %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xce	/* aesenclast %xmm14, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	movdqa	%xmm1, %xmm0
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	1b

	movdqu	%xmm0, (%rdx)

2:	xor	%eax, %eax
	ret
	.size intel_aes_encrypt_cbc_192, .-intel_aes_encrypt_cbc_192


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %exx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_decrypt_cbc_192,@function
	.globl intel_aes_decrypt_cbc_192
	.align	16
intel_aes_decrypt_cbc_192:
//	leaq	IV_OFFSET(%rdi), %rdx
	leaq	256(%rdi), %rdx

	movdqu	(%rdx), %xmm0
	movdqu	(%rdi), %xmm2
	movdqu	192(%rdi), %xmm14
	xorl	%eax, %eax
	cmpl	$128, %r9d
	jb	1f
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm14, %xmm3
	pxor	%xmm14, %xmm4
	pxor	%xmm14, %xmm5
	pxor	%xmm14, %xmm6
	pxor	%xmm14, %xmm7
	pxor	%xmm14, %xmm8
	pxor	%xmm14, %xmm9
	pxor	%xmm14, %xmm10

// complete loop unrolling
	movdqu 176(%rdi), %xmm1
	movdqu 160(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 144(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 16(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */

 	pxor	%xmm0, %xmm3
	movdqu	(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm4
	movdqu	16(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm5
	movdqu	32(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm6
	movdqu	48(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm7
	movdqu	64(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm8
	movdqu	80(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm9
	movdqu	96(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm10
	movdqu	112(%r8, %rax), %xmm0
	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
	addl	$128, %eax
	cmpl	%r11d, %eax
	jbe	2b
1:	cmpl	%eax, %r9d
	je	5f

	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11
	movdqu	160(%rdi), %xmm12
	movdqu	176(%rdi), %xmm13

4:	movdqu	(%r8, %rax), %xmm1
	movdqa	%xmm1, %xmm15
	pxor	%xmm14, %xmm1
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
	pxor	%xmm0, %xmm1
	movdqu	%xmm1, (%rsi, %rax)
	movdqa	%xmm15, %xmm0
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	movdqu	%xmm0, (%rdx)

	xor	%eax, %eax
	ret
	.size intel_aes_decrypt_cbc_192, .-intel_aes_decrypt_cbc_192

/* in %rdi : the key
   in %rsi : buffer for expanded key
*/
	.type intel_aes_encrypt_init_256,@function
	.globl intel_aes_encrypt_init_256
	.align	16
intel_aes_encrypt_init_256:
	movdqu	(%rdi), %xmm1
	movdqu	16(%rdi), %xmm3
	movdqu	%xmm1, (%rsi)
	movdqu	%xmm3, 16(%rsi)
	leaq	32(%rsi), %rsi
	xor	%eax, %eax

	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
	pxor	%xmm6, %xmm6
	pshufd	$0xff, %xmm2, %xmm2
	shufps	$0x10, %xmm1, %xmm6
	pxor	%xmm6, %xmm1
	shufps	$0x8c, %xmm1, %xmm6
	pxor	%xmm2, %xmm1
	pxor	%xmm6, %xmm1
	movdqu	%xmm1, (%rsi)

	ret
	.size intel_aes_encrypt_init_256, .-intel_aes_encrypt_init_256


/* in %rdi : the key
   in %rsi : buffer for expanded key
*/
	.type intel_aes_decrypt_init_256,@function
	.globl intel_aes_decrypt_init_256
	.align	16
intel_aes_decrypt_init_256:
	movdqu	(%rdi), %xmm1
	movdqu	16(%rdi), %xmm3
	movdqu	%xmm1, (%rsi)
	.byte 0x66,0x0f,0x38,0xdb,0xe3	/* aesimc	%xmm3, %xmm4 */
	movdqu	%xmm4, 16(%rsi)
	leaq	32(%rsi), %rsi
	xor	%eax, %eax

	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
	pxor	%xmm6, %xmm6
	pshufd	$0xff, %xmm2, %xmm2
	shufps	$0x10, %xmm1, %xmm6
	pxor	%xmm6, %xmm1
	shufps	$0x8c, %xmm1, %xmm6
	pxor	%xmm2, %xmm1
	pxor	%xmm6, %xmm1
	movdqu	%xmm1, (%rsi)

	ret
	.size intel_aes_decrypt_init_256, .-intel_aes_decrypt_init_256


	.type key_expansion256,@function
	.align	16
key_expansion256:
	movd	%eax, %xmm6
	pshufd	$0xff, %xmm2, %xmm2
	shufps	$0x10, %xmm1, %xmm6
	pxor	%xmm6, %xmm1
	shufps	$0x8c, %xmm1, %xmm6
	pxor	%xmm2, %xmm1
	pxor	%xmm6, %xmm1
	movdqu	%xmm1, (%rsi)

	addq	$16, %rsi
	.byte 0x66,0x0f,0x3a,0xdf,0xe1,0x00	/* aeskeygenassist $0, %xmm1, %xmm4 */
	pshufd	$0xaa, %xmm4, %xmm4
	shufps	$0x10, %xmm3, %xmm6
	pxor	%xmm6, %xmm3
	shufps	$0x8c, %xmm3, %xmm6
	pxor	%xmm4, %xmm3
	pxor	%xmm6, %xmm3
	movdqu	%xmm3, (%rsi)
	addq	$16, %rsi
	ret
	.size key_expansion256, .-key_expansion256


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_encrypt_ecb_256,@function
	.globl intel_aes_encrypt_ecb_256
	.align	16
intel_aes_encrypt_ecb_256:
	movdqu	(%rdi), %xmm2
	movdqu	224(%rdi), %xmm15
	xorl	%eax, %eax
//	cmpl	$8*16, %r9d
	cmpl	$128, %r9d
	jb	1f
//	leal	-8*16(%r9), %r11d
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm2, %xmm3
	pxor	%xmm2, %xmm4
	pxor	%xmm2, %xmm5
	pxor	%xmm2, %xmm6
	pxor	%xmm2, %xmm7
	pxor	%xmm2, %xmm8
	pxor	%xmm2, %xmm9
	pxor	%xmm2, %xmm10

// complete loop unrolling
	movdqu 16(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 144(%rdi), %xmm1
	movdqu 160(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 176(%rdi), %xmm1
	movdqu 192(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 208(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xdf	/* aesenclast 	%xmm15, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe7	/* aesenclast 	%xmm15, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xef	/* aesenclast 	%xmm15, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf7	/* aesenclast 	%xmm15, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xff	/* aesenclast 	%xmm15, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc7	/* aesenclast 	%xmm15, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xcf	/* aesenclast 	%xmm15, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd7	/* aesenclast 	%xmm15, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
//	addl	$8*16, %eax
	addl	$128, %eax
	cmpl	%r11d, %eax
	jbe	2b
1:	cmpl	%eax, %r9d
	je	5f

	movdqu	(%rdi), %xmm8
	movdqu	16(%rdi), %xmm2
	movdqu	32(%rdi), %xmm3
	movdqu	48(%rdi), %xmm4
	movdqu	64(%rdi), %xmm5
	movdqu	80(%rdi), %xmm6
	movdqu	96(%rdi), %xmm7
	movdqu	128(%rdi), %xmm9
	movdqu	144(%rdi), %xmm10
	movdqu	160(%rdi), %xmm11
	movdqu	176(%rdi), %xmm12
	movdqu	192(%rdi), %xmm13
	movdqu	208(%rdi), %xmm14

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm8, %xmm1
	movdqu	112(%rdi), %xmm8
	.byte 0x66,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm2, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
	movdqu	(%rdi), %xmm8
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm14, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* aesenclast %xmm15, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	xor	%eax, %eax
	ret
	.size intel_aes_encrypt_ecb_256, .-intel_aes_encrypt_ecb_256


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_decrypt_ecb_256,@function
	.globl intel_aes_decrypt_ecb_256
	.align	16
intel_aes_decrypt_ecb_256:
	movdqu	(%rdi), %xmm2
	movdqu	224(%rdi), %xmm15
	xorl	%eax, %eax
//	cmpl	$8*16, %r9d
	cmpl	$128, %r9d
	jb	1f
//	leal	-8*16(%r9), %r11d
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm15, %xmm3
	pxor	%xmm15, %xmm4
	pxor	%xmm15, %xmm5
	pxor	%xmm15, %xmm6
	pxor	%xmm15, %xmm7
	pxor	%xmm15, %xmm8
	pxor	%xmm15, %xmm9
	pxor	%xmm15, %xmm10

// complete loop unrolling
	movdqu 208(%rdi), %xmm1
	movdqu 192(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 176(%rdi), %xmm1
	movdqu 160(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 144(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 16(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
//	addl	$8*16, %eax
	addl	$128, %eax
	cmpl	%r11d, %eax
	jbe	2b
1:	cmpl	%eax, %r9d
	je	5f

	movdqu	16(%rdi), %xmm2
	movdqu	32(%rdi), %xmm3
	movdqu	48(%rdi), %xmm4
	movdqu	64(%rdi), %xmm5
	movdqu	80(%rdi), %xmm6
	movdqu	96(%rdi), %xmm7
	movdqu	112(%rdi), %xmm8
	movdqu	128(%rdi), %xmm9
	movdqu	144(%rdi), %xmm10
	movdqu	160(%rdi), %xmm11
	movdqu	176(%rdi), %xmm12
	movdqu	192(%rdi), %xmm13
	movdqu	208(%rdi), %xmm14

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm15, %xmm1
	.byte 0x66,0x41,0x0f,0x38,0xde,0xce	/* aesdec	%xmm14, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
	movdqu	(%rdi), %xmm8
	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xca	/* aesdec	%xmm2, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast %xmm8, %xmm1 */
	movdqu	112(%rdi), %xmm8
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	xor	%eax, %eax
	ret
	.size intel_aes_decrypt_ecb_256, .-intel_aes_decrypt_ecb_256


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_encrypt_cbc_256,@function
	.globl intel_aes_encrypt_cbc_256
	.align	16
intel_aes_encrypt_cbc_256:
	testl	%r9d, %r9d
	je	2f

//	leaq	IV_OFFSET(%rdi), %rdx
	leaq	256(%rdi), %rdx

	movdqu	(%rdx), %xmm0
	movdqu	(%rdi), %xmm8
	movdqu	16(%rdi), %xmm2
	movdqu	32(%rdi), %xmm3
	movdqu	48(%rdi), %xmm4
	movdqu	64(%rdi), %xmm5
	movdqu	80(%rdi), %xmm6
	movdqu	96(%rdi), %xmm7
	movdqu	128(%rdi), %xmm9
	movdqu	144(%rdi), %xmm10
	movdqu	160(%rdi), %xmm11
	movdqu	176(%rdi), %xmm12
	movdqu	192(%rdi), %xmm13
	movdqu	208(%rdi), %xmm14
	movdqu	224(%rdi), %xmm15

	xorl	%eax, %eax
1:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm0, %xmm1
	pxor	%xmm8, %xmm1
	movdqu	112(%rdi), %xmm8
	.byte 0x66,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm2, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
	movdqu	(%rdi), %xmm8
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm14, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* aesenclast %xmm15, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	movdqa	%xmm1, %xmm0
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	1b

	movdqu	%xmm0, (%rdx)

2:	xor	%eax, %eax
	ret
	.size intel_aes_encrypt_cbc_256, .-intel_aes_encrypt_cbc_256


/* in %rdi : cx - context
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
*/
	.type intel_aes_decrypt_cbc_256,@function
	.globl intel_aes_decrypt_cbc_256
	.align	16
intel_aes_decrypt_cbc_256:
//	leaq	IV_OFFSET(%rdi), %rdx
	leaq	256(%rdi), %rdx

	movdqu	(%rdx), %xmm0
	movdqu	(%rdi), %xmm2
	movdqu	224(%rdi), %xmm15
	xorl	%eax, %eax
//	cmpl	$8*16, %r9d
	cmpl	$128, %r9d
	jb	1f
//	leal	-8*16(%r9), %r11d
	leal	-128(%r9), %r11d
2:	movdqu  (%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm15, %xmm3
	pxor	%xmm15, %xmm4
	pxor	%xmm15, %xmm5
	pxor	%xmm15, %xmm6
	pxor	%xmm15, %xmm7
	pxor	%xmm15, %xmm8
	pxor	%xmm15, %xmm9
	pxor	%xmm15, %xmm10

// complete loop unrolling
	movdqu 208(%rdi), %xmm1
	movdqu 192(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 176(%rdi), %xmm1
	movdqu 160(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 144(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 16(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */

 	pxor	%xmm0, %xmm3
	movdqu	(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm4
	movdqu	16(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm5
	movdqu	32(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm6
	movdqu	48(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm7
	movdqu	64(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm8
	movdqu	80(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm9
	movdqu	96(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm10
	movdqu	112(%r8, %rax), %xmm0
	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
//	addl	$8*16, %eax
	addl	$128, %eax
	cmpl	%r11d, %eax
	jbe	2b
1:	cmpl	%eax, %r9d
	je	5f

	movdqu	16(%rdi), %xmm2
	movdqu	32(%rdi), %xmm3
	movdqu	48(%rdi), %xmm4
	movdqu	64(%rdi), %xmm5
	movdqu	80(%rdi), %xmm6
	movdqu	96(%rdi), %xmm7
	movdqu	112(%rdi), %xmm8
	movdqu	128(%rdi), %xmm9
	movdqu	144(%rdi), %xmm10
	movdqu	160(%rdi), %xmm11
	movdqu	176(%rdi), %xmm12
	movdqu	192(%rdi), %xmm13
	movdqu	208(%rdi), %xmm14

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm15, %xmm1
	.byte 0x66,0x41,0x0f,0x38,0xde,0xce	/* aesdec	%xmm14, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
	movdqu	(%rdi), %xmm8
	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xca	/* aesdec	%xmm2, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast %xmm8, %xmm1 */
	movdqu	112(%rdi), %xmm8
	pxor	%xmm0, %xmm1
	movdqu	(%r8, %rax), %xmm0  /* fetch the IV before we store the block */
	movdqu	%xmm1, (%rsi, %rax) /* in case input buf = output buf */
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	movdqu	%xmm0, (%rdx)

	xor	%eax, %eax
	ret
	.size intel_aes_decrypt_cbc_256, .-intel_aes_decrypt_cbc_256