diff options
Diffstat (limited to 'security/nss/lib/freebl/arcfour-amd64-sun.s')
-rw-r--r-- | security/nss/lib/freebl/arcfour-amd64-sun.s | 84 |
1 files changed, 84 insertions, 0 deletions
/ This Source Code Form is subject to the terms of the Mozilla Public
/ License, v. 2.0. If a copy of the MPL was not distributed with this
/ file, You can obtain one at http://mozilla.org/MPL/2.0/.

/ File: security/nss/lib/freebl/arcfour-amd64-sun.s
/
/ ** ARCFOUR implementation optimized for AMD64.
/ **
/ ** The throughput achieved by this code is about 320 MBytes/sec, on
/ ** a 1.8 GHz AMD Opteron (rev C0) processor.

/ ----------------------------------------------------------------------
/ ARCFOUR(key, len, in, out)
/
/ Arguments, in the registers they are read from on entry:
/   %rdi  key  - cipher state: x at offset 0, y at offset 8, table d[]
/                starting at offset 16, one entry per 8-byte slot
/   %rsi  len  - number of bytes to process
/   %rdx  in   - source bytes
/   %rcx  out  - destination bytes
/
/ Every output byte is the input byte XORed with one keystream byte.
/ Full 8-byte blocks are handled first, then the remainder byte by byte.
/ On exit only the low byte of key->x and of key->y is written back.
/
/ %rbx and %rbp are saved on the stack and restored.  %rax, %rcx, %rdx,
/ %rsi, %rdi, %r8, %r9 and %r11 are overwritten.  No return value is
/ set up explicitly (NOTE(review): presumably declared void in C -
/ confirm against the prototype).
/
/ Register roles once setup is complete:
/   %rbp = d (table base)      %rcx = x, already advanced for next round
/   %rdx = y                   %rax = tx = d[x]
/   %rsi = in cursor           %rdi = out cursor
/   %r9  = in+len-8 (becomes in+len for the byte-wise tail)
/   %rbx = scratch (ty / val)  %r8  = keystream accumulator
/   %r11 = bytes left in the current 8-byte block
/
/ NOTE(review): the pointer comparisons below use signed conditions
/ (jl / jle); they are reproduced exactly as in the original.
/ ----------------------------------------------------------------------

.text
.align 16
.globl ARCFOUR
.type ARCFOUR,@function
ARCFOUR:
        pushq   %rbp
        pushq   %rbx
        movq    %rdi, %rbp              / rbp = key
        movq    %rsi, %rbx              / rbx = len
        movq    %rdx, %rsi              / rsi = in
        movq    %rcx, %rdi              / rdi = out
        movq    (%rbp), %rcx            / x = key->x
        movq    8(%rbp), %rdx           / y = key->y
        addq    $16, %rbp               / rbp = d = key->data
        incq    %rcx                    / advance x ...
        andq    $255, %rcx              / ... and wrap it to 0..255
        leaq    -8(%rbx,%rsi), %rbx     / rbx = in+len-8
        movq    %rbx, %r9               / keep a copy: rbx is scratch from here on
        movq    0(%rbp,%rcx,8), %rax    / tx = d[x]
        cmpq    %rsi, %rbx
        jl      .Ltail                  / no full block if in+len-8 < in

.Lblock_loop:
        addq    $8, %rsi                / step in past the block (addressed at -8 below)
        addq    $8, %rdi                / step out past the block

        / Collect the next 8 keystream bytes in %r8.
        movq    $8, %r11                / bytes still to generate
.Lstream_byte:
        addb    %al, %dl                / y += tx (8-bit wrap)
        movl    0(%rbp,%rdx,8), %ebx    / ty = d[y]
        movl    %ebx, 0(%rbp,%rcx,8)    / d[x] = ty
        addb    %al, %bl                / val = ty + tx (8-bit wrap)
        movl    %eax, 0(%rbp,%rdx,8)    / d[y] = tx
        incb    %cl                     / x++ for the next round
        movl    0(%rbp,%rcx,8), %eax    / tx = d[x] for the next round
        movb    0(%rbp,%rbx,8), %r8b    / low byte of r8 = d[val]
        decb    %r11b                   / ZF set once the 8th byte is in
        rorq    $8, %r8                 / rotate the new byte up; ror leaves ZF untouched
        jnz     .Lstream_byte

        / Combine the 8 keystream bytes with 8 input bytes.
        xorq    -8(%rsi), %r8
        cmpq    %r9, %rsi               / flags stay valid across the store below
        movq    %r8, -8(%rdi)
        jle     .Lblock_loop            / another full block while in <= in+len-8

.Ltail:
        addq    $8, %r9                 / r9 = in+len

        / Remaining bytes, one per iteration.
.Ltail_loop:
        cmpq    %rsi, %r9
        jle     .Ldone                  / stop once in+len <= in
        addb    %al, %dl                / y += tx (8-bit wrap)
        movl    0(%rbp,%rdx,8), %ebx    / ty = d[y]
        movl    %ebx, 0(%rbp,%rcx,8)    / d[x] = ty
        addb    %al, %bl                / val = ty + tx (8-bit wrap)
        movl    %eax, 0(%rbp,%rdx,8)    / d[y] = tx
        incb    %cl                     / x++ for the next round
        movl    0(%rbp,%rcx,8), %eax    / tx = d[x] for the next round
        movb    0(%rbp,%rbx,8), %r8b    / keystream byte = d[val]
        xorb    (%rsi), %r8b            / combine with one input byte
        movb    %r8b, (%rdi)
        incq    %rsi                    / in++
        incq    %rdi                    / out++
        jmp     .Ltail_loop

.Ldone:
        decq    %rcx                    / undo the look-ahead increment of x
        movb    %cl, -16(%rbp)          / key->x = x (low byte only)
        movb    %dl, -8(%rbp)           / key->y = y (low byte only)
        popq    %rbx
        popq    %rbp
        ret
.L_ARCFOUR_end:
.size ARCFOUR,.L_ARCFOUR_end-ARCFOUR