summaryrefslogtreecommitdiffstats
path: root/security/nss/lib/freebl/mpi/mpi_sse2.s
diff options
context:
space:
mode:
Diffstat (limited to 'security/nss/lib/freebl/mpi/mpi_sse2.s')
-rw-r--r--security/nss/lib/freebl/mpi/mpi_sse2.s294
1 files changed, 294 insertions, 0 deletions
diff --git a/security/nss/lib/freebl/mpi/mpi_sse2.s b/security/nss/lib/freebl/mpi/mpi_sse2.s
new file mode 100644
index 000000000..16a47019c
--- /dev/null
+++ b/security/nss/lib/freebl/mpi/mpi_sse2.s
@@ -0,0 +1,294 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# Symbol naming and typing, selected by the DARWIN preprocessor symbol.
+# With DARWIN defined, every exported routine name is remapped to the same
+# name with a leading underscore and TYPE_FUNCTION expands to nothing.
+# Without it, TYPE_FUNCTION emits a .type directive that marks the symbol
+# as a function.
+#ifdef DARWIN
+#define s_mpv_mul_d _s_mpv_mul_d
+#define s_mpv_mul_d_add _s_mpv_mul_d_add
+#define s_mpv_mul_d_add_prop _s_mpv_mul_d_add_prop
+#define s_mpv_sqr_add_prop _s_mpv_sqr_add_prop
+#define s_mpv_div_2dx1d _s_mpv_div_2dx1d
+#define TYPE_FUNCTION(x)
+#else
+#define TYPE_FUNCTION(x) .type x, @function
+#endif
+
+.text
+
+ # s_mpv_mul_d: multiply the a_len-word vector a by the single 32-bit
+ # word b and store the (a_len + 1)-word product at c.
+ #   c[i]     = low 32 bits of (a[i] * b + carry), for i in [0, a_len)
+ #   c[a_len] = final carry
+ # With a_len == 0 the only store is c[0] = 0.
+ #
+ # Stack frame (all arguments are passed on the stack):
+ #   ebp - 8:  saved esi
+ #   ebp - 4:  saved edi
+ #   ebp + 0:  saved ebp
+ #   ebp + 4:  return address
+ #   ebp + 8:  a argument
+ #   ebp + 12: a_len argument
+ #   ebp + 16: b argument
+ #   ebp + 20: c argument
+ # Registers:
+ #   ecx: words of a still to process
+ #   esi: a ptr
+ #   edi: c ptr
+ #   mm0: current 64-bit product
+ #   mm1: b
+ #   mm2: running carry (at most 32 significant bits between iterations)
+ # Clobbers ecx, mm0-mm2 and the flags. emms is executed before returning.
+.globl s_mpv_mul_d
+.private_extern s_mpv_mul_d
+TYPE_FUNCTION(s_mpv_mul_d)
+s_mpv_mul_d:
+ push %ebp
+ mov %esp, %ebp
+ push %edi
+ push %esi
+ pxor %mm2, %mm2          # carry = 0
+ mov 12(%ebp), %ecx       # ecx = a_len
+ movd 16(%ebp), %mm1      # mm1 = b
+ mov 20(%ebp), %edi       # edi = c
+ test %ecx, %ecx
+ jz 2f                    # a_len == 0: just store the zero carry
+ mov 8(%ebp), %esi        # esi = a
+1:
+ movd 0(%esi), %mm0       # mm0 = *a
+ add $4, %esi             # a++
+ pmuludq %mm1, %mm0       # mm0 = b * a[i], full 64-bit product
+ paddq %mm0, %mm2         # product + carry; cannot exceed 64 bits
+ movd %mm2, 0(%edi)       # *c = low 32 bits
+ add $4, %edi             # c++
+ psrlq $32, %mm2          # carry = high 32 bits
+ dec %ecx                 # --a_len
+ jnz 1b                   # loop while words remain
+2:
+ movd %mm2, 0(%edi)       # *c = final carry
+ emms                     # leave MMX state so x87 code can run again
+ pop %esi
+ pop %edi
+ leave
+ ret
+ nop
+
+ # s_mpv_mul_d_add: multiply the a_len-word vector a by the single 32-bit
+ # word b and add the product into c.
+ #   c[i]     = low 32 bits of (a[i] * b + c[i] + carry), i in [0, a_len)
+ #   c[a_len] = final carry (stored, not added to the previous contents)
+ # With a_len == 0 the only store is c[0] = 0.
+ #
+ # Stack frame (all arguments are passed on the stack):
+ #   ebp - 8:  saved esi
+ #   ebp - 4:  saved edi
+ #   ebp + 0:  saved ebp
+ #   ebp + 4:  return address
+ #   ebp + 8:  a argument
+ #   ebp + 12: a_len argument
+ #   ebp + 16: b argument
+ #   ebp + 20: c argument
+ # Registers:
+ #   ecx: words of a still to process
+ #   esi: a ptr
+ #   edi: c ptr
+ #   mm0: current product, then the current word of c
+ #   mm1: b
+ #   mm2: running carry (at most 32 significant bits between iterations)
+ # Clobbers ecx, mm0-mm2 and the flags. emms is executed before returning.
+.globl s_mpv_mul_d_add
+.private_extern s_mpv_mul_d_add
+TYPE_FUNCTION(s_mpv_mul_d_add)
+s_mpv_mul_d_add:
+ push %ebp
+ mov %esp, %ebp
+ push %edi
+ push %esi
+ pxor %mm2, %mm2          # carry = 0
+ mov 12(%ebp), %ecx       # ecx = a_len
+ movd 16(%ebp), %mm1      # mm1 = b
+ mov 20(%ebp), %edi       # edi = c
+ test %ecx, %ecx
+ jz 2f                    # a_len == 0: just store the zero carry
+ mov 8(%ebp), %esi        # esi = a
+1:
+ movd 0(%esi), %mm0       # mm0 = *a
+ add $4, %esi             # a++
+ pmuludq %mm1, %mm0       # mm0 = b * a[i], full 64-bit product
+ paddq %mm0, %mm2         # carry += product
+ movd 0(%edi), %mm0       # mm0 = *c (zero-extended)
+ paddq %mm0, %mm2         # add the existing word of c; still fits 64 bits
+ movd %mm2, 0(%edi)       # *c = low 32 bits
+ add $4, %edi             # c++
+ psrlq $32, %mm2          # carry = high 32 bits
+ dec %ecx                 # --a_len
+ jnz 1b                   # loop while words remain
+2:
+ movd %mm2, 0(%edi)       # *c = final carry
+ emms                     # leave MMX state so x87 code can run again
+ pop %esi
+ pop %edi
+ leave
+ ret
+ nop
+
+ # s_mpv_mul_d_add_prop: multiply the a_len-word vector a by the single
+ # 32-bit word b, add the product into c, and propagate the final carry.
+ #   c[i] = low 32 bits of (a[i] * b + c[i] + carry), i in [0, a_len)
+ # The carry left after the loop is then added to c[a_len]; while that
+ # addition carries out, 1 is added to the next higher word of c.
+ # The propagation has no length check: it stops only when an addition
+ # does not carry, so the caller must provide enough words in c.
+ # With a_len == 0 nothing is read or written.
+ #
+ # Stack frame (all arguments are passed on the stack):
+ #   ebp - 8:  saved esi
+ #   ebp - 4:  saved edi
+ #   ebp + 0:  saved ebp
+ #   ebp + 4:  return address
+ #   ebp + 8:  a argument
+ #   ebp + 12: a_len argument
+ #   ebp + 16: b argument
+ #   ebp + 20: c argument
+ # Registers:
+ #   eax: carry out of the multiply loop
+ #   ecx: words of a still to process
+ #   esi: a ptr
+ #   edi: c ptr
+ #   mm0: current product, mm1: b, mm2: running carry, mm3: word of c
+ # Clobbers eax, ecx, mm0-mm3 and the flags. emms is executed before
+ # returning.
+.globl s_mpv_mul_d_add_prop
+.private_extern s_mpv_mul_d_add_prop
+TYPE_FUNCTION(s_mpv_mul_d_add_prop)
+s_mpv_mul_d_add_prop:
+ push %ebp
+ mov %esp, %ebp
+ push %edi
+ push %esi
+ pxor %mm2, %mm2          # carry = 0
+ mov 12(%ebp), %ecx       # ecx = a_len
+ movd 16(%ebp), %mm1      # mm1 = b
+ mov 20(%ebp), %edi       # edi = c
+ test %ecx, %ecx
+ jz 4f                    # a_len == 0: no product, so no carry to add
+ mov 8(%ebp), %esi        # esi = a
+1:
+ movd 0(%esi), %mm0       # mm0 = *a
+ movd 0(%edi), %mm3       # mm3 = *c (zero-extended)
+ add $4, %esi             # a++
+ pmuludq %mm1, %mm0       # mm0 = b * a[i], full 64-bit product
+ paddq %mm0, %mm2         # carry += product
+ paddq %mm3, %mm2         # add the existing word of c; still fits 64 bits
+ movd %mm2, 0(%edi)       # *c = low 32 bits
+ add $4, %edi             # c++
+ psrlq $32, %mm2          # carry = high 32 bits
+ dec %ecx                 # --a_len
+ jnz 1b                   # loop while words remain
+ movd %mm2, %eax          # eax = carry left by the multiply loop
+ test %eax, %eax
+ jz 4f                    # zero carry: c[a_len] is left untouched
+ add %eax, 0(%edi)        # c[a_len] += carry
+ jnc 4f                   # absorbed without a carry out
+3:
+ add $4, %edi             # step to the next higher word of c
+ addl $1, 0(%edi)         # ripple the carry bit into it
+ jc 3b                    # keep going while the word wraps to zero
+4:
+ emms                     # leave MMX state so x87 code can run again
+ pop %esi
+ pop %edi
+ leave
+ ret
+ nop
+
+ # s_mpv_sqr_add_prop: add the square of every word of pa into ps and
+ # propagate the final carry.
+ # For each i in [0, a_len) the 64-bit value pa[i] * pa[i], plus the
+ # incoming carry, is added to the word pair ps[2i], ps[2i + 1].
+ # The carry left after the loop is then added to ps[2 * a_len]; while
+ # that addition carries out, 1 is added to the next higher word of ps.
+ # The propagation has no length check: it stops only when an addition
+ # does not carry, so the caller must provide enough words in ps.
+ # With a_len == 0 nothing is read or written.
+ #
+ # Stack frame (all arguments are passed on the stack):
+ #   ebp - 8:  saved esi
+ #   ebp - 4:  saved edi
+ #   ebp + 0:  saved ebp
+ #   ebp + 4:  return address
+ #   ebp + 8:  pa argument
+ #   ebp + 12: a_len argument
+ #   ebp + 16: ps argument
+ # Registers:
+ #   eax: carry out of the squaring loop
+ #   ecx: words of pa still to process
+ #   esi: pa ptr
+ #   edi: ps ptr
+ #   mm0: current square, mm2: running carry, mm3: word of ps
+ # Clobbers eax, ecx, mm0, mm2, mm3 and the flags. emms is executed
+ # before returning.
+.globl s_mpv_sqr_add_prop
+.private_extern s_mpv_sqr_add_prop
+TYPE_FUNCTION(s_mpv_sqr_add_prop)
+s_mpv_sqr_add_prop:
+ push %ebp
+ mov %esp, %ebp
+ push %edi
+ push %esi
+ pxor %mm2, %mm2          # carry = 0
+ mov 12(%ebp), %ecx       # ecx = a_len
+ mov 16(%ebp), %edi       # edi = ps
+ test %ecx, %ecx
+ jz 4f                    # a_len == 0: no squares, so no carry to add
+ mov 8(%ebp), %esi        # esi = pa
+1:
+ movd 0(%esi), %mm0       # mm0 = *pa
+ movd 0(%edi), %mm3       # mm3 = low word of the ps pair
+ add $4, %esi             # pa++
+ pmuludq %mm0, %mm0       # mm0 = pa[i] * pa[i], full 64-bit square
+ paddq %mm0, %mm2         # carry += square
+ paddq %mm3, %mm2         # add the low word; still fits 64 bits
+ movd 4(%edi), %mm3       # mm3 = high word of the ps pair
+ movd %mm2, 0(%edi)       # store the new low word
+ psrlq $32, %mm2          # keep the upper half of the sum
+ paddq %mm3, %mm2         # add the high word
+ movd %mm2, 4(%edi)       # store the new high word
+ psrlq $32, %mm2          # carry into the next pair
+ add $8, %edi             # ps += 2
+ dec %ecx                 # --a_len
+ jnz 1b                   # loop while words remain
+ movd %mm2, %eax          # eax = carry left by the squaring loop
+ test %eax, %eax
+ jz 4f                    # zero carry: ps[2 * a_len] is left untouched
+ add %eax, 0(%edi)        # ps[2 * a_len] += carry
+ jnc 4f                   # absorbed without a carry out
+3:
+ add $4, %edi             # step to the next higher word of ps
+ addl $1, 0(%edi)         # ripple the carry bit into it
+ jc 3b                    # keep going while the word wraps to zero
+4:
+ emms                     # leave MMX state so x87 code can run again
+ pop %esi
+ pop %edi
+ leave
+ ret
+ nop
+
+ #
+ # Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
+ # so its high bit is 1. This code is from NSPR.
+ #
+ # mp_err s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
+ # mp_digit *qp, mp_digit *rp)
+ #
+ # Stores the 32-bit quotient at *qp and the remainder at *rp, then
+ # returns zero. Nothing is validated here: the div instruction raises a
+ # divide error when the divisor is zero or when the quotient does not
+ # fit in 32 bits (Nhi >= divisor), so the caller must rule both out.
+
+ # Stack on entry (no frame is built, nothing is pushed):
+ #   esp + 0:  return address
+ #   esp + 4:  Nhi argument
+ #   esp + 8:  Nlo argument
+ #   esp + 12: divisor argument
+ #   esp + 16: qp argument
+ #   esp + 20: rp argument
+ # Registers:
+ #   edx: Nhi, then the remainder
+ #   eax: Nlo, then the quotient, then the zero return value
+ #   ecx: qp, then rp
+ # Clobbers eax, ecx, edx and the flags.
+ #
+.globl s_mpv_div_2dx1d
+.private_extern s_mpv_div_2dx1d
+TYPE_FUNCTION(s_mpv_div_2dx1d)
+s_mpv_div_2dx1d:
+ mov 4(%esp), %edx        # edx = Nhi
+ mov 8(%esp), %eax        # eax = Nlo
+ divl 12(%esp)            # edx:eax / divisor -> eax = quot, edx = rem
+ mov 16(%esp), %ecx       # ecx = qp
+ mov %eax, 0(%ecx)        # *qp = quotient
+ mov 20(%esp), %ecx       # ecx = rp
+ mov %edx, 0(%ecx)        # *rp = remainder
+ xor %eax, %eax           # return zero
+ ret
+ nop
+
+#ifndef DARWIN
+ # Magic indicating no need for an executable stack.
+ # Only emitted when DARWIN is not defined: an empty .note.GNU-stack
+ # section is declared, then .previous switches back to the section that
+ # was active before it.
+.section .note.GNU-stack, "", @progbits
+.previous
+#endif