Diffstat (limited to 'security/nss/lib/freebl/mpi/mpi_x86.s')
-rw-r--r-- | security/nss/lib/freebl/mpi/mpi_x86.s | 541
1 file changed, 541 insertions, 0 deletions
diff --git a/security/nss/lib/freebl/mpi/mpi_x86.s b/security/nss/lib/freebl/mpi/mpi_x86.s
new file mode 100644
index 000000000..8f7e2130c
--- /dev/null
+++ b/security/nss/lib/freebl/mpi/mpi_x86.s
@@ -0,0 +1,541 @@
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+.data
+.align 4
+ #
+ # -1 means to call s_mpi_is_sse2 to determine if we support sse2
+ # instructions.
+ # 0 means to use x86 instructions
+ # 1 means to use sse2 instructions
+.type is_sse,@object
+.size is_sse,4
+is_sse: .long -1
+
+#
+# sigh, handle the difference between -fPIC and not PIC
+# default to pic, since this file seems to be exclusively
+# linux right now (solaris uses mpi_i86pc.s and windows uses
+# mpi_x86_asm.c)
+#
+.ifndef NO_PIC
+.macro GET var,reg
+ movl \var@GOTOFF(%ebx),\reg
+.endm
+.macro PUT reg,var
+ movl \reg,\var@GOTOFF(%ebx)
+.endm
+.else
+.macro GET var,reg
+ movl \var,\reg
+.endm
+.macro PUT reg,var
+ movl \reg,\var
+.endm
+.endif
+
+.text
+
+
+ # ebp - 36: caller's esi
+ # ebp - 32: caller's edi
+ # ebp - 28:
+ # ebp - 24:
+ # ebp - 20:
+ # ebp - 16:
+ # ebp - 12:
+ # ebp - 8:
+ # ebp - 4:
+ # ebp + 0: caller's ebp
+ # ebp + 4: return address
+ # ebp + 8: a argument
+ # ebp + 12: a_len argument
+ # ebp + 16: b argument
+ # ebp + 20: c argument
+ # registers:
+ # eax:
+ # ebx: carry
+ # ecx: a_len
+ # edx:
+ # esi: a ptr
+ # edi: c ptr
+.globl s_mpv_mul_d
+.type s_mpv_mul_d,@function
+s_mpv_mul_d:
+ GET is_sse,%eax
+ cmp $0,%eax
+ je s_mpv_mul_d_x86
+ jg s_mpv_mul_d_sse2
+ call s_mpi_is_sse2
+ PUT %eax,is_sse
+ cmp $0,%eax
+ jg s_mpv_mul_d_sse2
+s_mpv_mul_d_x86:
+ push %ebp
+ mov %esp,%ebp
+ sub $28,%esp
+ push %edi
+ push %esi
+ push %ebx
+ movl $0,%ebx # carry = 0
+ mov 12(%ebp),%ecx # ecx = a_len
+ mov 20(%ebp),%edi
+ cmp $0,%ecx
+ je 2f # jmp if a_len == 0
+ mov 8(%ebp),%esi # esi = a
+ cld
+1:
+ lodsl # eax = [ds:esi]; esi += 4
+ mov 16(%ebp),%edx # edx = b
+ mull %edx # edx:eax = Phi:Plo = a_i * b
+
+ add %ebx,%eax # add carry (%ebx) to edx:eax
+ adc $0,%edx
+ mov %edx,%ebx # high half of product becomes next carry
+
+ stosl # [es:edi] = ax; edi += 4;
+ dec %ecx # --a_len
+ jnz 1b # jmp if a_len != 0
+2:
+ mov %ebx,0(%edi) # *c = carry
+ pop %ebx
+ pop %esi
+ pop %edi
+ leave
+ ret
+ nop
+s_mpv_mul_d_sse2:
+ push %ebp
+ mov %esp,%ebp
+ push %edi
+ push %esi
+ psubq %mm2,%mm2 # carry = 0
+ mov 12(%ebp),%ecx # ecx = a_len
+ movd 16(%ebp),%mm1 # mm1 = b
+ mov 20(%ebp),%edi
+ cmp $0,%ecx
+ je 6f # jmp if a_len == 0
+ mov 8(%ebp),%esi # esi = a
+ cld
+5:
+ movd 0(%esi),%mm0 # mm0 = *a++
+ add $4,%esi
+ pmuludq %mm1,%mm0 # mm0 = b * *a++
+ paddq %mm0,%mm2 # add the carry
+ movd %mm2,0(%edi) # store the 32bit result
+ add $4,%edi
+ psrlq $32, %mm2 # save the carry
+ dec %ecx # --a_len
+ jnz 5b # jmp if a_len != 0
+6:
+ movd %mm2,0(%edi) # *c = carry
+ emms
+ pop %esi
+ pop %edi
+ leave
+ ret
+ nop
+
+ # ebp - 36: caller's esi
+ # ebp - 32: caller's edi
+ # ebp - 28:
+ # ebp - 24:
+ # ebp - 20:
+ # ebp - 16:
+ # ebp - 12:
+ # ebp - 8:
+ # ebp - 4:
+ # ebp + 0: caller's ebp
+ # ebp + 4: return address
+ # ebp + 8: a argument
+ # ebp + 12: a_len argument
+ # ebp + 16: b argument
+ # ebp + 20: c argument
+ # registers:
+ # eax:
+ # ebx: carry
+ # ecx: a_len
+ # edx:
+ # esi: a ptr
+ # edi: c ptr
+.globl s_mpv_mul_d_add
+.type s_mpv_mul_d_add,@function
+s_mpv_mul_d_add:
+ GET is_sse,%eax
+ cmp $0,%eax
+ je s_mpv_mul_d_add_x86
+ jg s_mpv_mul_d_add_sse2
+ call s_mpi_is_sse2 + PUT %eax,is_sse + cmp $0,%eax + jg s_mpv_mul_d_add_sse2 +s_mpv_mul_d_add_x86: + push %ebp + mov %esp,%ebp + sub $28,%esp + push %edi + push %esi + push %ebx + movl $0,%ebx # carry = 0 + mov 12(%ebp),%ecx # ecx = a_len + mov 20(%ebp),%edi + cmp $0,%ecx + je 11f # jmp if a_len == 0 + mov 8(%ebp),%esi # esi = a + cld +10: + lodsl # eax = [ds:esi]; esi += 4 + mov 16(%ebp),%edx # edx = b + mull %edx # edx:eax = Phi:Plo = a_i * b + + add %ebx,%eax # add carry (%ebx) to edx:eax + adc $0,%edx + mov 0(%edi),%ebx # add in current word from *c + add %ebx,%eax + adc $0,%edx + mov %edx,%ebx # high half of product becomes next carry + + stosl # [es:edi] = ax; edi += 4; + dec %ecx # --a_len + jnz 10b # jmp if a_len != 0 +11: + mov %ebx,0(%edi) # *c = carry + pop %ebx + pop %esi + pop %edi + leave + ret + nop +s_mpv_mul_d_add_sse2: + push %ebp + mov %esp,%ebp + push %edi + push %esi + psubq %mm2,%mm2 # carry = 0 + mov 12(%ebp),%ecx # ecx = a_len + movd 16(%ebp),%mm1 # mm1 = b + mov 20(%ebp),%edi + cmp $0,%ecx + je 16f # jmp if a_len == 0 + mov 8(%ebp),%esi # esi = a + cld +15: + movd 0(%esi),%mm0 # mm0 = *a++ + add $4,%esi + pmuludq %mm1,%mm0 # mm0 = b * *a++ + paddq %mm0,%mm2 # add the carry + movd 0(%edi),%mm0 + paddq %mm0,%mm2 # add the carry + movd %mm2,0(%edi) # store the 32bit result + add $4,%edi + psrlq $32, %mm2 # save the carry + dec %ecx # --a_len + jnz 15b # jmp if a_len != 0 +16: + movd %mm2,0(%edi) # *c = carry + emms + pop %esi + pop %edi + leave + ret + nop + + # ebp - 8: caller's esi + # ebp - 4: caller's edi + # ebp + 0: caller's ebp + # ebp + 4: return address + # ebp + 8: a argument + # ebp + 12: a_len argument + # ebp + 16: b argument + # ebp + 20: c argument + # registers: + # eax: + # ebx: carry + # ecx: a_len + # edx: + # esi: a ptr + # edi: c ptr +.globl s_mpv_mul_d_add_prop +.type s_mpv_mul_d_add_prop,@function +s_mpv_mul_d_add_prop: + GET is_sse,%eax + cmp $0,%eax + je s_mpv_mul_d_add_prop_x86 + jg s_mpv_mul_d_add_prop_sse2 + call s_mpi_is_sse2 + PUT %eax,is_sse + cmp $0,%eax + jg s_mpv_mul_d_add_prop_sse2 +s_mpv_mul_d_add_prop_x86: + push %ebp + mov %esp,%ebp + sub $28,%esp + push %edi + push %esi + push %ebx + movl $0,%ebx # carry = 0 + mov 12(%ebp),%ecx # ecx = a_len + mov 20(%ebp),%edi + cmp $0,%ecx + je 21f # jmp if a_len == 0 + cld + mov 8(%ebp),%esi # esi = a +20: + lodsl # eax = [ds:esi]; esi += 4 + mov 16(%ebp),%edx # edx = b + mull %edx # edx:eax = Phi:Plo = a_i * b + + add %ebx,%eax # add carry (%ebx) to edx:eax + adc $0,%edx + mov 0(%edi),%ebx # add in current word from *c + add %ebx,%eax + adc $0,%edx + mov %edx,%ebx # high half of product becomes next carry + + stosl # [es:edi] = ax; edi += 4; + dec %ecx # --a_len + jnz 20b # jmp if a_len != 0 +21: + cmp $0,%ebx # is carry zero? 
+ jz 23f + mov 0(%edi),%eax # add in current word from *c + add %ebx,%eax + stosl # [es:edi] = ax; edi += 4; + jnc 23f +22: + mov 0(%edi),%eax # add in current word from *c + adc $0,%eax + stosl # [es:edi] = ax; edi += 4; + jc 22b +23: + pop %ebx + pop %esi + pop %edi + leave + ret + nop +s_mpv_mul_d_add_prop_sse2: + push %ebp + mov %esp,%ebp + push %edi + push %esi + push %ebx + psubq %mm2,%mm2 # carry = 0 + mov 12(%ebp),%ecx # ecx = a_len + movd 16(%ebp),%mm1 # mm1 = b + mov 20(%ebp),%edi + cmp $0,%ecx + je 26f # jmp if a_len == 0 + mov 8(%ebp),%esi # esi = a + cld +25: + movd 0(%esi),%mm0 # mm0 = *a++ + movd 0(%edi),%mm3 # fetch the sum + add $4,%esi + pmuludq %mm1,%mm0 # mm0 = b * *a++ + paddq %mm0,%mm2 # add the carry + paddq %mm3,%mm2 # add *c++ + movd %mm2,0(%edi) # store the 32bit result + add $4,%edi + psrlq $32, %mm2 # save the carry + dec %ecx # --a_len + jnz 25b # jmp if a_len != 0 +26: + movd %mm2,%ebx + cmp $0,%ebx # is carry zero? + jz 28f + mov 0(%edi),%eax + add %ebx, %eax + stosl + jnc 28f +27: + mov 0(%edi),%eax # add in current word from *c + adc $0,%eax + stosl # [es:edi] = ax; edi += 4; + jc 27b +28: + emms + pop %ebx + pop %esi + pop %edi + leave + ret + nop + + + # ebp - 20: caller's esi + # ebp - 16: caller's edi + # ebp - 12: + # ebp - 8: carry + # ebp - 4: a_len local + # ebp + 0: caller's ebp + # ebp + 4: return address + # ebp + 8: pa argument + # ebp + 12: a_len argument + # ebp + 16: ps argument + # ebp + 20: + # registers: + # eax: + # ebx: carry + # ecx: a_len + # edx: + # esi: a ptr + # edi: c ptr + +.globl s_mpv_sqr_add_prop +.type s_mpv_sqr_add_prop,@function +s_mpv_sqr_add_prop: + GET is_sse,%eax + cmp $0,%eax + je s_mpv_sqr_add_prop_x86 + jg s_mpv_sqr_add_prop_sse2 + call s_mpi_is_sse2 + PUT %eax,is_sse + cmp $0,%eax + jg s_mpv_sqr_add_prop_sse2 +s_mpv_sqr_add_prop_x86: + push %ebp + mov %esp,%ebp + sub $12,%esp + push %edi + push %esi + push %ebx + movl $0,%ebx # carry = 0 + mov 12(%ebp),%ecx # a_len + mov 16(%ebp),%edi # edi = ps + cmp $0,%ecx + je 31f # jump if a_len == 0 + cld + mov 8(%ebp),%esi # esi = pa +30: + lodsl # %eax = [ds:si]; si += 4; + mull %eax + + add %ebx,%eax # add "carry" + adc $0,%edx + mov 0(%edi),%ebx + add %ebx,%eax # add low word from result + mov 4(%edi),%ebx + stosl # [es:di] = %eax; di += 4; + adc %ebx,%edx # add high word from result + movl $0,%ebx + mov %edx,%eax + adc $0,%ebx + stosl # [es:di] = %eax; di += 4; + dec %ecx # --a_len + jnz 30b # jmp if a_len != 0 +31: + cmp $0,%ebx # is carry zero? + jz 34f + mov 0(%edi),%eax # add in current word from *c + add %ebx,%eax + stosl # [es:edi] = ax; edi += 4; + jnc 34f +32: + mov 0(%edi),%eax # add in current word from *c + adc $0,%eax + stosl # [es:edi] = ax; edi += 4; + jc 32b +34: + pop %ebx + pop %esi + pop %edi + leave + ret + nop +s_mpv_sqr_add_prop_sse2: + push %ebp + mov %esp,%ebp + push %edi + push %esi + push %ebx + psubq %mm2,%mm2 # carry = 0 + mov 12(%ebp),%ecx # ecx = a_len + mov 16(%ebp),%edi + cmp $0,%ecx + je 36f # jmp if a_len == 0 + mov 8(%ebp),%esi # esi = a + cld +35: + movd 0(%esi),%mm0 # mm0 = *a + movd 0(%edi),%mm3 # fetch the sum + add $4,%esi + pmuludq %mm0,%mm0 # mm0 = sqr(a) + paddq %mm0,%mm2 # add the carry + paddq %mm3,%mm2 # add the low word + movd 4(%edi),%mm3 + movd %mm2,0(%edi) # store the 32bit result + psrlq $32, %mm2 + paddq %mm3,%mm2 # add the high word + movd %mm2,4(%edi) # store the 32bit result + psrlq $32, %mm2 # save the carry. 
+ add $8,%edi
+ dec %ecx # --a_len
+ jnz 35b # jmp if a_len != 0
+36:
+ movd %mm2,%ebx
+ cmp $0,%ebx # is carry zero?
+ jz 38f
+ mov 0(%edi),%eax
+ add %ebx, %eax
+ stosl
+ jnc 38f
+37:
+ mov 0(%edi),%eax # add in current word from *c
+ adc $0,%eax
+ stosl # [es:edi] = ax; edi += 4;
+ jc 37b
+38:
+ emms
+ pop %ebx
+ pop %esi
+ pop %edi
+ leave
+ ret
+ nop
+
+ #
+ # Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
+ # so its high bit is 1. This code is from NSPR.
+ #
+ # mp_err s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
+ #                        mp_digit *qp, mp_digit *rp)
+
+ # esp + 0: caller's ebx
+ # esp + 4: return address
+ # esp + 8: Nhi argument
+ # esp + 12: Nlo argument
+ # esp + 16: divisor argument
+ # esp + 20: qp argument
+ # esp + 24: rp argument
+ # registers:
+ # eax: Nlo, then the quotient
+ # ebx: divisor, then qp, then rp
+ # ecx:
+ # edx: Nhi, then the remainder
+ # esi:
+ # edi:
+ #
+
+.globl s_mpv_div_2dx1d
+.type s_mpv_div_2dx1d,@function
+s_mpv_div_2dx1d:
+ push %ebx
+ mov 8(%esp),%edx # edx = Nhi
+ mov 12(%esp),%eax # eax = Nlo
+ mov 16(%esp),%ebx # ebx = divisor
+ div %ebx # edx:eax / ebx: quotient in eax, remainder in edx
+ mov 20(%esp),%ebx # ebx = qp
+ mov %eax,0(%ebx) # *qp = quotient
+ mov 24(%esp),%ebx # ebx = rp
+ mov %edx,0(%ebx) # *rp = remainder
+ xor %eax,%eax # return zero (MP_OKAY)
+ pop %ebx
+ ret
+ nop
+
+ # Magic indicating no need for an executable stack
+.section .note.GNU-stack, "", @progbits
+.previous
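
For readers following the assembly, the inner loops of s_mpv_mul_d, s_mpv_mul_d_add, s_mpv_mul_d_add_prop and s_mpv_sqr_add_prop all implement the same word-by-word multiply-accumulate with a running carry. The C below is only an illustrative restatement of what those loops compute, not code taken from NSS: the ref_* names are invented for this sketch, and mp_digit/mp_word are assumed to be 32- and 64-bit unsigned types, matching this 32-bit x86 build.

#include <stdint.h>

typedef uint32_t mp_digit; /* one 32-bit limb, as handled by lodsl/stosl above */
typedef uint64_t mp_word;  /* a double-width product, like edx:eax or an MMX register */

/* c[0..a_len] = a[0..a_len-1] * b; the final carry lands in c[a_len],
 * mirroring the "mov %ebx,0(%edi)" at label 2 of s_mpv_mul_d. */
void ref_mpv_mul_d(const mp_digit *a, unsigned a_len, mp_digit b, mp_digit *c)
{
    mp_digit carry = 0;
    for (unsigned i = 0; i < a_len; i++) {
        mp_word p = (mp_word)a[i] * b + carry; /* mull: edx:eax = a_i * b, then add carry */
        c[i] = (mp_digit)p;                    /* stosl: store the low word */
        carry = (mp_digit)(p >> 32);           /* high half becomes the next carry */
    }
    c[a_len] = carry;
}

/* s_mpv_mul_d_add additionally adds the existing word of c in each round
 * (and still writes the final carry to c[a_len]); the *_add_prop variant
 * instead keeps propagating that last carry into higher words of c. */
void ref_mpv_mul_d_add_prop(const mp_digit *a, unsigned a_len, mp_digit b, mp_digit *c)
{
    mp_digit carry = 0;
    unsigned i;
    for (i = 0; i < a_len; i++) {
        mp_word p = (mp_word)a[i] * b + carry + c[i]; /* cannot overflow 64 bits */
        c[i] = (mp_digit)p;
        carry = (mp_digit)(p >> 32);
    }
    while (carry) { /* the add/adc tail loop after label 21 */
        mp_word s = (mp_word)c[i] + carry;
        c[i++] = (mp_digit)s;
        carry = (mp_digit)(s >> 32);
    }
}

/* s_mpv_sqr_add_prop follows the same pattern with a_i * a_i, accumulating
 * each double-width square into two consecutive words of ps before
 * propagating the final carry. */
void ref_mpv_sqr_add_prop(const mp_digit *pa, unsigned a_len, mp_digit *ps)
{
    mp_digit carry = 0;
    unsigned i;
    for (i = 0; i < a_len; i++) {
        mp_word sq = (mp_word)pa[i] * pa[i];
        mp_word lo = (mp_word)ps[2 * i] + (mp_digit)sq + carry;
        mp_word hi = (mp_word)ps[2 * i + 1] + (mp_digit)(sq >> 32) + (mp_digit)(lo >> 32);
        ps[2 * i] = (mp_digit)lo;
        ps[2 * i + 1] = (mp_digit)hi;
        carry = (mp_digit)(hi >> 32);
    }
    for (i = 2 * a_len; carry; i++) {
        mp_word s = (mp_word)ps[i] + carry;
        ps[i] = (mp_digit)s;
        carry = (mp_digit)(s >> 32);
    }
}

The SSE2 paths compute the same values; they simply keep the 64-bit product and running carry in an MMX register (pmuludq/paddq, then psrlq $32) instead of in edx:eax and ebx.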
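
The final routine, s_mpv_div_2dx1d, is a thin wrapper around the hardware div instruction. A hedged C equivalent, reusing the mp_digit/mp_word types from the sketch above (ref_mpv_div_2dx1d is again an invented name, not an NSS symbol):

/* Divide the 64-bit value (Nhi:Nlo) by a 32-bit divisor, returning the
 * quotient via *qp and the remainder via *rp, just as the single
 * "div %ebx" does with edx:eax. The caller is expected to guarantee
 * that the quotient fits in one digit. Returns 0 (MP_OKAY). */
int ref_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
                      mp_digit *qp, mp_digit *rp)
{
    mp_word n = ((mp_word)Nhi << 32) | Nlo;
    *qp = (mp_digit)(n / divisor); /* eax after div */
    *rp = (mp_digit)(n % divisor); /* edx after div */
    return 0;                      /* xor %eax,%eax */
}

As the header comment in the assembly notes, the divisor must be normalized (top bit set); in addition, Nhi must be smaller than the divisor so the quotient fits in 32 bits, since the hardware div instruction raises a divide fault when it does not.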