summaryrefslogtreecommitdiffstats
path: root/security/nss/lib/freebl/mpi/hppa20.s
diff options
context:
space:
mode:
Diffstat (limited to 'security/nss/lib/freebl/mpi/hppa20.s')
-rw-r--r--security/nss/lib/freebl/mpi/hppa20.s904
1 files changed, 904 insertions, 0 deletions
diff --git a/security/nss/lib/freebl/mpi/hppa20.s b/security/nss/lib/freebl/mpi/hppa20.s
new file mode 100644
index 000000000..c72de8a12
--- /dev/null
+++ b/security/nss/lib/freebl/mpi/hppa20.s
@@ -0,0 +1,904 @@
+; This Source Code Form is subject to the terms of the Mozilla Public
+; License, v. 2.0. If a copy of the MPL was not distributed with this
+; file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+; Select the instruction level: 2.0W when the preprocessor defines
+; __LP64__, plain 2.0 otherwise.
+#ifdef __LP64__
+ .LEVEL 2.0W
+#else
+; .LEVEL 1.1
+; .ALLOW 2.0N
+ .LEVEL 2.0
+#endif
+; Everything that follows is placed in the $CODE$ subspace of $TEXT$.
+ .SPACE $TEXT$,SORT=8
+ .SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
+
+; ***************************************************************
+;
+; maxpy_[little/big]
+;
+; ***************************************************************
+
+; Select the addressing direction.  Define exactly one of LITTLE_WORDIAN
+; or BIG_WORDIAN (for example with a -D switch).  When neither is defined,
+; little-wordian addressing is selected, which is what this file has
+; always built by default.  Defining both is rejected, because the two
+; sets of offsets below would conflict.
+#if defined(LITTLE_WORDIAN) && defined(BIG_WORDIAN)
+#error "define only one of LITTLE_WORDIAN and BIG_WORDIAN"
+#endif
+
+#if !defined(LITTLE_WORDIAN) && !defined(BIG_WORDIAN)
+#define LITTLE_WORDIAN 1
+#endif
+
+; Byte offsets used to step through the vectors.  EIGHT, SIXTEEN and
+; THIRTY_TWO move toward more significant doublewords; the UN_ forms are
+; their negations and reach back to doublewords already passed.
+#ifdef LITTLE_WORDIAN
+#define EIGHT 8
+#define SIXTEEN 16
+#define THIRTY_TWO 32
+#define UN_EIGHT -8
+#define UN_SIXTEEN -16
+#define UN_TWENTY_FOUR -24
+#endif
+
+#ifdef BIG_WORDIAN
+#define EIGHT -8
+#define SIXTEEN -16
+#define THIRTY_TWO -32
+#define UN_EIGHT 8
+#define UN_SIXTEEN 16
+#define UN_TWENTY_FOUR 24
+#endif
+
+; This performs a multiple-precision integer version of "daxpy",
+; Using the selected addressing direction. "Little-wordian" means that
+; the least significant word of a number is stored at the lowest address.
+; "Big-wordian" means that the most significant word is at the lowest
+; address. Either way, the incoming address of the vector is that
+; of the least significant word. That means that, for little-wordian
+; addressing, we move the address upward as we propagate carries
+; from the least significant word to the most significant. For
+; big-wordian we move the address downward.
+
+; We use the following registers:
+;
+; r2 return PC, of course
+; r26 = arg1 = length
+; r25 = arg2 = address of scalar
+; r24 = arg3 = multiplicand vector
+; r23 = arg4 = result vector
+;
+; fr9 = scalar loaded once only from r25
+
+; The cycle counts shown in the bodies below are simply the result of a
+; scheduling by hand. The actual PCX-U hardware does it differently.
+; The intention is that the overall speed is the same.
+
+; The pipeline startup and shutdown code is constructed in the usual way,
+; by taking the loop bodies and removing unnecessary instructions.
+; We have left the comments describing cycle numbers in the code.
+; These are intended for reference when comparing with the main loop,
+; and have no particular relationship to actual cycle numbers.
+
+; Entry label.  The preprocessor emits maxpy_little when LITTLE_WORDIAN is
+; defined and maxpy_big otherwise.
+#ifdef LITTLE_WORDIAN
+maxpy_little
+#else
+maxpy_big
+#endif
+ .PROC
+ .CALLINFO FRAME=120,ENTRY_GR=4
+ .ENTRY
+; Prologue: STW,MA stores r3 at 0(sp) and then advances sp by 128; r4 is
+; then stored at -124(sp).  Both are reloaded in the epilogue at $L0.
+; The slots -48(sp) through -104(sp) serve as scratch space for moving
+; XMPYU products from floating-point to general registers (an FSTD to a
+; slot followed later by an LDD from the same slot).
+; NOTE(review): r3 and r4 are saved and restored as 32-bit words (STW/LDW)
+; but are written by 64-bit LDD in the body -- confirm this is acceptable
+; for the 2.0W level selected under __LP64__.
+ STW,MA %r3,128(%sp)
+ STW %r4,-124(%sp)
+
+; r26 becomes N-1 here.  The FLDD sits in the delay slot of the branch.
+ ADDIB,< -1,%r26,$L0 ; If N = 0, exit immediately.
+ FLDD 0(%r25),%fr9 ; fr9 = scalar
+
+; First startup
+
+; Each multiplicand doubleword is multiplied by the scalar in fr9 as four
+; XMPYU partial products: 9R*xR, 9R*xL, 9L*xL and 9L*xR.
+ FLDD 0(%r24),%fr24 ; Cycle 1
+ XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
+ XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
+ XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
+; r26 holds N-1, so this branch is taken when N-1 < 3.
+ CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3
+ XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
+ FLDD EIGHT(%r24),%fr28 ; Cycle 8
+ XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
+ FSTD %fr24,-96(%sp)
+ XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
+ FSTD %fr25,-80(%sp)
+ LDO SIXTEEN(%r24),%r24 ; Cycle 12
+ FSTD %fr31,-64(%sp)
+ XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
+ FSTD %fr27,-48(%sp)
+
+; Second startup
+
+ XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
+ FSTD %fr30,-56(%sp)
+ FLDD 0(%r24),%fr24
+
+ FSTD %fr26,-88(%sp) ; Cycle 2
+
+ XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
+ FSTD %fr28,-104(%sp)
+
+ XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
+ LDD -96(%sp),%r3
+ FSTD %fr29,-72(%sp)
+
+ XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
+ LDD -64(%sp),%r19
+ LDD -80(%sp),%r21
+
+ XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
+ LDD -56(%sp),%r20
+ ADD %r21,%r3,%r3
+
+ ADD,DC %r20,%r19,%r19 ; Cycle 7
+ LDD -88(%sp),%r4
+ SHRPD %r3,%r0,32,%r21
+ LDD -48(%sp),%r1
+
+ FLDD EIGHT(%r24),%fr28 ; Cycle 8
+ LDD -104(%sp),%r31
+ ADD,DC %r0,%r0,%r20
+ SHRPD %r19,%r3,32,%r3
+
+ LDD -72(%sp),%r29 ; Cycle 9
+ SHRPD %r20,%r19,32,%r20
+ ADD %r21,%r1,%r1
+
+ XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
+ ADD,DC %r3,%r4,%r4
+ FSTD %fr24,-96(%sp)
+
+ XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
+ ADD,DC %r0,%r20,%r20
+ LDD 0(%r23),%r3
+ FSTD %fr25,-80(%sp)
+
+ LDO SIXTEEN(%r24),%r24 ; Cycle 12
+ FSTD %fr31,-64(%sp)
+
+ XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
+ ADD %r0,%r0,%r0 ; clear the carry bit
+; r26 is reduced by 4; the main loop is skipped when the result is <= 0.
+ ADDIB,<= -4,%r26,$ENDLOOP ; actually happens in cycle 12
+ FSTD %fr27,-48(%sp)
+; MFCTL %cr16,%r21 ; for timing
+; STD %r21,-112(%sp)
+
+; Here is the loop.
+
+; Each pass loads two multiplicand doublewords (offsets 0 and EIGHT from
+; r24), advances r24 and r23 by SIXTEEN, stores two result doublewords
+; (UN_SIXTEEN and UN_EIGHT from the advanced r23) and reduces r26 by 2.
+$LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
+ ADD,DC %r29,%r4,%r4
+ FSTD %fr30,-56(%sp)
+ FLDD 0(%r24),%fr24
+
+ LDO SIXTEEN(%r23),%r23 ; Cycle 2
+ ADD,DC %r0,%r20,%r20
+ FSTD %fr26,-88(%sp)
+
+ XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
+ ADD %r3,%r1,%r1
+ FSTD %fr28,-104(%sp)
+ LDD UN_EIGHT(%r23),%r21
+
+ XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
+ ADD,DC %r21,%r4,%r28
+ FSTD %fr29,-72(%sp)
+ LDD -96(%sp),%r3
+
+ XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
+ ADD,DC %r20,%r31,%r22
+ LDD -64(%sp),%r19
+ LDD -80(%sp),%r21
+
+ XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
+ ADD %r21,%r3,%r3
+ LDD -56(%sp),%r20
+ STD %r1,UN_SIXTEEN(%r23)
+
+ ADD,DC %r20,%r19,%r19 ; Cycle 7
+ SHRPD %r3,%r0,32,%r21
+ LDD -88(%sp),%r4
+ LDD -48(%sp),%r1
+
+ ADD,DC %r0,%r0,%r20 ; Cycle 8
+ SHRPD %r19,%r3,32,%r3
+ FLDD EIGHT(%r24),%fr28
+ LDD -104(%sp),%r31
+
+ SHRPD %r20,%r19,32,%r20 ; Cycle 9
+ ADD %r21,%r1,%r1
+ STD %r28,UN_EIGHT(%r23)
+ LDD -72(%sp),%r29
+
+ XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
+ ADD,DC %r3,%r4,%r4
+ FSTD %fr24,-96(%sp)
+
+ XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
+ ADD,DC %r0,%r20,%r20
+ FSTD %fr25,-80(%sp)
+ LDD 0(%r23),%r3
+
+ LDO SIXTEEN(%r24),%r24 ; Cycle 12
+ FSTD %fr31,-64(%sp)
+
+ XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
+ ADD %r22,%r1,%r1
+ ADDIB,> -2,%r26,$LOOP ; actually happens in cycle 12
+ FSTD %fr27,-48(%sp)
+
+$ENDLOOP
+
+; Shutdown code, first stage.
+
+; MFCTL %cr16,%r21 ; for timing
+; STD %r21,UN_SIXTEEN(%r23)
+; LDD -112(%sp),%r21
+; STD %r21,UN_EIGHT(%r23)
+
+ XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
+ ADD,DC %r29,%r4,%r4
+; Taken when r26 is exactly 0; see the comment at $ONEMORE.
+ CMPIB,= 0,%r26,$ONEMORE
+ FSTD %fr30,-56(%sp)
+
+ LDO SIXTEEN(%r23),%r23 ; Cycle 2
+ ADD,DC %r0,%r20,%r20
+ FSTD %fr26,-88(%sp)
+
+ ADD %r3,%r1,%r1 ; Cycle 3
+ FSTD %fr28,-104(%sp)
+ LDD UN_EIGHT(%r23),%r21
+
+ ADD,DC %r21,%r4,%r28 ; Cycle 4
+ FSTD %fr29,-72(%sp)
+ STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9
+ LDD -96(%sp),%r3
+
+ ADD,DC %r20,%r31,%r22 ; Cycle 5
+ STD %r1,UN_SIXTEEN(%r23)
+; $JOIN4 is also entered from the N = 2 path below.
+$JOIN4
+ LDD -64(%sp),%r19
+ LDD -80(%sp),%r21
+
+ ADD %r21,%r3,%r3 ; Cycle 6
+ LDD -56(%sp),%r20
+
+ ADD,DC %r20,%r19,%r19 ; Cycle 7
+ SHRPD %r3,%r0,32,%r21
+ LDD -88(%sp),%r4
+ LDD -48(%sp),%r1
+
+ ADD,DC %r0,%r0,%r20 ; Cycle 8
+ SHRPD %r19,%r3,32,%r3
+ LDD -104(%sp),%r31
+
+ SHRPD %r20,%r19,32,%r20 ; Cycle 9
+ ADD %r21,%r1,%r1
+ LDD -72(%sp),%r29
+
+ ADD,DC %r3,%r4,%r4 ; Cycle 10
+
+ ADD,DC %r0,%r20,%r20 ; Cycle 11
+ LDD 0(%r23),%r3
+
+ ADD %r22,%r1,%r1 ; Cycle 13
+
+; Shutdown code, second stage.
+
+ ADD,DC %r29,%r4,%r4 ; Cycle 1
+
+ LDO SIXTEEN(%r23),%r23 ; Cycle 2
+ ADD,DC %r0,%r20,%r20
+
+ LDD UN_EIGHT(%r23),%r21 ; Cycle 3
+ ADD %r3,%r1,%r1
+
+ ADD,DC %r21,%r4,%r28 ; Cycle 4
+
+ ADD,DC %r20,%r31,%r22 ; Cycle 5
+
+ STD %r1,UN_SIXTEEN(%r23); Cycle 6
+
+ STD %r28,UN_EIGHT(%r23) ; Cycle 9
+
+ LDD 0(%r23),%r3 ; Cycle 11
+
+; Shutdown code, third stage.
+
+ LDO SIXTEEN(%r23),%r23
+ ADD %r3,%r22,%r1
+; $JOIN1 is also entered from the end of stage 2-1/2 below.  The ADD,DC of
+; r0 and r0 leaves just the carry of the preceding add in r21; the STD in
+; the delay slot writes the last sum.
+$JOIN1 ADD,DC %r0,%r0,%r21
+ CMPIB,*= 0,%r21,$L0 ; if no overflow, exit
+ STD %r1,UN_SIXTEEN(%r23)
+
+; Final carry propagation
+
+; Add 1 to successive result doublewords.  The loop repeats while the
+; incremented value is zero, that is, while the increment wrapped around.
+$FINAL1 LDO EIGHT(%r23),%r23
+ LDD UN_SIXTEEN(%r23),%r21
+ ADDI 1,%r21,%r21
+ CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry.
+ STD %r21,UN_SIXTEEN(%r23)
+ B $L0
+ NOP
+
+; Here is the code that handles the difficult cases N=1, N=2, and N=3.
+; We do the usual trick -- branch out of the startup code at appropriate
+; points, and branch into the shutdown code.
+
+; r26 holds N-1 on entry to $N_IS_SMALL.
+$N_IS_SMALL
+ CMPIB,= 0,%r26,$N_IS_ONE
+ FSTD %fr24,-96(%sp) ; Cycle 10
+ FLDD EIGHT(%r24),%fr28 ; Cycle 8
+ XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
+ XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
+ FSTD %fr25,-80(%sp)
+ FSTD %fr31,-64(%sp) ; Cycle 12
+ XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
+ FSTD %fr27,-48(%sp)
+ XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
+ CMPIB,= 2,%r26,$N_IS_THREE
+ FSTD %fr30,-56(%sp)
+
+; N = 2
+ FSTD %fr26,-88(%sp) ; Cycle 2
+ FSTD %fr28,-104(%sp) ; Cycle 3
+ LDD -96(%sp),%r3 ; Cycle 4
+ FSTD %fr29,-72(%sp)
+ B $JOIN4
+ ADD %r0,%r0,%r22
+
+$N_IS_THREE
+ FLDD SIXTEEN(%r24),%fr24
+ FSTD %fr26,-88(%sp) ; Cycle 2
+ XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
+ FSTD %fr28,-104(%sp)
+ XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
+ LDD -96(%sp),%r3
+ FSTD %fr29,-72(%sp)
+ XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
+ LDD -64(%sp),%r19
+ LDD -80(%sp),%r21
+ B $JOIN3
+ ADD %r0,%r0,%r22
+
+$N_IS_ONE
+ FSTD %fr25,-80(%sp)
+ FSTD %fr27,-48(%sp)
+ FSTD %fr26,-88(%sp) ; Cycle 2
+ B $JOIN5
+ ADD %r0,%r0,%r22
+
+; We came out of the unrolled loop with wrong parity. Do one more
+; single cycle. This is quite tricky, because of the way the
+; carry chains and SHRPD chains have been chopped up.
+
+$ONEMORE
+
+ FLDD 0(%r24),%fr24
+
+ LDO SIXTEEN(%r23),%r23 ; Cycle 2
+ ADD,DC %r0,%r20,%r20
+ FSTD %fr26,-88(%sp)
+
+ XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
+ FSTD %fr28,-104(%sp)
+ LDD UN_EIGHT(%r23),%r21
+ ADD %r3,%r1,%r1
+
+ XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
+ ADD,DC %r21,%r4,%r28
+ STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
+ LDD -96(%sp),%r3
+ FSTD %fr29,-72(%sp)
+
+ XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
+ ADD,DC %r20,%r31,%r22
+ LDD -64(%sp),%r19
+ LDD -80(%sp),%r21
+
+ STD %r1,UN_SIXTEEN(%r23); Cycle 6
+; $JOIN3 is also entered from the N = 3 path above.
+$JOIN3
+ XMPYU %fr9L,%fr24R,%fr24
+ LDD -56(%sp),%r20
+ ADD %r21,%r3,%r3
+
+ ADD,DC %r20,%r19,%r19 ; Cycle 7
+ LDD -88(%sp),%r4
+ SHRPD %r3,%r0,32,%r21
+ LDD -48(%sp),%r1
+
+ LDD -104(%sp),%r31 ; Cycle 8
+ ADD,DC %r0,%r0,%r20
+ SHRPD %r19,%r3,32,%r3
+
+ LDD -72(%sp),%r29 ; Cycle 9
+ SHRPD %r20,%r19,32,%r20
+ ADD %r21,%r1,%r1
+
+ ADD,DC %r3,%r4,%r4 ; Cycle 10
+ FSTD %fr24,-96(%sp)
+
+ ADD,DC %r0,%r20,%r20 ; Cycle 11
+ LDD 0(%r23),%r3
+ FSTD %fr25,-80(%sp)
+
+ ADD %r22,%r1,%r1 ; Cycle 13
+ FSTD %fr27,-48(%sp)
+
+; Shutdown code, stage 1-1/2.
+
+ ADD,DC %r29,%r4,%r4 ; Cycle 1
+
+ LDO SIXTEEN(%r23),%r23 ; Cycle 2
+ ADD,DC %r0,%r20,%r20
+ FSTD %fr26,-88(%sp)
+
+ LDD UN_EIGHT(%r23),%r21 ; Cycle 3
+ ADD %r3,%r1,%r1
+
+ ADD,DC %r21,%r4,%r28 ; Cycle 4
+ STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
+
+ ADD,DC %r20,%r31,%r22 ; Cycle 5
+ STD %r1,UN_SIXTEEN(%r23)
+; $JOIN5 is also entered from the N = 1 path above.
+$JOIN5
+ LDD -96(%sp),%r3 ; moved from cycle 4
+ LDD -80(%sp),%r21
+ ADD %r21,%r3,%r3 ; Cycle 6
+ ADD,DC %r0,%r0,%r19 ; Cycle 7
+ LDD -88(%sp),%r4
+ SHRPD %r3,%r0,32,%r21
+ LDD -48(%sp),%r1
+ SHRPD %r19,%r3,32,%r3 ; Cycle 8
+ ADD %r21,%r1,%r1 ; Cycle 9
+ ADD,DC %r3,%r4,%r4 ; Cycle 10
+ LDD 0(%r23),%r3 ; Cycle 11
+ ADD %r22,%r1,%r1 ; Cycle 13
+
+; Shutdown code, stage 2-1/2.
+
+ ADD,DC %r0,%r4,%r4 ; Cycle 1
+ LDO SIXTEEN(%r23),%r23 ; Cycle 2
+ LDD UN_EIGHT(%r23),%r21 ; Cycle 3
+ ADD %r3,%r1,%r1
+ STD %r1,UN_SIXTEEN(%r23)
+ ADD,DC %r21,%r4,%r1
+ B $JOIN1
+ LDO EIGHT(%r23),%r23
+
+; exit
+
+; Epilogue: reload r4, return through r2, and in the delay slot of BVE
+; move sp back down by 128 and reload r3 (LDW,MB).
+$L0
+ LDW -124(%sp),%r4
+ BVE (%r2)
+ .EXIT
+ LDW,MB -128(%sp),%r3
+
+ .PROCEND
+
+; ***************************************************************
+;
+; add_diag_[little/big]
+;
+; ***************************************************************
+
+; The arguments are as follows:
+; r2 return PC, of course
+; r26 = arg1 = length
+; r25 = arg2 = vector to square
+; r24 = arg3 = result vector
+
+; Entry label.  The preprocessor emits add_diag_little when LITTLE_WORDIAN
+; is defined and add_diag_big otherwise.
+#ifdef LITTLE_WORDIAN
+add_diag_little
+#else
+add_diag_big
+#endif
+ .PROC
+ .CALLINFO FRAME=120,ENTRY_GR=4
+ .ENTRY
+; Prologue: STW,MA stores r3 at 0(sp) and then advances sp by 128; r4 is
+; then stored at -124(sp).  Both are reloaded in the epilogue at $Z0.
+; The slots -64(sp) through -104(sp) serve as scratch space for moving
+; XMPYU products from floating-point to general registers.
+; NOTE(review): r3 and r4 are saved and restored as 32-bit words (STW/LDW)
+; but are written by 64-bit SHRPD/ADD in the body -- confirm this is
+; acceptable for the 2.0W level selected under __LP64__.
+ STW,MA %r3,128(%sp)
+ STW %r4,-124(%sp)
+
+; r26 becomes N-1 here.
+ ADDIB,< -1,%r26,$Z0 ; If N=0, exit immediately.
+ NOP
+
+; Startup code
+
+; For each input doubleword x (loaded into fr7) three XMPYU products are
+; formed: xR*xR, xL*xR and xL*xL.
+ FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body)
+ XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
+ XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
+ XMPYU %fr7L,%fr7L,%fr30
+ LDO SIXTEEN(%r25),%r25 ; Cycle 6
+ FSTD %fr29,-88(%sp)
+ FSTD %fr27,-72(%sp) ; Cycle 7
+; r26 holds N-1, so this branch is taken when N = 1.
+ CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
+ FSTD %fr30,-96(%sp)
+ FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2
+ LDD -88(%sp),%r22 ; Cycle 3
+ LDD -72(%sp),%r31 ; Cycle 4
+ XMPYU %fr7R,%fr7R,%fr28
+ XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
+ XMPYU %fr7L,%fr7L,%fr31
+ LDD -96(%sp),%r20 ; Cycle 6
+ FSTD %fr28,-80(%sp)
+ ADD %r0,%r0,%r0 ; clear the carry bit
+; r26 is reduced by 2; the main loop is skipped when the result is <= 0.
+ ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
+ FSTD %fr24,-64(%sp)
+
+; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body".
+
+; Each pass consumes two input doublewords (r25 advances by SIXTEEN),
+; updates four result doublewords (r24 advances by THIRTY_TWO) and reduces
+; r26 by 2.  The pair SHRPD x,r0,31 and SHRPD r0,x,31 splits the cross
+; product xL*xR into a low part (x shifted left by 33) and a high part
+; (x shifted right by 31), which is the doubled cross product aligned at
+; bit 32.
+$DIAGLOOP
+ SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
+ LDO SIXTEEN(%r25),%r25
+ LDD 0(%r24),%r1
+ FSTD %fr31,-104(%sp)
+ SHRPD %r0,%r31,31,%r4 ; Cycle 2
+ ADD,DC %r22,%r3,%r3
+ FLDD UN_SIXTEEN(%r25),%fr7
+ ADD,DC %r0,%r20,%r20 ; Cycle 3
+ ADD %r1,%r3,%r3
+ XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
+ LDD -80(%sp),%r21
+ STD %r3,0(%r24)
+ XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
+ XMPYU %fr7L,%fr7L,%fr30
+ LDD -64(%sp),%r29
+ LDD EIGHT(%r24),%r1
+ ADD,DC %r4,%r20,%r20 ; Cycle 6
+ LDD -104(%sp),%r19
+ FSTD %fr29,-88(%sp)
+ ADD %r20,%r1,%r1 ; Cycle 7
+ FSTD %fr27,-72(%sp)
+ SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
+ LDO THIRTY_TWO(%r24),%r24
+ LDD UN_SIXTEEN(%r24),%r28
+ FSTD %fr30,-96(%sp)
+ SHRPD %r0,%r29,31,%r3 ; Cycle 2
+ ADD,DC %r21,%r4,%r4
+ FLDD UN_EIGHT(%r25),%fr7
+ STD %r1,UN_TWENTY_FOUR(%r24)
+ ADD,DC %r0,%r19,%r19 ; Cycle 3
+ ADD %r28,%r4,%r4
+ XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4
+ LDD -88(%sp),%r22
+ STD %r4,UN_SIXTEEN(%r24)
+ XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
+ XMPYU %fr7L,%fr7L,%fr31
+ LDD -72(%sp),%r31
+ LDD UN_EIGHT(%r24),%r28
+ ADD,DC %r3,%r19,%r19 ; Cycle 6
+ LDD -96(%sp),%r20
+ FSTD %fr28,-80(%sp)
+ ADD %r19,%r28,%r28 ; Cycle 7
+ FSTD %fr24,-64(%sp)
+ ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8
+ STD %r28,UN_EIGHT(%r24)
+
+$ENDDIAGLOOP
+
+ ADD,DC %r0,%r22,%r22
+; Taken when r26 is exactly 0; see the comment at $ONEMOREDIAG.
+ CMPIB,= 0,%r26,$ONEMOREDIAG
+ SHRPD %r31,%r0,31,%r3
+
+; Shutdown code, first stage.
+
+ FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
+ LDD 0(%r24),%r28
+ SHRPD %r0,%r31,31,%r4 ; Cycle 2
+ ADD %r3,%r22,%r3
+ ADD,DC %r0,%r20,%r20 ; Cycle 3
+ LDD -80(%sp),%r21
+ ADD %r3,%r28,%r3
+ LDD -64(%sp),%r29 ; Cycle 4
+ STD %r3,0(%r24)
+ LDD EIGHT(%r24),%r1 ; Cycle 5
+ LDO SIXTEEN(%r25),%r25 ; Cycle 6
+ LDD -104(%sp),%r19
+ ADD,DC %r4,%r20,%r20
+ ADD %r20,%r1,%r1 ; Cycle 7
+ ADD,DC %r0,%r21,%r21 ; Cycle 8
+ STD %r1,EIGHT(%r24)
+
+; Shutdown code, second stage.
+
+ SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
+ LDO THIRTY_TWO(%r24),%r24
+ LDD UN_SIXTEEN(%r24),%r1
+ SHRPD %r0,%r29,31,%r3 ; Cycle 2
+ ADD %r4,%r21,%r4
+ ADD,DC %r0,%r19,%r19 ; Cycle 3
+ ADD %r4,%r1,%r4
+ STD %r4,UN_SIXTEEN(%r24); Cycle 4
+ LDD UN_EIGHT(%r24),%r28 ; Cycle 5
+ ADD,DC %r3,%r19,%r19 ; Cycle 6
+ ADD %r19,%r28,%r28 ; Cycle 7
+; The ADD,DC of r0 and r0 leaves just the carry of the preceding add in
+; r22; the STD in the delay slot writes the last sum.
+ ADD,DC %r0,%r0,%r22 ; Cycle 8
+ CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit
+ STD %r28,UN_EIGHT(%r24)
+
+; Final carry propagation
+
+; Add 1 to successive result doublewords.  The loop repeats while the
+; incremented value is zero, that is, while the increment wrapped around.
+$FDIAG2
+ LDO EIGHT(%r24),%r24
+ LDD UN_EIGHT(%r24),%r26
+ ADDI 1,%r26,%r26
+ CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry.
+ STD %r26,UN_EIGHT(%r24)
+
+ B $Z0
+ NOP
+
+; Here is the code that handles the difficult case N=1.
+; We do the usual trick -- branch out of the startup code at appropriate
+; points, and branch into the shutdown code.
+
+$DIAG_N_IS_ONE
+
+ LDD -88(%sp),%r22
+ LDD -72(%sp),%r31
+ B $JOINDIAG
+ LDD -96(%sp),%r20
+
+; We came out of the unrolled loop with wrong parity. Do one more
+; single cycle. This is the "alternate body". It will, of course,
+; give us opposite registers from the other case, so we need
+; completely different shutdown code.
+
+$ONEMOREDIAG
+ FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
+ LDD 0(%r24),%r28
+ FLDD 0(%r25),%fr7 ; Cycle 2
+ SHRPD %r0,%r31,31,%r4
+ ADD %r3,%r22,%r3
+ ADD,DC %r0,%r20,%r20 ; Cycle 3
+ LDD -80(%sp),%r21
+ ADD %r3,%r28,%r3
+ LDD -64(%sp),%r29 ; Cycle 4
+ STD %r3,0(%r24)
+ XMPYU %fr7R,%fr7R,%fr29
+ LDD EIGHT(%r24),%r1 ; Cycle 5
+ XMPYU %fr7L,%fr7R,%fr27
+ XMPYU %fr7L,%fr7L,%fr30
+ LDD -104(%sp),%r19 ; Cycle 6
+ FSTD %fr29,-88(%sp)
+ ADD,DC %r4,%r20,%r20
+ FSTD %fr27,-72(%sp) ; Cycle 7
+ ADD %r20,%r1,%r1
+ ADD,DC %r0,%r21,%r21 ; Cycle 8
+ STD %r1,EIGHT(%r24)
+
+; Shutdown code, first stage.
+
+ SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
+ LDO THIRTY_TWO(%r24),%r24
+ FSTD %fr30,-96(%sp)
+ LDD UN_SIXTEEN(%r24),%r1
+ SHRPD %r0,%r29,31,%r3 ; Cycle 2
+ ADD %r4,%r21,%r4
+ ADD,DC %r0,%r19,%r19 ; Cycle 3
+ LDD -88(%sp),%r22
+ ADD %r4,%r1,%r4
+ LDD -72(%sp),%r31 ; Cycle 4
+ STD %r4,UN_SIXTEEN(%r24)
+ LDD UN_EIGHT(%r24),%r28 ; Cycle 5
+ LDD -96(%sp),%r20 ; Cycle 6
+ ADD,DC %r3,%r19,%r19
+ ADD %r19,%r28,%r28 ; Cycle 7
+ ADD,DC %r0,%r22,%r22 ; Cycle 8
+ STD %r28,UN_EIGHT(%r24)
+
+; Shutdown code, second stage.
+
+; $JOINDIAG is also entered from the N = 1 path above.
+$JOINDIAG
+ SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
+ LDD 0(%r24),%r28
+ SHRPD %r0,%r31,31,%r4 ; Cycle 2
+ ADD %r3,%r22,%r3
+ ADD,DC %r0,%r20,%r20 ; Cycle 3
+ ADD %r3,%r28,%r3
+ STD %r3,0(%r24) ; Cycle 4
+ LDD EIGHT(%r24),%r1 ; Cycle 5
+ ADD,DC %r4,%r20,%r20
+ ADD %r20,%r1,%r1 ; Cycle 7
+ ADD,DC %r0,%r0,%r21 ; Cycle 8
+ CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit
+ STD %r1,EIGHT(%r24)
+
+; Final carry propagation
+
+; Same scheme as $FDIAG2, with offsets relative to the unadvanced r24.
+; This loop falls through into the epilogue when it finishes.
+$FDIAG1
+ LDO EIGHT(%r24),%r24
+ LDD EIGHT(%r24),%r26
+ ADDI 1,%r26,%r26
+ CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry.
+ STD %r26,EIGHT(%r24)
+
+; Epilogue: reload r4, return through r2, and in the delay slot of BVE
+; move sp back down by 128 and reload r3 (LDW,MB).
+$Z0
+ LDW -124(%sp),%r4
+ BVE (%r2)
+ .EXIT
+ LDW,MB -128(%sp),%r3
+ .PROCEND
+; .ALLOW
+
+ .SPACE $TEXT$
+ .SUBSPA $CODE$
+; Export the two entry points that match the selected addressing
+; direction.  GNU-as (as of 2.19) does not support LONG_RETURN, so that
+; attribute is emitted only for other assemblers, in both the
+; little-wordian and the big-wordian case.
+#ifdef LITTLE_WORDIAN
+#ifdef __GNUC__
+ .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+ .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
+#else
+ .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
+ .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
+#endif
+#else
+#ifdef __GNUC__
+ .EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+ .EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
+#else
+ .EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
+ .EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
+#endif
+#endif
+ .END
+
+
+; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
+; (this file exports them under the names maxpy_little and maxpy_big)
+;
+; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
+; performs a 64-bit x any-size multiply, and adds the
+; result to an area of memory. That is, it performs
+; something like
+;
+; A B C D
+; * Z
+; __________
+; P Q R S T
+;
+; and then adds the "PQRST" vector into an area of memory,
+; handling all carries.
+;
+; Digression on nomenclature and endian-ness:
+;
+; Each of the capital letters in the above represents a 64-bit
+; quantity. That is, you could think of the discussion as
+; being in terms of radix-16-quintillion arithmetic. The data
+; type being manipulated is "unsigned long long int". This
+; requires the 64-bit extension of the HP-UX C compiler,
+; available at release 10. You need these compiler flags to
+; enable these extensions:
+;
+; -Aa +e +DA2.0 +DS2.0
+;
+; (The first specifies ANSI C, the second enables the
+; extensions, which are beyond ANSI C, and the third and
+; fourth tell the compiler to use whatever features of the
+; PA2.0 architecture it wishes, in order to make the code more
+; efficient. Since the presence of the assembly code will
+; make the program unable to run on anything less than PA2.0,
+; you might as well gain the performance enhancements in the C
+; code as well.)
+;
+; Questions of "endian-ness" often come up, usually in the
+; context of byte ordering in a word. These routines have a
+; similar issue, that could be called "wordian-ness".
+; Independent of byte ordering (PA is always big-endian), one
+; can make two choices when representing extremely large
+; numbers as arrays of 64-bit doublewords in memory.
+;
+; "Little-wordian" layout means that the least significant
+; word of a number is stored at the lowest address.
+;
+; MSW LSW
+; | |
+; V V
+;
+; A B C D E
+;
+; ^ ^ ^
+; | | |____ address 0
+; | |
+; | |_______address 8
+; |
+; address 32
+;
+; "Big-wordian" means that the most significant word is at the
+; lowest address.
+;
+; MSW LSW
+; | |
+; V V
+;
+; A B C D E
+;
+; ^ ^ ^
+; | | |____ address 32
+; | |
+; | |_______address 24
+; |
+; address 0
+;
+; When you compile the file, you specify one or the other, with
+; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN". (This copy of the
+; file also contains its own definition of LITTLE_WORDIAN near the top.)
+;
+; Incidentally, you assemble this file as part of your
+; project with the same C compiler as the rest of the program.
+; My "makefile" for a superprecision arithmetic package has
+; the following stuff:
+;
+; # definitions:
+; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
+; CFLAGS = +O3
+; LDFLAGS = -L /usr/lib -Wl,-aarchive
+;
+; # general build rule for ".s" files:
+; .s.o:
+; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
+;
+; # Now any bind step that calls for pa20.o will assemble pa20.s
+;
+; End of digression, back to arithmetic:
+;
+; The way we multiply two huge numbers is, of course, to multiply
+; the "ABCD" vector by each of the "WXYZ" doublewords, adding
+; the result vectors with increasing offsets, the way we learned
+; in school, back before we all used calculators:
+;
+; A B C D
+; * W X Y Z
+; __________
+; P Q R S T
+; E F G H I
+; M N O P Q
+; + R S T U V
+; _______________
+; F I N A L S U M
+;
+; So we call maxpy_PA20_big (in my case; my package is
+; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
+; in turn as the "scalar", and giving the "ABCD" vector each
+; time. We direct it to add its result into an area of memory
+; that we have cleared at the start. We skew the exact
+; location into that area with each call.
+;
+; The prototype for the function is
+;
+; extern void maxpy_PA20_big(
+; int length, /* Number of doublewords in the multiplicand vector. */
+; const long long int *scalaraddr, /* Address to fetch the scalar. */
+; const long long int *multiplicand, /* The multiplicand vector. */
+; long long int *result); /* Where to accumulate the result. */
+;
+; (You should place a copy of this prototype in an include file
+; or in your C file.)
+;
+; Now, IN ALL CASES, the given address for the multiplicand or
+; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
+; That word is, of course, the word at which the routine
+; starts processing. "maxpy_PA20_little" then increases the
+; addresses as it computes. "maxpy_PA20_big" decreases them.
+;
+; In our example above, "length" would be 4 in each case.
+; "multiplicand" would be the "ABCD" vector. Specifically,
+; the address of the element "D". "scalaraddr" would be the
+; address of "W", "X", "Y", or "Z" on the four calls that we
+; would make. (The order doesn't matter, of course.)
+; "result" would be the appropriate address in the result
+; area. When multiplying by "Z", that would be the least
+; significant word. When multiplying by "Y", it would be the
+; next higher word (8 bytes higher if little-wordian; 8 bytes
+; lower if big-wordian), and so on. The size of the result
+; area must be the sum of the sizes of the multiplicand
+; and multiplier vectors, and must be initialized to zero
+; before we start.
+;
+; Whenever the routine adds its partial product into the result
+; vector, it follows carry chains as far as they need to go.
+;
+; Here is the super-precision multiply routine that I use for
+; my package. The package is big-wordian. I have taken out
+; handling of exponents (it's a floating point package):
+;
+; static void mul_PA20(
+; int size,
+; const long long int *arg1,
+; const long long int *arg2,
+; long long int *result)
+; {
+; int i;
+;
+; for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
+;
+; for (i=0 ; i<size ; i++) {
+; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
+; }
+; }