submul_1.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:8k
- dnl Alpha ev6 nails mpn_submul_1.
- dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
- dnl
- dnl This file is part of the GNU MP Library.
- dnl
- dnl The GNU MP Library is free software; you can redistribute it and/or
- dnl modify it under the terms of the GNU Lesser General Public License as
- dnl published by the Free Software Foundation; either version 3 of the
- dnl License, or (at your option) any later version.
- dnl
- dnl The GNU MP Library is distributed in the hope that it will be useful,
- dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
- dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- dnl Lesser General Public License for more details.
- dnl
- dnl You should have received a copy of the GNU Lesser General Public License
- dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
- include(`../config.m4')
- C cycles/limb
- C EV4: 42
- C EV5: 18
- C EV6: 4
- C TODO
- C * Reroll loop for 3.75 c/l with current 4-way unrolling.
- C * The loop is overscheduled wrt loads and wrt multiplies, in particular
- C umulh.
- C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
- C and would work since the loop structure is really regular.
- C INPUT PARAMETERS
- define(`rp',`r16')
- define(`up',`r17')
- define(`n', `r18')
- define(`vl0',`r19')
- define(`numb_mask',`r6')
- define(`m0a',`r0')
- define(`m0b',`r1')
- define(`m1a',`r2')
- define(`m1b',`r3')
- define(`m2a',`r20')
- define(`m2b',`r21')
- define(`m3a',`r22')
- define(`m3b',`r23')
- define(`acc0',`r25')
- define(`acc1',`r27')
- define(`ul0',`r4')
- define(`ul1',`r5')
- define(`ul2',`r4')
- define(`ul3',`r5')
- define(`rl0',`r24')
- define(`rl1',`r24')
- define(`rl2',`r24')
- define(`rl3',`r24')
- define(`t0',`r7')
- define(`t1',`r8')
- define(`NAIL_BITS',`GMP_NAIL_BITS')
- define(`NUMB_BITS',`GMP_NUMB_BITS')
- dnl This declaration is munged by configure
- NAILS_SUPPORT(2-63)
- ASM_START()
- PROLOGUE(mpn_submul_1)
- sll vl0, NAIL_BITS, vl0
- lda numb_mask, -1(r31)
- srl numb_mask, NAIL_BITS, numb_mask
- and n, 3, r25
- cmpeq r25, 1, r21
- bne r21, L(1m4)
- cmpeq r25, 2, r21
- bne r21, L(2m4)
- beq r25, L(0m4)
- L(3m4): ldq ul3, 0(up)
- lda n, -4(n)
- ldq ul0, 8(up)
- mulq vl0, ul3, m3a
- umulh vl0, ul3, m3b
- ldq ul1, 16(up)
- lda up, 24(up)
- lda rp, -8(rp)
- mulq vl0, ul0, m0a
- umulh vl0, ul0, m0b
- bge n, L(ge3)
- mulq vl0, ul1, m1a
- umulh vl0, ul1, m1b
- ldq rl3, 8(rp)
- srl m3a,NAIL_BITS, t0
- addq t0, r31, acc1
- subq rl3, acc1, acc1
- ldq rl0, 16(rp)
- srl m0a,NAIL_BITS, t0
- addq t0, m3b, acc0
- sra acc1,NUMB_BITS, t1
- br r31, L(ta3)
- L(ge3): ldq ul2, 0(up)
- mulq vl0, ul1, m1a
- umulh vl0, ul1, m1b
- ldq rl3, 8(rp)
- srl m3a,NAIL_BITS, t0
- ldq ul3, 8(up)
- lda n, -4(n)
- mulq vl0, ul2, m2a
- addq t0, r31, acc1
- umulh vl0, ul2, m2b
- subq rl3, acc1, acc1
- ldq rl0, 16(rp)
- srl m0a,NAIL_BITS, t0
- ldq ul0, 16(up)
- mulq vl0, ul3, m3a
- addq t0, m3b, acc0
- sra acc1,NUMB_BITS, t1
- br r31, L(el3)
- L(0m4): lda n, -8(n)
- ldq ul2, 0(up)
- ldq ul3, 8(up)
- mulq vl0, ul2, m2a
- umulh vl0, ul2, m2b
- ldq ul0, 16(up)
- mulq vl0, ul3, m3a
- umulh vl0, ul3, m3b
- ldq ul1, 24(up)
- lda up, 32(up)
- mulq vl0, ul0, m0a
- umulh vl0, ul0, m0b
- bge n, L(ge4)
- ldq rl2, 0(rp)
- srl m2a,NAIL_BITS, t0
- mulq vl0, ul1, m1a
- addq t0, r31, acc0
- umulh vl0, ul1, m1b
- subq rl2, acc0, acc0
- ldq rl3, 8(rp)
- srl m3a,NAIL_BITS, t0
- addq t0, m2b, acc1
- sra acc0,NUMB_BITS, t1
- br r31, L(ta4)
- L(ge4): ldq rl2, 0(rp)
- srl m2a,NAIL_BITS, t0
- ldq ul2, 0(up)
- mulq vl0, ul1, m1a
- addq t0, r31, acc0
- umulh vl0, ul1, m1b
- subq rl2, acc0, acc0
- ldq rl3, 8(rp)
- srl m3a,NAIL_BITS, t0
- ldq ul3, 8(up)
- lda n, -4(n)
- mulq vl0, ul2, m2a
- addq t0, m2b, acc1
- sra acc0,NUMB_BITS, t1
- br r31, L(el0)
- L(2m4): lda n, -4(n)
- ldq ul0, 0(up)
- ldq ul1, 8(up)
- lda up, 16(up)
- lda rp, -16(rp)
- mulq vl0, ul0, m0a
- umulh vl0, ul0, m0b
- bge n, L(ge2)
- mulq vl0, ul1, m1a
- umulh vl0, ul1, m1b
- ldq rl0, 16(rp)
- srl m0a,NAIL_BITS, t0
- addq t0, r31, acc0
- subq rl0, acc0, acc0
- ldq rl1, 24(rp)
- srl m1a,NAIL_BITS, t0
- addq t0, m0b, acc1
- sra acc0,NUMB_BITS, t1
- br r31, L(ta2)
- L(ge2): ldq ul2, 0(up)
- mulq vl0, ul1, m1a
- umulh vl0, ul1, m1b
- ldq ul3, 8(up)
- lda n, -4(n)
- mulq vl0, ul2, m2a
- umulh vl0, ul2, m2b
- ldq rl0, 16(rp)
- srl m0a,NAIL_BITS, t0
- ldq ul0, 16(up)
- mulq vl0, ul3, m3a
- addq t0, r31, acc0
- umulh vl0, ul3, m3b
- subq rl0, acc0, acc0
- ldq rl1, 24(rp)
- srl m1a,NAIL_BITS, t0
- ldq ul1, 24(up)
- lda up, 32(up)
- lda rp, 32(rp)
- mulq vl0, ul0, m0a
- addq t0, m0b, acc1
- sra acc0,NUMB_BITS, t1
- bge n, L(el2)
- br r31, L(ta6)
- L(1m4): lda n, -4(n)
- ldq ul1, 0(up)
- lda up, 8(up)
- lda rp, -24(rp)
- bge n, L(ge1)
- mulq vl0, ul1, m1a
- umulh vl0, ul1, m1b
- ldq rl1, 24(rp)
- srl m1a,NAIL_BITS, t0
- subq rl1, t0, acc1
- and acc1,numb_mask, r28
- sra acc1,NUMB_BITS, t1
- stq r28, 24(rp)
- subq m1b, t1, r0
- ret r31, (r26), 1
- L(ge1): ldq ul2, 0(up)
- mulq vl0, ul1, m1a
- umulh vl0, ul1, m1b
- ldq ul3, 8(up)
- lda n, -4(n)
- mulq vl0, ul2, m2a
- umulh vl0, ul2, m2b
- ldq ul0, 16(up)
- mulq vl0, ul3, m3a
- umulh vl0, ul3, m3b
- ldq rl1, 24(rp)
- srl m1a,NAIL_BITS, t0
- ldq ul1, 24(up)
- lda up, 32(up)
- lda rp, 32(rp)
- mulq vl0, ul0, m0a
- addq t0, r31, acc1
- umulh vl0, ul0, m0b
- subq rl1, acc1, acc1
- ldq rl2, 0(rp)
- srl m2a,NAIL_BITS, t0
- mulq vl0, ul1, m1a
- addq t0, m1b, acc0
- sra acc1,NUMB_BITS, t1
- blt n, L(ta5)
- L(ge5): ldq ul2, 0(up)
- br r31, L(el1)
- ALIGN(16)
- L(top): mulq vl0, ul0, m0a C U1
- addq t0, m0b, acc1 C L0
- sra acc0,NUMB_BITS, t1 C U0
- stq r28, -24(rp) C L1
- C
- L(el2): umulh vl0, ul0, m0b C U1
- and acc0,numb_mask, r28 C L0
- subq rl1, acc1, acc1 C U0
- ldq rl2, 0(rp) C L1
- C
- unop C U1
- addq t1, acc1, acc1 C L0
- srl m2a,NAIL_BITS, t0 C U0
- ldq ul2, 0(up) C L1
- C
- mulq vl0, ul1, m1a C U1
- addq t0, m1b, acc0 C L0
- sra acc1,NUMB_BITS, t1 C U0
- stq r28, -16(rp) C L1
- C
- L(el1): umulh vl0, ul1, m1b C U1
- and acc1,numb_mask, r28 C L0
- subq rl2, acc0, acc0 C U0
- ldq rl3, 8(rp) C L1
- C
- lda n, -4(n) C L1
- addq t1, acc0, acc0 C L0
- srl m3a,NAIL_BITS, t0 C U0
- ldq ul3, 8(up) C L1
- C
- mulq vl0, ul2, m2a C U1
- addq t0, m2b, acc1 C L0
- sra acc0,NUMB_BITS, t1 C U0
- stq r28, -8(rp) C L1
- C
- L(el0): umulh vl0, ul2, m2b C U1
- and acc0,numb_mask, r28 C L0
- subq rl3, acc1, acc1 C U0
- ldq rl0, 16(rp) C L1
- C
- unop C U1
- addq t1, acc1, acc1 C L0
- srl m0a,NAIL_BITS, t0 C U0
- ldq ul0, 16(up) C L1
- C
- mulq vl0, ul3, m3a C U1
- addq t0, m3b, acc0 C L0
- sra acc1,NUMB_BITS, t1 C U0
- stq r28, 0(rp) C L1
- C
- L(el3): umulh vl0, ul3, m3b C U1
- and acc1,numb_mask, r28 C L0
- subq rl0, acc0, acc0 C U0
- ldq rl1, 24(rp) C L1
- C
- unop C U1
- addq t1, acc0, acc0 C L0
- srl m1a,NAIL_BITS, t0 C U0
- ldq ul1, 24(up) C L1
- C
- lda up, 32(up) C L0
- unop C U1
- lda rp, 32(rp) C L1
- bge n, L(top) C U0
- L(end): mulq vl0, ul0, m0a
- addq t0, m0b, acc1
- sra acc0,NUMB_BITS, t1
- stq r28, -24(rp)
- L(ta6): umulh vl0, ul0, m0b
- and acc0,numb_mask, r28
- subq rl1, acc1, acc1
- ldq rl2, 0(rp)
- addq t1, acc1, acc1
- srl m2a,NAIL_BITS, t0
- mulq vl0, ul1, m1a
- addq t0, m1b, acc0
- sra acc1,NUMB_BITS, t1
- stq r28, -16(rp)
- L(ta5): umulh vl0, ul1, m1b
- and acc1,numb_mask, r28
- subq rl2, acc0, acc0
- ldq rl3, 8(rp)
- addq t1, acc0, acc0
- srl m3a,NAIL_BITS, t0
- addq t0, m2b, acc1
- sra acc0,NUMB_BITS, t1
- stq r28, -8(rp)
- unop
- ALIGN(16)
- L(ta4): and acc0,numb_mask, r28
- subq rl3, acc1, acc1
- ldq rl0, 16(rp)
- addq t1, acc1, acc1
- srl m0a,NAIL_BITS, t0
- addq t0, m3b, acc0
- sra acc1,NUMB_BITS, t1
- stq r28, 0(rp)
- unop
- ALIGN(16)
- L(ta3): and acc1,numb_mask, r28
- subq rl0, acc0, acc0
- ldq rl1, 24(rp)
- addq t1, acc0, acc0
- srl m1a,NAIL_BITS, t0
- addq t0, m0b, acc1
- sra acc0,NUMB_BITS, t1
- stq r28, 8(rp)
- unop
- ALIGN(16)
- L(ta2): and acc0,numb_mask, r28
- subq rl1, acc1, acc1
- addq t1, acc1, acc1
- sra acc1,NUMB_BITS, t1
- stq r28, 16(rp)
- and acc1,numb_mask, r28
- subq m1b, t1, r0
- stq r28, 24(rp)
- ret r31, (r26), 1
- EPILOGUE()
- ASM_END()