divrem_1.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:6k
- dnl PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.
- dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
- dnl This file is part of the GNU MP Library.
- dnl The GNU MP Library is free software; you can redistribute it and/or modify
- dnl it under the terms of the GNU Lesser General Public License as published
- dnl by the Free Software Foundation; either version 3 of the License, or (at
- dnl your option) any later version.
- dnl The GNU MP Library is distributed in the hope that it will be useful, but
- dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- dnl License for more details.
- dnl You should have received a copy of the GNU Lesser General Public License
- dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
- include(`../config.m4')
- C cycles/limb
- C norm unorm frac
- C POWER3/PPC630 16-34 16-34 ~11
- C POWER4/PPC970 29 19
- C POWER5 29 29 ~20
- C INPUT PARAMETERS
- C qp = r3
- C fn = r4
- C up = r5
- C un = r6
- C d = r7
- C The frac code uses a branch that is not very predictable, so its cycle
- C count varies somewhat. The alternative branch-free code runs considerably
- C slower on POWER4/PPC970 and POWER5.
- C TODO: Add a preinv entry point.
- ASM_START()
- EXTERN_FUNC(mpn_invert_limb)
- C mpn_divrem_1 -- divide the un-limb number at up by the single limb d,
- C then produce fn more quotient limbs using zero numerator limbs (the
- C frac loops). Quotient limbs are stored downward from qp[un+fn-1].
- C The remainder is returned in r3.
- C
- C Register roles after the prologue:
- C   r25 = fn                     r26 = up
- C   r27 = normalization shift (unnormalized path only)
- C   r28 = integer limbs still to divide
- C   r29 = address of the next quotient limb to store
- C   r30 = d (shifted left by r27 on the unnormalized path)
- C   r31 = running remainder r
- C   r3  = value returned by mpn_invert_limb once it has been called;
- C         the loops multiply r by it to estimate each quotient limb
- C
- C Control flow: L(162) handles d whose top bit is already set. The
- C fall-through path first shifts d and the numerator left by cntlzd(d)
- C and shifts the remainder back right before returning.
- PROLOGUE(mpn_divrem_1)
- mfcr r12 C r12 = condition register, stored at 8(r1) below
- add. r10, r6, r4 C r10 = un + fn, cr0 is tested at the beq below
- std r25, -56(r1) C r25-r31 are saved below the incoming stack pointer
- mr r25, r4 C r25 = fn
- mflr r0 C r0 = return address, stored at 16(r1) below
- std r26, -48(r1)
- mr r26, r5 C r26 = up
- std r28, -32(r1)
- mr r28, r6 C r28 = un
- std r29, -24(r1)
- mr r29, r3 C r29 = qp
- li r3, 0 C return value if un + fn == 0
- std r30, -16(r1)
- mr r30, r7 C r30 = d
- std r31, -8(r1)
- li r31, 0 C remainder starts at 0
- std r27, -40(r1)
- std r0, 16(r1)
- stw r12, 8(r1)
- stdu r1, -176(r1) C allocate a 176-byte frame
- beq- cr0, L(1) C un + fn == 0: nothing to do, return 0
- cmpdi cr7, r7, 0 C signed test: d < 0 means the top bit of d is set
- sldi r0, r10, 3
- add r11, r0, r29 C r11 = qp + 8*(un+fn), one past the top quotient limb
- addi r29, r11, -8 C r29 = address of the top quotient limb
- blt- cr7, L(162) C top bit of d set: no shifting needed
- cmpdi cr4, r6, 0 C cr4 = (un == 0), tested again after the call
- beq+ cr4, L(71)
- C un > 0: if the top limb up[un-1] is below d, the top quotient limb is 0
- C and that limb becomes the initial remainder.
- L(163):
- sldi r9, r6, 3
- add r9, r9, r5
- ld r7, -8(r9) C r7 = up[un-1]
- cmpld cr7, r7, r30
- bge- cr7, L(71) C top limb >= d: divide it like any other limb
- cmpdi cr7, r10, 1 C is this the only quotient limb?
- li r0, 0
- mr r31, r7 C remainder = up[un-1]
- std r0, -8(r11) C top quotient limb = 0
- addi r29, r29, -8
- mr r3, r7 C return value if we leave at the next branch
- beq- cr7, L(1) C un + fn == 1: done
- addi r28, r6, -1 C one integer limb consumed
- cmpdi cr4, r28, 0 C refresh cr4 = (no integer limbs left)
- C Normalize: shift d and the remainder left by the leading zero count of d.
- L(71):
- cntlzd r27, r30 C r27 = number of leading zero bits in d
- sld r30, r30, r27
- sld r31, r31, r27
- mr r3, r30 C argument for the call: the shifted d
- CALL( mpn_invert_limb)
- nop C NOTE(review): presumably the slot kept after a call for the TOC reload -- confirm
- beq- cr4, L(110) C no integer limbs left: go to the fraction limbs
- sldi r9, r28, 3
- addic. r6, r28, -2 C r6 = remaining limbs - 2, cr0 is tested at the blt below
- add r9, r9, r26
- subfic r5, r27, 64 C r5 = 64 - shift
- ld r8, -8(r9) C r8 = highest remaining limb
- srd r0, r8, r5
- or r31, r31, r0 C bring its top bits into the remainder
- sld r7, r8, r27 C r7 = its other bits, the high part of the next numerator limb
- blt- cr0, L(154) C only one limb left: skip the loop
- addi r28, r28, -1
- mtctr r28 C loop count = remaining limbs - 1
- sldi r6, r6, 3 C r6 = byte offset of the next limb to load
- ALIGN(16)
- C Unnormalized integer loop. Each pass forms the next shifted numerator
- C limb in r9, estimates the quotient limb from r31 * r3, then corrects
- C the quotient limb and the remainder.
- L(uloop):
- addi r11, r31, 1 C r11 = r + 1
- ldx r8, r26, r6 C r8 = next lower limb of up
- mulld r0, r31, r3 C ql = low half of r * r3
- mulhdu r10, r31, r3 C qh = high half of r * r3
- addi r6, r6, -8
- srd r9, r8, r5
- or r9, r7, r9 C r9 = next shifted numerator limb nl
- addc r0, r0, r9 C ql += nl
- adde r10, r10, r11 C qh += r + 1 + carry
- mulld r31, r10, r30
- subf r31, r31, r9 C r = nl - qh * d
- subfc r0, r0, r31 C carry = (r >= ql)
- subfe r0, r0, r0 C r0 = -(r < ql)
- not r7, r0 C r7 = -(r >= ql)
- add r10, r7, r10 C qh -= (r >= ql)
- andc r0, r30, r0 C r0 = d if r >= ql, else 0
- add r31, r31, r0 C r += d if r >= ql
- cmpld cr7, r31, r30
- bge- cr7, L(164) C r >= d: fix up out of line
- L(123):
- std r10, 0(r29) C store the quotient limb
- addi r29, r29, -8
- sld r7, r8, r27 C keep the low bits of this limb for the next pass
- bdnz L(uloop)
- C Last integer limb: nothing further to shift in, so the numerator limb
- C is r7 alone. Same steps as the loop body, with qh held in r8.
- L(154):
- addi r11, r31, 1
- nop
- mulld r0, r31, r3
- mulhdu r8, r31, r3
- addc r0, r0, r7
- adde r8, r8, r11
- mulld r31, r8, r30
- subf r31, r31, r7
- subfc r0, r0, r31 C carry = (r >= ql)
- subfe r0, r0, r0 C r0 = -(r < ql)
- not r7, r0 C r7 = -(r >= ql)
- add r8, r7, r8 C qh -= (r >= ql)
- andc r0, r30, r0
- add r31, r31, r0
- cmpld cr7, r31, r30
- bge- cr7, L(165) C r >= d: fix up out of line
- L(134):
- std r8, 0(r29)
- addi r29, r29, -8
- C Fraction limbs, unnormalized path: fn quotient limbs, numerator limb 0.
- L(110):
- addic. r0, r25, -1
- blt- cr0, L(156) C fn - 1 < 0: no fraction limbs
- mtctr r25 C loop count = fn
- neg r9, r30 C r9 = -d
- ALIGN(16)
- L(ufloop):
- addi r11, r31, 1
- nop
- mulld r7, r3, r31 C ql = low half of r * r3
- mulhdu r10, r3, r31 C qh = high half of r * r3
- add r10, r10, r11 C qh += r + 1
- mulld r31, r9, r10 C r = 0 - qh * d
- C The m4 test below compares 0 with 1, so the second variant is the one
- C assembled: skip when r < ql, otherwise add d to r and subtract 1 from qh.
- C The first variant is the branch-free form of the same correction.
- ifelse(0,1,`
- subfc r0, r7, r31
- subfe r0, r0, r0 C r0 = -(r < ql)
- not r7, r0
- add r10, r7, r10 C qh -= (r >= ql)
- andc r0, r30, r0
- add r31, r31, r0
- ',`
- cmpld cr7, r31, r7
- blt cr7, L(29)
- add r31, r30, r31
- addi r10, r10, -1
- L(29):
- ')
- std r10, 0(r29)
- addi r29, r29, -8
- bdnz L(ufloop)
- L(156):
- srd r3, r31, r27 C undo the normalization shift: r3 = remainder
- C Common exit of the unnormalized path and of the early returns.
- L(1):
- addi r1, r1, 176 C release the frame
- ld r0, 16(r1)
- lwz r12, 8(r1)
- mtlr r0
- ld r25, -56(r1)
- ld r26, -48(r1)
- mtcrf 8, r12 C restore cr4 from the saved condition register word
- ld r27, -40(r1)
- ld r28, -32(r1)
- ld r29, -24(r1)
- ld r30, -16(r1)
- ld r31, -8(r1)
- blr
- C Normalized path: the top bit of d is set. If un > 0, the top quotient
- C limb is 1 when up[un-1] >= d and 0 otherwise, and d is subtracted from
- C the initial remainder in the first case.
- L(162):
- cmpdi cr7, r6, 0
- beq- cr7, L(8) C un == 0: no integer limbs, remainder stays 0
- sldi r9, r6, 3
- addi r29, r29, -8
- add r9, r9, r5
- addi r28, r6, -1 C the top limb is handled here
- ld r31, -8(r9) C r31 = up[un-1]
- subfc r9, r7, r31 C only the carry is used: carry = (up[un-1] >= d)
- li r9, 0
- adde r9, r9, r9 C r9 = carry = top quotient limb
- neg r0, r9 C r0 = all ones if the quotient limb is 1, else 0
- std r9, -8(r11) C store the top quotient limb
- and r0, r0, r7 C r0 = d or 0
- subf r31, r0, r31 C r = up[un-1] - (d or 0)
- L(8):
- L(10):
- mr r3, r30 C argument for the call: d
- CALL( mpn_invert_limb)
- nop C NOTE(review): presumably the slot kept after a call for the TOC reload -- confirm
- addic. r6, r28, -1
- blt- cr0, L(150) C no integer limbs left: go to the fraction limbs
- mtctr r28 C loop count = remaining integer limbs
- sldi r6, r6, 3 C r6 = byte offset of the next limb to load
- ALIGN(16)
- C Normalized integer loop: as L(uloop), but each limb of up is used
- C directly as the numerator limb.
- L(nloop):
- addi r11, r31, 1 C r11 = r + 1
- ldx r8, r26, r6 C nl = next lower limb of up
- mulld r0, r31, r3
- addi r6, r6, -8
- mulhdu r10, r31, r3 C qh = high half of r * r3
- addc r7, r0, r8 C ql = low half of r * r3, plus nl
- adde r10, r10, r11 C qh += r + 1 + carry
- mulld r31, r10, r30
- subf r31, r31, r8 C r = nl - qh * d
- subfc r0, r7, r31 C carry = (r >= ql)
- subfe r0, r0, r0 C r0 = -(r < ql)
- not r7, r0 C r7 = -(r >= ql)
- add r10, r7, r10 C qh -= (r >= ql)
- andc r0, r30, r0 C r0 = d if r >= ql, else 0
- add r31, r31, r0
- cmpld cr7, r31, r30
- bge- cr7, L(167) C r >= d: fix up out of line
- L(51):
- std r10, 0(r29)
- addi r29, r29, -8
- bdnz L(nloop)
- C Fraction limbs, normalized path: same steps as L(ufloop).
- L(150):
- addic. r9, r25, -1
- blt- cr0, L(152) C fn - 1 < 0: no fraction limbs
- mtctr r25 C loop count = fn
- neg r9, r30 C r9 = -d
- ALIGN(16)
- L(nfloop):
- addi r11, r31, 1
- nop
- mulld r7, r3, r31 C ql = low half of r * r3
- mulhdu r10, r3, r31 C qh = high half of r * r3
- add r10, r10, r11 C qh += r + 1
- mulld r31, r9, r10 C r = 0 - qh * d
- C As in L(ufloop), the m4 test selects the second, branching variant.
- ifelse(0,1,`
- subfc r0, r7, r31
- subfe r0, r0, r0 C r0 = -(r < ql)
- not r7, r0
- add r10, r7, r10 C qh -= (r >= ql)
- andc r0, r30, r0
- add r31, r31, r0
- ',`
- cmpld cr7, r31, r7
- blt cr7, L(28)
- add r31, r30, r31
- addi r10, r10, -1
- L(28):
- ')
- std r10, 0(r29)
- addi r29, r29, -8
- bdnz L(nfloop)
- C Exit of the normalized path: the remainder is returned unshifted.
- L(152):
- addi r1, r1, 176 C release the frame
- mr r3, r31 C r3 = remainder
- ld r0, 16(r1)
- lwz r12, 8(r1)
- mtlr r0
- ld r25, -56(r1)
- ld r26, -48(r1)
- mtcrf 8, r12 C restore cr4 from the saved condition register word
- ld r27, -40(r1)
- ld r28, -32(r1)
- ld r29, -24(r1)
- ld r30, -16(r1)
- ld r31, -8(r1)
- blr
- C Out-of-line corrections for the case r >= d after the first adjustment:
- C subtract d from the remainder, add 1 to the quotient limb, and resume.
- L(164):
- subf r31, r30, r31
- addi r10, r10, 1
- b L(123)
- L(167):
- subf r31, r30, r31
- addi r10, r10, 1
- b L(51)
- L(165):
- subf r31, r30, r31
- addi r8, r8, 1
- b L(134)
- EPILOGUE()