aors_n.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:5k
- dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
- dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007 Free Software
- dnl Foundation, Inc.
- dnl This file is part of the GNU MP Library.
- dnl The GNU MP Library is free software; you can redistribute it and/or modify
- dnl it under the terms of the GNU Lesser General Public License as published
- dnl by the Free Software Foundation; either version 3 of the License, or (at
- dnl your option) any later version.
- dnl The GNU MP Library is distributed in the hope that it will be useful, but
- dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- dnl License for more details.
- dnl You should have received a copy of the GNU Lesser General Public License
- dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
- include(`../config.m4')
- C cycles/limb
- C POWER3/PPC630: 1.5
- C POWER4/PPC970: 2
- C n POWER3/PPC630 POWER4/PPC970
- C 1 17.00 19.00
- C 2 9.00 10.49
- C 3 5.33 7.66
- C 4 4.50 5.14
- C 5 4.20 4.80
- C 6 3.83 4.33
- C 7 3.00 3.99
- C 8 2.87 3.55
- C 9 2.89 3.40
- C 10 2.60 3.42
- C 11 2.45 3.15
- C 12 2.41 2.99
- C 13 2.46 3.01
- C 14 2.42 2.97
- C 15 2.20 2.85
- C 50 1.78 2.44
- C 100 1.83 2.20
- C 200 1.55 2.12
- C 400 1.53 2.05
- C 1000 1.98 2.02#
- C 2000 1.50# 2.04
- C 4000 2.55 2.50
- C 8000 2.70 2.45
- C 16000 2.65 5.94
- C 32000 2.62 16.41
- C 64000 2.73 18.94
- C This code is a little bit slower for POWER3/PPC630 than the simple code used
- C previously, but it is much faster for POWER4/PPC970. The reason for the
- C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4
- C registers.
- C INPUT PARAMETERS
- C rp r3
- C up r4
- C vp r5
- C n r6
- C cy r7 (carry-in, read only by the func_nc entry point)
- C Addition flavour, selected when OPERATION_add_n is defined.
- C   ADDSUBC      adde: add two limbs plus the carry bit (CA), updating CA.
- C   ADDSUB       addc: defined here but not referenced in the code below.
- C   func         name of the entry point without carry-in.
- C   func_nc      name of the entry point that takes a carry-in.
- C   GENRVAL      the return path leaves CA - 1 in r3; adding 1 turns that
- C                into the carry-out as 0 or 1.
- C   SETCBR(reg)  addic with -1 sets CA exactly when reg is nonzero, which
- C                converts a carry-in argument into CA; overwrites r0.
- C   CLRCB        addic with 0 can never carry, so CA is cleared; r0 keeps
- C                its value.
- ifdef(`OPERATION_add_n',`
- define(ADDSUBC, adde)
- define(ADDSUB, addc)
- define(func, mpn_add_n)
- define(func_nc, mpn_add_nc)
- define(GENRVAL, `addi r3, r3, 1')
- define(SETCBR, `addic r0, $1, -1')
- define(CLRCB, `addic r0, r0, 0')
- ')
- C Subtraction flavour, selected when OPERATION_sub_n is defined.
- C PowerPC subtract-from instructions compute rB + ~rA + CA, so CA = 1 means
- C "no borrow" and CA = 0 means "borrow".
- C   ADDSUBC      subfe: rD = rB - rA - borrow; as used below this is the
- C                up limb minus the vp limb.
- C   ADDSUB       subfc: defined here but not referenced in the code below.
- C   func         name of the entry point without borrow-in.
- C   func_nc      name of the entry point that takes a borrow-in.
- C   GENRVAL      the return path leaves CA - 1 in r3; negating gives 1 - CA,
- C                that is 1 when a borrow came out and 0 otherwise.
- C   SETCBR(reg)  subfic from 0 sets CA exactly when reg is zero, so a nonzero
- C                argument starts the chain with a borrow; overwrites r0.
- C   CLRCB        addic of -1 to r1 sets CA (no borrow) whenever r1 is nonzero;
- C                r1 is the save-area base used below, presumably the stack
- C                pointer and hence never zero -- TODO confirm. Overwrites r0.
- ifdef(`OPERATION_sub_n',`
- define(ADDSUBC, subfe)
- define(ADDSUB, subfc)
- define(func, mpn_sub_n)
- define(func_nc, mpn_sub_nc)
- define(GENRVAL, `neg r3, r3')
- define(SETCBR, `subfic r0, $1, 0')
- define(CLRCB, `addic r0, r1, -1')
- ')
- C The two macros below are defined outside this file (config.m4 is included
- C above). MULFUNC_PROLOGUE presumably declares the entry points this one
- C source can be built into, and ASM_START presumably emits any assembler
- C preamble -- verify against the m4 support files.
- MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
- ASM_START()
- C func_nc: entry point with an explicit carry-in (borrow-in) argument in r7.
- C It only initialises the carry bit from r7 and then joins the shared body at
- C L(ent) inside func, so it has no return path of its own. r0 is overwritten.
- PROLOGUE(func_nc)
- SETCBR(r7) C CA = initial carry state derived from r7
- b L(ent) C continue in the shared body of func
- EPILOGUE()
- C func: rp[0..n-1] = up[0..n-1] +/- vp[0..n-1], limb by limb with the carry
- C chained through the CA bit; the carry-out (borrow-out) is returned in r3.
- C
- C Register use:
- C   r3 = rp (write pointer), r4 = up, r5 = vp, r6 = n on entry, then scratch
- C   r0, r6-r12  loaded source limbs
- C   r28-r31     result limbs; saved at -8..-32(r1) on entry and reloaded
- C               before returning, without moving r1 (presumably relying on an
- C               ABI-protected area below the stack pointer -- TODO confirm)
- C   ctr         number of passes = (n + 3) >> 2
- C
- C Structure: the n mod 4 leftover limbs (1, 2 or 3) are processed first by
- C L(b01), L(b10) or L(b11); each of those counts as one ctr pass. The rest is
- C handled four limbs at a time by a software-pipelined loop: L(go) preloads
- C four limb pairs, L(oop) combines them while loading the next four, and
- C L(end) drains the final four.
- C
- C Only ADDSUBC (and the CLRCB/SETCBR at entry) touch CA; the loads, stores,
- C pointer updates (addi), count setup and ctr branches used here leave it
- C alone, which is what lets the carry flow from one ADDSUBC to the next.
- C
- C NOTE(review): n = 0 looks unsupported -- ctr would be 0 and the first bdz
- C would decrement it past zero instead of exiting. Confirm callers always
- C pass n >= 1.
- PROLOGUE(func)
- CLRCB
- C Shared entry: CA already holds the initial carry state at this point.
- L(ent): std r31, -8(r1)
- std r30, -16(r1)
- std r29, -24(r1)
- std r28, -32(r1)
- rldicl. r0, r6, 0,62 C r0 = n & 3, set cr0
- cmpdi cr6, r0, 2 C cr6 = (n & 3) compared with 2
- addi r6, r6, 3 C compute count...
- srdi r6, r6, 2 C ...for ctr
- mtctr r6 C copy count into ctr
- beq cr0, L(b00) C n mod 4 == 0: straight to the 4-limb code
- blt cr6, L(b01) C n mod 4 == 1
- beq cr6, L(b10) C n mod 4 == 2
- C Fall through: n mod 4 == 3, process three limbs.
- L(b11): ld r8, 0(r4) C load s1 limb
- ld r9, 0(r5) C load s2 limb
- ld r10, 8(r4) C load s1 limb
- ld r11, 8(r5) C load s2 limb
- ld r12, 16(r4) C load s1 limb
- addi r4, r4, 24
- ld r0, 16(r5) C load s2 limb
- addi r5, r5, 24
- ADDSUBC r29, r9, r8
- ADDSUBC r30, r11, r10
- ADDSUBC r31, r0, r12
- std r29, 0(r3)
- std r30, 8(r3)
- std r31, 16(r3)
- addi r3, r3, 24
- bdnz L(go) C more limbs remain: enter the 4-limb code
- b L(ret) C n was exactly 3
- C n mod 4 == 1, process one limb.
- L(b01): ld r12, 0(r4) C load s1 limb
- addi r4, r4, 8
- ld r0, 0(r5) C load s2 limb
- addi r5, r5, 8
- ADDSUBC r31, r0, r12 C add or subtract with carry
- std r31, 0(r3)
- addi r3, r3, 8
- bdnz L(go) C more limbs remain: enter the 4-limb code
- b L(ret) C n was exactly 1
- C n mod 4 == 2, process two limbs.
- L(b10): ld r10, 0(r4) C load s1 limb
- ld r11, 0(r5) C load s2 limb
- ld r12, 8(r4) C load s1 limb
- addi r4, r4, 16
- ld r0, 8(r5) C load s2 limb
- addi r5, r5, 16
- ADDSUBC r30, r11, r10 C add or subtract with carry
- ADDSUBC r31, r0, r12 C add or subtract with carry
- std r30, 0(r3)
- std r31, 8(r3)
- addi r3, r3, 16
- bdnz L(go) C more limbs remain: enter the 4-limb code
- b L(ret) C n was exactly 2
- C n mod 4 == 0: nothing to peel off; CA was already set up at entry.
- L(b00):
- C Preload the first group of four limb pairs.
- L(go): ld r6, 0(r4) C load s1 limb
- ld r7, 0(r5) C load s2 limb
- ld r8, 8(r4) C load s1 limb
- ld r9, 8(r5) C load s2 limb
- ld r10, 16(r4) C load s1 limb
- ld r11, 16(r5) C load s2 limb
- ld r12, 24(r4) C load s1 limb
- ld r0, 24(r5) C load s2 limb
- bdz L(end) C last group: skip the loop and just drain
- addi r4, r4, 32
- addi r5, r5, 32
- C Main loop: combine the four preloaded pairs while loading the next four.
- L(oop): ADDSUBC r28, r7, r6
- ld r6, 0(r4) C load s1 limb
- ld r7, 0(r5) C load s2 limb
- ADDSUBC r29, r9, r8
- ld r8, 8(r4) C load s1 limb
- ld r9, 8(r5) C load s2 limb
- ADDSUBC r30, r11, r10
- ld r10, 16(r4) C load s1 limb
- ld r11, 16(r5) C load s2 limb
- ADDSUBC r31, r0, r12
- ld r12, 24(r4) C load s1 limb
- ld r0, 24(r5) C load s2 limb
- std r28, 0(r3)
- addi r4, r4, 32
- std r29, 8(r3)
- addi r5, r5, 32
- std r30, 16(r3)
- std r31, 24(r3)
- addi r3, r3, 32
- bdnz L(oop) C decrement ctr and loop back
- C Drain: combine and store the final four preloaded pairs.
- L(end): ADDSUBC r28, r7, r6
- ADDSUBC r29, r9, r8
- ADDSUBC r30, r11, r10
- ADDSUBC r31, r0, r12
- std r28, 0(r3)
- std r29, 8(r3)
- std r30, 16(r3)
- std r31, 24(r3)
- C Common exit: reload the saved registers and turn CA into the return value.
- L(ret): ld r31, -8(r1)
- ld r30, -16(r1)
- ld r29, -24(r1)
- ld r28, -32(r1)
- subfe r3, r0, r0 C r3 = CA - 1, i.e. 0 if CA is set, else -1
- GENRVAL C map that to the 0 or 1 carry (borrow) result
- blr
- EPILOGUE()