sqr_diagonal.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:4k
- dnl HP-PA 2.0 64-bit mpn_sqr_diagonal.
- dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
- dnl This file is part of the GNU MP Library.
- dnl The GNU MP Library is free software; you can redistribute it and/or modify
- dnl it under the terms of the GNU Lesser General Public License as published
- dnl by the Free Software Foundation; either version 3 of the License, or (at
- dnl your option) any later version.
- dnl The GNU MP Library is distributed in the hope that it will be useful, but
- dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- dnl License for more details.
- dnl You should have received a copy of the GNU Lesser General Public License
- dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
- dnl This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
- dnl PA8500. The cache would saturate at 5 cycles/limb, so there is some room
- dnl for optimization.
- include(`../config.m4')
- C INPUT PARAMETERS
- C rp, up and n name the argument registers: the code below stores results
- C through rp, loads source limbs through up and counts n down to zero.
- define(`rp',`%r26')
- define(`up',`%r25')
- define(`n',`%r24')
- C Integer scratch registers for the fix-up step of the code below.
- C p00 and p64 hold the low and high result limbs reloaded from rp, p32 holds
- C the cross product of the two 32-bit halves of a source limb, and t0 and t1
- C hold the shifted pieces of p32 that are added into p00 and p64.
- define(`p00',`%r28')
- define(`p32',`%r29')
- define(`p64',`%r31')
- define(`t0',`%r19')
- define(`t1',`%r20')
- C Emit the assembler level directive: 2.0w when HAVE_ABI_2_0w is defined,
- C otherwise 2.0.
- ifdef(`HAVE_ABI_2_0w',
- ` .level 2.0w
- ',` .level 2.0
- ')
- C mpn_sqr_diagonal
- C
- C For each of the n 64-bit limbs read through up, store the 128-bit square of
- C that limb as two limbs through rp (low limb first), so rp advances 16 bytes
- C for every 8 bytes consumed from up.
- C
- C The first limb is loaded before n is tested, so n >= 1 is assumed.
- C
- C Method: a limb is split into its 32-bit halves, hi (the l half of the fp
- C register) and lo (the r half). Three xmpyu products are formed:
- C   lo*lo  stored directly as the low result limb,
- C   hi*hi  stored directly as the high result limb,
- C   hi*lo  the cross product, parked in a stack slot.
- C A later integer fix-up reloads the two result limbs and adds twice the cross
- C product shifted left by 32 bits, with the carry propagated into the high
- C limb.
- C
- C The two stack slots -120(%r30) and -128(%r30) alternate between consecutive
- C limbs, as do the fp register sets fr8/fr9/fr10/fr11 and fr4/fr5/fr6/fr7,
- C so the multiplies of one limb overlap the fix-up of the previous limb.
- PROLOGUE(mpn_sqr_diagonal)
- ldo 128(%r30),%r30 C reserve 128 bytes of stack for the two scratch slots
- fldds,ma 8(up),%fr8 C fr8 = first limb, up += 8
- addib,= -1,n,L(end1) C n -= 1, a single limb is finished off at end1
- nop C branch delay slot
- fldds,ma 8(up),%fr4 C fr4 = second limb, up += 8
- xmpyu %fr8l,%fr8r,%fr10 C cross product hi*lo of the first limb
- fstd %fr10,-120(%r30) C park it in slot -120
- xmpyu %fr8r,%fr8r,%fr9 C lo*lo
- fstd %fr9,0(rp) C becomes the low result limb
- xmpyu %fr8l,%fr8l,%fr11 C hi*hi
- fstd %fr11,8(rp) C becomes the high result limb
- addib,= -1,n,L(end2) C n -= 1, exactly two limbs are finished off at end2
- ldo 16(rp),rp C delay slot: rp += 16, previous limb pair is now at -16(rp)
- C Main loop, unrolled two ways. Each half loads the next limb, multiplies the
- C current limb, and fixes up the result of the previous limb at -16(rp).
- C First half: current limb in fr4, previous cross product in slot -120.
- LDEF(loop)
- fldds,ma 8(up),%fr8 C load next up limb
- xmpyu %fr4l,%fr4r,%fr6
- fstd %fr6,-128(%r30) C cross product of the current limb goes to slot -128
- xmpyu %fr4r,%fr4r,%fr5 C multiply in fp regs
- fstd %fr5,0(rp)
- xmpyu %fr4l,%fr4l,%fr7
- fstd %fr7,8(rp)
- ldd -120(%r30),p32 C cross product of the previous limb
- ldd -16(rp),p00 C accumulate in int regs
- ldd -8(rp),p64
- depd,z p32,30,31,t0 C t0 = low 31 bits of p32 shifted left by 33
- add t0,p00,p00 C add into the low limb, sets carry
- std p00,-16(rp)
- extrd,u p32,32,33,t1 C t1 = p32 shifted right by 31
- add,dc t1,p64,p64 C add into the high limb together with the carry
- std p64,-8(rp)
- addib,= -1,n,L(exit) C n -= 1, leave when no further limb is to be loaded
- ldo 16(rp),rp C delay slot: rp += 16
- C Second half: same steps with the roles swapped, current limb in fr8,
- C previous cross product in slot -128.
- fldds,ma 8(up),%fr4
- xmpyu %fr8l,%fr8r,%fr10
- fstd %fr10,-120(%r30)
- xmpyu %fr8r,%fr8r,%fr9
- fstd %fr9,0(rp)
- xmpyu %fr8l,%fr8l,%fr11
- fstd %fr11,8(rp)
- ldd -128(%r30),p32
- ldd -16(rp),p00
- ldd -8(rp),p64
- depd,z p32,30,31,t0
- add t0,p00,p00
- std p00,-16(rp)
- extrd,u p32,32,33,t1
- add,dc t1,p64,p64
- std p64,-8(rp)
- addib,<> -1,n,L(loop) C n -= 1, loop while limbs remain, else fall into end2
- ldo 16(rp),rp C delay slot: rp += 16
- C Wind-down when the last loaded limb is in fr4 and the cross product of the
- C limb before it is in slot -120. Reached from the prologue when n is 2 and by
- C falling out of the second loop half.
- LDEF(end2)
- xmpyu %fr4l,%fr4r,%fr6
- fstd %fr6,-128(%r30)
- xmpyu %fr4r,%fr4r,%fr5
- fstd %fr5,0(rp)
- xmpyu %fr4l,%fr4l,%fr7
- fstd %fr7,8(rp)
- ldd -120(%r30),p32 C fix up the second to last limb
- ldd -16(rp),p00
- ldd -8(rp),p64
- depd,z p32,30,31,t0
- add t0,p00,p00
- std p00,-16(rp)
- extrd,u p32,32,33,t1
- add,dc t1,p64,p64
- std p64,-8(rp)
- ldo 16(rp),rp C step past the last limb pair so it sits at -16(rp)
- ldd -128(%r30),p32 C fix up the last limb
- ldd -16(rp),p00
- ldd -8(rp),p64
- depd,z p32,30,31,t0
- add t0,p00,p00
- std p00,-16(rp)
- extrd,u p32,32,33,t1
- add,dc t1,p64,p64
- std p64,-8(rp)
- bve (%r2) C return through the address in %r2
- ldo -128(%r30),%r30 C delay slot: release the stack scratch area
- C Wind-down when the last loaded limb is in fr8 and the cross product of the
- C limb before it is in slot -128. Reached from the first loop half.
- C The fix-up here adds the cross product shifted left by 32 two times, which
- C gives the same sum as the single shift-by-33 addition used in the loop.
- LDEF(exit)
- xmpyu %fr8l,%fr8r,%fr10
- fstd %fr10,-120(%r30)
- xmpyu %fr8r,%fr8r,%fr9
- fstd %fr9,0(rp)
- xmpyu %fr8l,%fr8l,%fr11
- fstd %fr11,8(rp)
- ldd -128(%r30),p32 C fix up the second to last limb
- ldd -16(rp),p00
- ldd -8(rp),p64
- depd,z p32,31,32,t0 C t0 = low 32 bits of p32 shifted left by 32
- add t0,p00,p00 C first addition, low limb
- extrd,u p32,31,32,t1 C t1 = high 32 bits of p32
- add,dc t1,p64,p64 C first addition, high limb plus carry
- add t0,p00,p00 C second addition, low limb
- add,dc t1,p64,p64 C second addition, high limb plus carry
- std p00,-16(rp)
- std p64,-8(rp)
- ldo 16(rp),rp C step past the last limb pair so it sits at -16(rp)
- ldd -120(%r30),p32 C fix up the last limb
- ldd -16(rp),p00
- ldd -8(rp),p64
- depd,z p32,31,32,t0
- add t0,p00,p00
- extrd,u p32,31,32,t1
- add,dc t1,p64,p64
- add t0,p00,p00
- add,dc t1,p64,p64
- std p00,-16(rp)
- std p64,-8(rp)
- bve (%r2) C return through the address in %r2
- ldo -128(%r30),%r30 C delay slot: release the stack scratch area
- C Single limb case: the only limb is in fr8. Multiply it, then apply the
- C doubled cross product with the same add-twice sequence as at exit.
- LDEF(end1)
- xmpyu %fr8l,%fr8r,%fr10
- fstd %fr10,-128(%r30)
- xmpyu %fr8r,%fr8r,%fr9
- fstd %fr9,0(rp)
- xmpyu %fr8l,%fr8l,%fr11
- fstd %fr11,8(rp)
- ldo 16(rp),rp C the limb pair just stored is now at -16(rp)
- ldd -128(%r30),p32
- ldd -16(rp),p00
- ldd -8(rp),p64
- depd,z p32,31,32,t0
- add t0,p00,p00
- extrd,u p32,31,32,t1
- add,dc t1,p64,p64
- add t0,p00,p00
- add,dc t1,p64,p64
- std p00,-16(rp)
- std p64,-8(rp)
- bve (%r2) C return through the address in %r2
- ldo -128(%r30),%r30 C delay slot: release the stack scratch area
- EPILOGUE(mpn_sqr_diagonal)