sqr_diagonal.asm
dnl  SPARC v9 32-bit mpn_sqr_diagonal.

dnl  Copyright 2001, 2003 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')
C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
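C In C terms, the function squares each limb of {up,n} and stores the
C double-limb result at rp.  A minimal sketch (not the GMP source; assumes
C a 32-bit mp_limb_t for this build and the stdint.h types):
C
C	void mpn_sqr_diagonal (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    {
C	      uint64_t sq = (uint64_t) up[i] * up[i];
C	      rp[2 * i]     = (mp_limb_t) sq;          /* low half of up[i]^2  */
C	      rp[2 * i + 1] = (mp_limb_t) (sq >> 32);  /* high half of up[i]^2 */
C	    }
C	}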
C This code uses a very deep software pipeline, due to the need to move data
C back and forth between the integer registers and floating-point registers.
C
C A VIS variant of this code would make the pipeline less deep, since the
C masking now done in the integer unit could instead take place in the
C floating-point unit using the FAND instruction.  It would also be possible
C to save several cycles.
C
C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
C not much slower from the Ecache.  It would perhaps be possible to shave off
C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
C instructions used, since we have 10 memory operations per limb.  But a VIS
C variant could run three cycles faster than the corresponding non-VIS code.
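C
C For reference, the 10 memory operations per limb in the loop below are:
C one lduw and one ld to fetch the limb into the integer and FP units, one
C stx/ldd pair to move the masked halves to the FPU, two std/ldx pairs to
C move the products back, and two stw to store the result words.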
C
C This is non-pipelined code showing the algorithm:
C
C .Loop:
C	lduw	[up+0],%g4		C 00000000hhhhllll
C	sllx	%g4,16,%g3		C 0000hhhhllll0000
C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
C	stx	%g2,[%fp+80]
C	ldd	[%fp+80],%f0
C	fitod	%f0,%f4			C hi16
C	fitod	%f1,%f6			C lo16
C	ld	[up+0],%f9
C	fxtod	%f8,%f2
C	fmuld	%f2,%f4,%f4
C	fmuld	%f2,%f6,%f6
C	fdtox	%f4,%f4
C	fdtox	%f6,%f6
C	std	%f4,[%fp-24]
C	std	%f6,[%fp-16]
C	ldx	[%fp-24],%g2
C	ldx	[%fp-16],%g1
C	sllx	%g2,16,%g2
C	add	%g2,%g1,%g1
C	stw	%g1,[rp+0]
C	srlx	%g1,32,%l0
C	stw	%l0,[rp+4]
C	add	up,4,up
C	subcc	n,1,n
C	bne,pt	%icc,.Loop
C	add	rp,8,rp
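C
C The same computation expressed in C, showing why the limb is split into
C 16-bit halves: each partial product fits in 48 bits and is therefore
C exact in a double (a sketch; the variable names are illustrative, not
C taken from the GMP sources):
C
C	uint32_t u   = up[i];
C	double   d   = (double) u;                 /* exact, since u < 2^32 < 2^53 */
C	uint64_t p16 = (uint64_t) (d * (double) (u >> 16));     /* u * hi16, < 2^48 */
C	uint64_t p0  = (uint64_t) (d * (double) (u & 0xffff));  /* u * lo16, < 2^48 */
C	uint64_t sq  = (p16 << 16) + p0;           /* u*(hi16<<16) + u*lo16 = u^2 */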
define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe

ASM_START()

	TEXT
	ALIGN(4)
.Lnoll:
	.word	0

PROLOGUE(mpn_sqr_diagonal)
	save	%sp,-256,%sp

ifdef(`PIC',
`.Lpc:	rd	%pc,%o7
	ld	[%o7+.Lnoll-.Lpc],%f8',
`	sethi	%hi(.Lnoll),%g1
	ld	[%g1+%lo(.Lnoll)],%f8')

	sethi	%hi(0xffff0000),%g5
	add	%i1,-8,%i1

	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_1
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	add	%i1,4,%i1		C s1_ptr++
	stx	%g2,[%fp+80]
	ld	[%i1],%f9
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	std	%f6,[%fp-16]
	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.L1
	add	%fp, -40, %l6
.L_grt_1:
	stx	%g2,[%fp+80]
	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_2
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+72]
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4
	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	b	.L2
	add	%fp, -24, %l6
.L_grt_2:
	stx	%g2,[%fp+72]
	lduw	[%i1+8],%g4
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	fxtod	%f8,%f2
	bne,pt	%icc,.L_grt_3
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	add	%fp, 80, %l3
	fmuld	%f2,%f6,%f6
	add	%fp, -24, %l4
	ldd	[%fp+80],%f0
	add	%fp, 72, %l5
	fdtox	%f4,%f4
	b	.L3
	add	%fp, -40, %l6
.L_grt_3:
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	bne,pt	%icc,.L_grt_4
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	fitod	%f1,%f6
	add	%fp, 72, %l3
	fmuld	%f2,%f4,%f4
	add	%fp, -40, %l4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	add	%fp, 80, %l5
	fdtox	%f4,%f4
	b	.L4
	add	%fp, -24, %l6

.L_grt_4:
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
	std	%f6,[%fp-32]
	be,pn	%icc,.L5
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	b,a	.Loop
	.align	16
C --- LOOP BEGIN
.Loop:	nop
	nop
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-24],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-16],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-16]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	be,pn	%icc,.Lend
	fanop
C --- LOOP MIDDLE
	nop
	nop
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-40],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-32],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-32]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	bne,pt	%icc,.Loop
	fanop
C --- LOOP END
.L5:	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.Ltail
	add	%fp, -40, %l6

.Lend:	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	add	%fp, -24, %l6

.Ltail:	stx	%g2,[%l3]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%l5],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]

.L4:	fdtox	%f6,%f6
	std	%f4,[%l4]
	fxtod	%f8,%f2
	std	%f6,[%l4+8]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	ldd	[%l3],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]

.L3:	fdtox	%f6,%f6
	std	%f4,[%l6]
	fxtod	%f8,%f2
	std	%f6,[%l6+8]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]

.L2:	fdtox	%f6,%f6
	std	%f4,[%l4]
	std	%f6,[%l4+8]
	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

.L1:	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

	ret
	restore	%g0,%g0,%o0
EPILOGUE(mpn_sqr_diagonal)