mul_1.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:16k
- dnl SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
- dnl the result in a second limb vector.
- dnl Copyright 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
- dnl This file is part of the GNU MP Library.
- dnl The GNU MP Library is free software; you can redistribute it and/or modify
- dnl it under the terms of the GNU Lesser General Public License as published
- dnl by the Free Software Foundation; either version 3 of the License, or (at
- dnl your option) any later version.
- dnl The GNU MP Library is distributed in the hope that it will be useful, but
- dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- dnl License for more details.
- dnl You should have received a copy of the GNU Lesser General Public License
- dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
- include(`../config.m4')
- C cycles/limb
- C UltraSPARC 1&2: 14
- C UltraSPARC 3: 18.5
- C Algorithm: We use eight floating-point multiplies per limb product, with the
- C invariant v operand split into four 16-bit pieces, and the s1 operand split
- C into 32-bit pieces. We sum pairs of 48-bit partial products using
- C floating-point add, then convert the four 49-bit product-sums and transfer
- C them to the integer unit.
- C Possible optimizations:
- C 1. Align the stack area where we transfer the four 49-bit product-sums
- C to a 32-byte boundary. That would minimize the cache collision.
- C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
- C be to align the area to map to the area immediately before s1?)
- C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the
- C develop mpn_addmul_2. This would save many integer instructions.
- C 3. Unrolling. Questionable if it is worth the code expansion, given that
- C it could only save 1 cycle/limb.
- C 4. Specialize for particular v values. If its upper 32 bits are zero, we
- C could save many operations, in the FPU (fmuld), but more so in the IEU
- C since we'll be summing 48-bit quantities, which might be simpler.
- C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
- C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should
- C not be greater than needed for L2 cache latency, and also not so great
- C that i16 needs to be copied.
- C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
- C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU
- C ops.)
- C Instruction classification (as per UltraSPARC-1/2 functional units):
- C 8 FM
- C 10 FA
- C 11 MEM
- C 9 ISHIFT + 10? IADDLOG
- C 1 BRANCH
- C 49 insns totally (plus three mov insns that should be optimized out)
- C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e we
- C sustain 3.79 instructions/cycle.
- C INPUT PARAMETERS
- C rp i0
- C up i1
- C n i2
- C v i3
- ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
- define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
- define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
- define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
- define(`u00',`%f32') define(`u32', `%f34')
- define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
- define(`cy',`%g1')
- define(`rlimb',`%g3')
- define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
- define(`xffffffff',`%l7')
- define(`xffff',`%o0')
- PROLOGUE(mpn_mul_1)
- C Initialization. (1) Split v operand into four 16-bit chunks and store them
- C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
- C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
- save %sp, -256, %sp
- mov -1, %g4
- srlx %g4, 48, xffff C store mask in register `xffff'
- and %i3, xffff, %g2
- stx %g2, [%sp+2223+0]
- srlx %i3, 16, %g3
- and %g3, xffff, %g3
- stx %g3, [%sp+2223+8]
- srlx %i3, 32, %g2
- and %g2, xffff, %g2
- stx %g2, [%sp+2223+16]
- srlx %i3, 48, %g3
- stx %g3, [%sp+2223+24]
- srlx %g4, 32, xffffffff C store mask in register `xffffffff'
- sllx %i2, 3, %i2
- mov 0, cy C clear cy
- add %i0, %i2, %i0
- add %i1, %i2, %i1
- neg %i2
- add %i1, 4, %i5
- add %i0, -32, %i4
- add %i0, -16, %i0
- ldd [%sp+2223+0], v00
- ldd [%sp+2223+8], v16
- ldd [%sp+2223+16], v32
- ldd [%sp+2223+24], v48
- ld [%sp+2223+0],%f2 C zero f2
- ld [%sp+2223+0],%f4 C zero f4
- ld [%i5+%i2], %f3 C read low 32 bits of up[i]
- ld [%i1+%i2], %f5 C read high 32 bits of up[i]
- fxtod v00, v00
- fxtod v16, v16
- fxtod v32, v32
- fxtod v48, v48
- C Start real work. (We sneakingly read f3 and f5 above...)
- C The software pipeline is very deep, requiring 4 feed-in stages.
- fxtod %f2, u00
- fxtod %f4, u32
- fmuld u00, v00, a00
- fmuld u00, v16, a16
- fmuld u00, v32, p32
- fmuld u32, v00, r32
- fmuld u00, v48, p48
- addcc %i2, 8, %i2
- bnz,pt %icc, .L_two_or_more
- fmuld u32, v16, r48
- .L_one:
- fmuld u32, v32, r64 C FIXME not urgent
- faddd p32, r32, a32
- fdtox a00, a00
- faddd p48, r48, a48
- fmuld u32, v48, r80 C FIXME not urgent
- fdtox a16, a16
- fdtox a32, a32
- fdtox a48, a48
- std a00, [%sp+2223+0]
- std a16, [%sp+2223+8]
- std a32, [%sp+2223+16]
- std a48, [%sp+2223+24]
- add %i2, 8, %i2
- fdtox r64, a00
- fdtox r80, a16
- ldx [%sp+2223+0], i00
- ldx [%sp+2223+8], i16
- ldx [%sp+2223+16], i32
- ldx [%sp+2223+24], i48
- std a00, [%sp+2223+0]
- std a16, [%sp+2223+8]
- add %i2, 8, %i2
- mov i00, %g5 C i00+ now in g5
- ldx [%sp+2223+0], i00
- srlx i16, 48, %l4 C (i16 >> 48)
- mov i16, %g2
- ldx [%sp+2223+8], i16
- srlx i48, 16, %l5 C (i48 >> 16)
- mov i32, %g4 C i32+ now in g4
- sllx i48, 32, %l6 C (i48 << 32)
- srlx %g4, 32, %o3 C (i32 >> 32)
- add %l5, %l4, %o1 C hi64- in %o1
- std a00, [%sp+2223+0]
- sllx %g4, 16, %o2 C (i32 << 16)
- add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
- std a16, [%sp+2223+8]
- sllx %o1, 48, %o3 C (hi64 << 48)
- add %g2, %o2, %o2 C mi64- in %o2
- add %l6, %o2, %o2 C mi64- in %o2
- sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
- add cy, %g5, %o4 C x = prev(i00) + cy
- b .L_out_1
- add %i2, 8, %i2
- .L_two_or_more:
- ld [%i5+%i2], %f3 C read low 32 bits of up[i]
- fmuld u32, v32, r64 C FIXME not urgent
- faddd p32, r32, a32
- ld [%i1+%i2], %f5 C read high 32 bits of up[i]
- fdtox a00, a00
- faddd p48, r48, a48
- fmuld u32, v48, r80 C FIXME not urgent
- fdtox a16, a16
- fdtox a32, a32
- fxtod %f2, u00
- fxtod %f4, u32
- fdtox a48, a48
- std a00, [%sp+2223+0]
- fmuld u00, v00, p00
- std a16, [%sp+2223+8]
- fmuld u00, v16, p16
- std a32, [%sp+2223+16]
- fmuld u00, v32, p32
- std a48, [%sp+2223+24]
- faddd p00, r64, a00
- fmuld u32, v00, r32
- faddd p16, r80, a16
- fmuld u00, v48, p48
- addcc %i2, 8, %i2
- bnz,pt %icc, .L_three_or_more
- fmuld u32, v16, r48
- .L_two:
- fmuld u32, v32, r64 C FIXME not urgent
- faddd p32, r32, a32
- fdtox a00, a00
- faddd p48, r48, a48
- fmuld u32, v48, r80 C FIXME not urgent
- fdtox a16, a16
- ldx [%sp+2223+0], i00
- fdtox a32, a32
- ldx [%sp+2223+8], i16
- ldx [%sp+2223+16], i32
- ldx [%sp+2223+24], i48
- fdtox a48, a48
- std a00, [%sp+2223+0]
- std a16, [%sp+2223+8]
- std a32, [%sp+2223+16]
- std a48, [%sp+2223+24]
- add %i2, 8, %i2
- fdtox r64, a00
- mov i00, %g5 C i00+ now in g5
- fdtox r80, a16
- ldx [%sp+2223+0], i00
- srlx i16, 48, %l4 C (i16 >> 48)
- mov i16, %g2
- ldx [%sp+2223+8], i16
- srlx i48, 16, %l5 C (i48 >> 16)
- mov i32, %g4 C i32+ now in g4
- ldx [%sp+2223+16], i32
- sllx i48, 32, %l6 C (i48 << 32)
- ldx [%sp+2223+24], i48
- srlx %g4, 32, %o3 C (i32 >> 32)
- add %l5, %l4, %o1 C hi64- in %o1
- std a00, [%sp+2223+0]
- sllx %g4, 16, %o2 C (i32 << 16)
- add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
- std a16, [%sp+2223+8]
- sllx %o1, 48, %o3 C (hi64 << 48)
- add %g2, %o2, %o2 C mi64- in %o2
- add %l6, %o2, %o2 C mi64- in %o2
- sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
- add cy, %g5, %o4 C x = prev(i00) + cy
- b .L_out_2
- add %i2, 8, %i2
- .L_three_or_more:
- ld [%i5+%i2], %f3 C read low 32 bits of up[i]
- fmuld u32, v32, r64 C FIXME not urgent
- faddd p32, r32, a32
- ld [%i1+%i2], %f5 C read high 32 bits of up[i]
- fdtox a00, a00
- faddd p48, r48, a48
- fmuld u32, v48, r80 C FIXME not urgent
- fdtox a16, a16
- ldx [%sp+2223+0], i00
- fdtox a32, a32
- ldx [%sp+2223+8], i16
- fxtod %f2, u00
- ldx [%sp+2223+16], i32
- fxtod %f4, u32
- ldx [%sp+2223+24], i48
- fdtox a48, a48
- std a00, [%sp+2223+0]
- fmuld u00, v00, p00
- std a16, [%sp+2223+8]
- fmuld u00, v16, p16
- std a32, [%sp+2223+16]
- fmuld u00, v32, p32
- std a48, [%sp+2223+24]
- faddd p00, r64, a00
- fmuld u32, v00, r32
- faddd p16, r80, a16
- fmuld u00, v48, p48
- addcc %i2, 8, %i2
- bnz,pt %icc, .L_four_or_more
- fmuld u32, v16, r48
- .L_three:
- fmuld u32, v32, r64 C FIXME not urgent
- faddd p32, r32, a32
- fdtox a00, a00
- faddd p48, r48, a48
- mov i00, %g5 C i00+ now in g5
- fmuld u32, v48, r80 C FIXME not urgent
- fdtox a16, a16
- ldx [%sp+2223+0], i00
- fdtox a32, a32
- srlx i16, 48, %l4 C (i16 >> 48)
- mov i16, %g2
- ldx [%sp+2223+8], i16
- srlx i48, 16, %l5 C (i48 >> 16)
- mov i32, %g4 C i32+ now in g4
- ldx [%sp+2223+16], i32
- sllx i48, 32, %l6 C (i48 << 32)
- ldx [%sp+2223+24], i48
- fdtox a48, a48
- srlx %g4, 32, %o3 C (i32 >> 32)
- add %l5, %l4, %o1 C hi64- in %o1
- std a00, [%sp+2223+0]
- sllx %g4, 16, %o2 C (i32 << 16)
- add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
- std a16, [%sp+2223+8]
- sllx %o1, 48, %o3 C (hi64 << 48)
- add %g2, %o2, %o2 C mi64- in %o2
- std a32, [%sp+2223+16]
- add %l6, %o2, %o2 C mi64- in %o2
- std a48, [%sp+2223+24]
- sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
- add cy, %g5, %o4 C x = prev(i00) + cy
- b .L_out_3
- add %i2, 8, %i2
- .L_four_or_more:
- ld [%i5+%i2], %f3 C read low 32 bits of up[i]
- fmuld u32, v32, r64 C FIXME not urgent
- faddd p32, r32, a32
- ld [%i1+%i2], %f5 C read high 32 bits of up[i]
- fdtox a00, a00
- faddd p48, r48, a48
- mov i00, %g5 C i00+ now in g5
- fmuld u32, v48, r80 C FIXME not urgent
- fdtox a16, a16
- ldx [%sp+2223+0], i00
- fdtox a32, a32
- srlx i16, 48, %l4 C (i16 >> 48)
- mov i16, %g2
- ldx [%sp+2223+8], i16
- fxtod %f2, u00
- srlx i48, 16, %l5 C (i48 >> 16)
- mov i32, %g4 C i32+ now in g4
- ldx [%sp+2223+16], i32
- fxtod %f4, u32
- sllx i48, 32, %l6 C (i48 << 32)
- ldx [%sp+2223+24], i48
- fdtox a48, a48
- srlx %g4, 32, %o3 C (i32 >> 32)
- add %l5, %l4, %o1 C hi64- in %o1
- std a00, [%sp+2223+0]
- fmuld u00, v00, p00
- sllx %g4, 16, %o2 C (i32 << 16)
- add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
- std a16, [%sp+2223+8]
- fmuld u00, v16, p16
- sllx %o1, 48, %o3 C (hi64 << 48)
- add %g2, %o2, %o2 C mi64- in %o2
- std a32, [%sp+2223+16]
- fmuld u00, v32, p32
- add %l6, %o2, %o2 C mi64- in %o2
- std a48, [%sp+2223+24]
- faddd p00, r64, a00
- fmuld u32, v00, r32
- sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
- faddd p16, r80, a16
- fmuld u00, v48, p48
- add cy, %g5, %o4 C x = prev(i00) + cy
- addcc %i2, 8, %i2
- bnz,pt %icc, .Loop
- fmuld u32, v16, r48
- .L_four:
- b,a .L_out_4
- C BEGIN MAIN LOOP
- .align 16
- .Loop:
- C 00
- srlx %o4, 16, %o5 C (x >> 16)
- ld [%i5+%i2], %f3 C read low 32 bits of up[i]
- fmuld u32, v32, r64 C FIXME not urgent
- faddd p32, r32, a32
- C 01
- add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
- and %o4, xffff, %o5 C (x & 0xffff)
- ld [%i1+%i2], %f5 C read high 32 bits of up[i]
- fdtox a00, a00
- C 02
- faddd p48, r48, a48
- C 03
- srlx %o2, 48, %o7 C (mi64 >> 48)
- mov i00, %g5 C i00+ now in g5
- fmuld u32, v48, r80 C FIXME not urgent
- fdtox a16, a16
- C 04
- sllx %o2, 16, %i3 C (mi64 << 16)
- add %o7, %o1, cy C new cy
- ldx [%sp+2223+0], i00
- fdtox a32, a32
- C 05
- srlx i16, 48, %l4 C (i16 >> 48)
- mov i16, %g2
- ldx [%sp+2223+8], i16
- fxtod %f2, u00
- C 06
- srlx i48, 16, %l5 C (i48 >> 16)
- mov i32, %g4 C i32+ now in g4
- ldx [%sp+2223+16], i32
- fxtod %f4, u32
- C 07
- sllx i48, 32, %l6 C (i48 << 32)
- or %i3, %o5, %o5
- ldx [%sp+2223+24], i48
- fdtox a48, a48
- C 08
- srlx %g4, 32, %o3 C (i32 >> 32)
- add %l5, %l4, %o1 C hi64- in %o1
- std a00, [%sp+2223+0]
- fmuld u00, v00, p00
- C 09
- sllx %g4, 16, %o2 C (i32 << 16)
- add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
- std a16, [%sp+2223+8]
- fmuld u00, v16, p16
- C 10
- sllx %o1, 48, %o3 C (hi64 << 48)
- add %g2, %o2, %o2 C mi64- in %o2
- std a32, [%sp+2223+16]
- fmuld u00, v32, p32
- C 11
- add %l6, %o2, %o2 C mi64- in %o2
- std a48, [%sp+2223+24]
- faddd p00, r64, a00
- fmuld u32, v00, r32
- C 12
- sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
- stx %o5, [%i4+%i2]
- faddd p16, r80, a16
- fmuld u00, v48, p48
- C 13
- add cy, %g5, %o4 C x = prev(i00) + cy
- addcc %i2, 8, %i2
- bnz,pt %icc, .Loop
- fmuld u32, v16, r48
- C END MAIN LOOP
- .L_out_4:
- srlx %o4, 16, %o5 C (x >> 16)
- fmuld u32, v32, r64 C FIXME not urgent
- faddd p32, r32, a32
- add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
- and %o4, xffff, %o5 C (x & 0xffff)
- fdtox a00, a00
- faddd p48, r48, a48
- srlx %o2, 48, %o7 C (mi64 >> 48)
- mov i00, %g5 C i00+ now in g5
- fmuld u32, v48, r80 C FIXME not urgent
- fdtox a16, a16
- sllx %o2, 16, %i3 C (mi64 << 16)
- add %o7, %o1, cy C new cy
- ldx [%sp+2223+0], i00
- fdtox a32, a32
- srlx i16, 48, %l4 C (i16 >> 48)
- mov i16, %g2
- ldx [%sp+2223+8], i16
- srlx i48, 16, %l5 C (i48 >> 16)
- mov i32, %g4 C i32+ now in g4
- ldx [%sp+2223+16], i32
- sllx i48, 32, %l6 C (i48 << 32)
- or %i3, %o5, %o5
- ldx [%sp+2223+24], i48
- fdtox a48, a48
- srlx %g4, 32, %o3 C (i32 >> 32)
- add %l5, %l4, %o1 C hi64- in %o1
- std a00, [%sp+2223+0]
- sllx %g4, 16, %o2 C (i32 << 16)
- add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
- std a16, [%sp+2223+8]
- sllx %o1, 48, %o3 C (hi64 << 48)
- add %g2, %o2, %o2 C mi64- in %o2
- std a32, [%sp+2223+16]
- add %l6, %o2, %o2 C mi64- in %o2
- std a48, [%sp+2223+24]
- sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
- stx %o5, [%i4+%i2]
- add cy, %g5, %o4 C x = prev(i00) + cy
- add %i2, 8, %i2
- .L_out_3:
- srlx %o4, 16, %o5 C (x >> 16)
- add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
- and %o4, xffff, %o5 C (x & 0xffff)
- fdtox r64, a00
- srlx %o2, 48, %o7 C (mi64 >> 48)
- mov i00, %g5 C i00+ now in g5
- fdtox r80, a16
- sllx %o2, 16, %i3 C (mi64 << 16)
- add %o7, %o1, cy C new cy
- ldx [%sp+2223+0], i00
- srlx i16, 48, %l4 C (i16 >> 48)
- mov i16, %g2
- ldx [%sp+2223+8], i16
- srlx i48, 16, %l5 C (i48 >> 16)
- mov i32, %g4 C i32+ now in g4
- ldx [%sp+2223+16], i32
- sllx i48, 32, %l6 C (i48 << 32)
- or %i3, %o5, %o5
- ldx [%sp+2223+24], i48
- srlx %g4, 32, %o3 C (i32 >> 32)
- add %l5, %l4, %o1 C hi64- in %o1
- std a00, [%sp+2223+0]
- sllx %g4, 16, %o2 C (i32 << 16)
- add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
- std a16, [%sp+2223+8]
- sllx %o1, 48, %o3 C (hi64 << 48)
- add %g2, %o2, %o2 C mi64- in %o2
- add %l6, %o2, %o2 C mi64- in %o2
- sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
- stx %o5, [%i4+%i2]
- add cy, %g5, %o4 C x = prev(i00) + cy
- add %i2, 8, %i2
- .L_out_2:
- srlx %o4, 16, %o5 C (x >> 16)
- add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
- and %o4, xffff, %o5 C (x & 0xffff)
- srlx %o2, 48, %o7 C (mi64 >> 48)
- mov i00, %g5 C i00+ now in g5
- sllx %o2, 16, %i3 C (mi64 << 16)
- add %o7, %o1, cy C new cy
- ldx [%sp+2223+0], i00
- srlx i16, 48, %l4 C (i16 >> 48)
- mov i16, %g2
- ldx [%sp+2223+8], i16
- srlx i48, 16, %l5 C (i48 >> 16)
- mov i32, %g4 C i32+ now in g4
- sllx i48, 32, %l6 C (i48 << 32)
- or %i3, %o5, %o5
- srlx %g4, 32, %o3 C (i32 >> 32)
- add %l5, %l4, %o1 C hi64- in %o1
- sllx %g4, 16, %o2 C (i32 << 16)
- add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
- sllx %o1, 48, %o3 C (hi64 << 48)
- add %g2, %o2, %o2 C mi64- in %o2
- add %l6, %o2, %o2 C mi64- in %o2
- sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
- stx %o5, [%i4+%i2]
- add cy, %g5, %o4 C x = prev(i00) + cy
- add %i2, 8, %i2
- .L_out_1:
- srlx %o4, 16, %o5 C (x >> 16)
- add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
- and %o4, xffff, %o5 C (x & 0xffff)
- srlx %o2, 48, %o7 C (mi64 >> 48)
- sllx %o2, 16, %i3 C (mi64 << 16)
- add %o7, %o1, cy C new cy
- or %i3, %o5, %o5
- stx %o5, [%i4+%i2]
- sllx i00, 0, %g2
- add %g2, cy, cy
- sllx i16, 16, %g3
- add %g3, cy, cy
- return %i7+8
- mov cy, %o0
- EPILOGUE(mpn_mul_1)