mul_basecase.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:8k
- dnl AMD64 mpn_mul_basecase.
- dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey.
- dnl Copyright 2008 Free Software Foundation, Inc.
- dnl This file is part of the GNU MP Library.
- dnl The GNU MP Library is free software; you can redistribute it and/or modify
- dnl it under the terms of the GNU Lesser General Public License as published
- dnl by the Free Software Foundation; either version 3 of the License, or (at
- dnl your option) any later version.
- dnl The GNU MP Library is distributed in the hope that it will be useful, but
- dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- dnl License for more details.
- dnl You should have received a copy of the GNU Lesser General Public License
- dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
- include(`../config.m4')
- C cycles/limb
- C K8,K9: 2.375
- C K10: 2.375
- C P4: ?
- C P6-15: 4.45
- C The inner loops of this code are the result of running a code generation and
- C optimization tool suite written by David Harvey and Torbjorn Granlund.
- C TODO
- C * Use fewer registers. (how??? I can't see it -- david)
- C * Avoid some "mov $0,r" and instead use "xor r,r".
- C * Can the top of each L(addmul_outer_n) prologue be folded into the
- C mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
- C case where vn = 1 or 2; is it worth it?
- C INPUT PARAMETERS
- define(`rp', `%rdi')
- define(`up', `%rsi')
- define(`un_param',`%rdx')
- define(`vp', `%rcx')
- define(`vn', `%r8')
- define(`v0', `%r12')
- define(`v1', `%r9')
- define(`w0', `%rbx')
- define(`w1', `%r15')
- define(`w2', `%rbp')
- define(`w3', `%r10')
- define(`n', `%r11')
- define(`outer_addr', `%r14')
- define(`un', `%r13')
- ASM_START()
- TEXT
- ALIGN(16)
- PROLOGUE(mpn_mul_basecase)
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- xor R32(un), R32(un)
- mov (up), %rax
- mov (vp), v0
- sub un_param, un C rdx used by mul
- mov un, n
- mov R32(un_param), R32(w0)
- lea (rp,un_param,8), rp
- lea (up,un_param,8), up
- mul v0
- test $1, R8(vn)
- jz L(mul_2)
- C ===========================================================
- C mul_1 for vp[0] if vn is odd
- L(mul_1):
- and $3, R32(w0)
- jz L(mul_1_prologue_0)
- cmp $2, R32(w0)
- jc L(mul_1_prologue_1)
- jz L(mul_1_prologue_2)
- jmp L(mul_1_prologue_3)
- L(mul_1_prologue_0):
- mov %rax, w2
- mov %rdx, w3 C note: already w0 == 0
- lea L(addmul_outer_0)(%rip), outer_addr
- jmp L(mul_1_entry_0)
- L(mul_1_prologue_1):
- cmp $-1, un
- jne 2f
- mov %rax, -8(rp)
- mov %rdx, (rp)
- jmp L(ret)
- 2: add $1, n
- lea L(addmul_outer_1)(%rip), outer_addr
- mov %rax, w1
- mov %rdx, w2
- xor R32(w3), R32(w3)
- mov (up,n,8), %rax
- jmp L(mul_1_entry_1)
- L(mul_1_prologue_2):
- add $-2, n
- lea L(addmul_outer_2)(%rip), outer_addr
- mov %rax, w0
- mov %rdx, w1
- mov 24(up,n,8), %rax
- xor R32(w2), R32(w2)
- xor R32(w3), R32(w3)
- jmp L(mul_1_entry_2)
- L(mul_1_prologue_3):
- add $-1, n
- lea L(addmul_outer_3)(%rip), outer_addr
- mov %rax, w3
- mov %rdx, w0
- jmp L(mul_1_entry_3)
- C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments
- ALIGN(16)
- L(mul_1_top):
- mov w0, -16(rp,n,8)
- add %rax, w1
- mov (up,n,8), %rax
- adc %rdx, w2
- L(mul_1_entry_1):
- xor R32(w0), R32(w0)
- mul v0
- mov w1, -8(rp,n,8)
- add %rax, w2
- adc %rdx, w3
- L(mul_1_entry_0):
- mov 8(up,n,8), %rax
- mul v0
- mov w2, (rp,n,8)
- add %rax, w3
- adc %rdx, w0
- L(mul_1_entry_3):
- mov 16(up,n,8), %rax
- mul v0
- mov w3, 8(rp,n,8)
- xor R32(w2), R32(w2) C zero
- mov w2, w3 C zero
- add %rax, w0
- mov 24(up,n,8), %rax
- mov w2, w1 C zero
- adc %rdx, w1
- L(mul_1_entry_2):
- mul v0
- add $4, n
- js L(mul_1_top)
- mov w0, -16(rp)
- add %rax, w1
- mov w1, -8(rp)
- adc %rdx, w2
- mov w2, (rp)
- add $-1, vn C vn -= 1
- jz L(ret)
- mov 8(vp), v0
- mov 16(vp), v1
- lea 8(vp), vp C vp += 1
- lea 8(rp), rp C rp += 1
- jmp *outer_addr
- C ===========================================================
- C mul_2 for vp[0], vp[1] if vn is even
- ALIGN(16)
- L(mul_2):
- mov 8(vp), v1
- and $3, R32(w0)
- jz L(mul_2_prologue_0)
- cmp $2, R32(w0)
- jz L(mul_2_prologue_2)
- jc L(mul_2_prologue_1)
- L(mul_2_prologue_3):
- lea L(addmul_outer_3)(%rip), outer_addr
- add $2, n
- mov %rax, -16(rp,n,8)
- mov %rdx, w2
- xor R32(w3), R32(w3)
- xor R32(w0), R32(w0)
- mov -16(up,n,8), %rax
- jmp L(mul_2_entry_3)
- ALIGN(16)
- L(mul_2_prologue_0):
- add $3, n
- mov %rax, w0
- mov %rdx, w1
- xor R32(w2), R32(w2)
- mov -24(up,n,8), %rax
- lea L(addmul_outer_0)(%rip), outer_addr
- jmp L(mul_2_entry_0)
- ALIGN(16)
- L(mul_2_prologue_1):
- mov %rax, w3
- mov %rdx, w0
- xor R32(w1), R32(w1)
- lea L(addmul_outer_1)(%rip), outer_addr
- jmp L(mul_2_entry_1)
- ALIGN(16)
- L(mul_2_prologue_2):
- add $1, n
- lea L(addmul_outer_2)(%rip), outer_addr
- mov $0, R32(w0)
- mov $0, R32(w1)
- mov %rax, w2
- mov -8(up,n,8), %rax
- mov %rdx, w3
- jmp L(mul_2_entry_2)
- C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments
- ALIGN(16)
- L(mul_2_top):
- mov -32(up,n,8), %rax
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov -24(up,n,8), %rax
- xor R32(w2), R32(w2)
- mul v0
- add %rax, w0
- mov -24(up,n,8), %rax
- adc %rdx, w1
- adc $0, R32(w2)
- L(mul_2_entry_0):
- mul v1
- add %rax, w1
- mov w0, -24(rp,n,8)
- adc %rdx, w2
- mov -16(up,n,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- mov -16(up,n,8), %rax
- adc $0, R32(w3)
- mov $0, R32(w0)
- mov w1, -16(rp,n,8)
- L(mul_2_entry_3):
- mul v1
- add %rax, w2
- mov -8(up,n,8), %rax
- adc %rdx, w3
- mov $0, R32(w1)
- mul v0
- add %rax, w2
- mov -8(up,n,8), %rax
- adc %rdx, w3
- adc R32(w1), R32(w0) C adc $0, w0
- L(mul_2_entry_2):
- mul v1
- add %rax, w3
- mov w2, -8(rp,n,8)
- adc %rdx, w0
- mov (up,n,8), %rax
- mul v0
- add %rax, w3
- adc %rdx, w0
- adc $0, R32(w1)
- L(mul_2_entry_1):
- add $4, n
- mov w3, -32(rp,n,8)
- js L(mul_2_top)
- mov -32(up,n,8), %rax
- mul v1
- add %rax, w0
- mov w0, (rp)
- adc %rdx, w1
- mov w1, 8(rp)
- add $-2, vn C vn -= 2
- jz L(ret)
- mov 16(vp), v0
- mov 24(vp), v1
- lea 16(vp), vp C vp += 2
- lea 16(rp), rp C rp += 2
- jmp *outer_addr
- C ===========================================================
- C addmul_2 for remaining vp's
- C in the following prologues, we reuse un to store the
- C adjusted value of n that is reloaded on each iteration
- L(addmul_outer_0):
- add $3, un
- lea 0(%rip), outer_addr
- mov un, n
- mov -24(up,un,8), %rax
- mul v0
- mov %rax, w0
- mov -24(up,un,8), %rax
- mov %rdx, w1
- xor R32(w2), R32(w2)
- jmp L(addmul_entry_0)
- L(addmul_outer_1):
- mov un, n
- mov (up,un,8), %rax
- mul v0
- mov %rax, w3
- mov (up,un,8), %rax
- mov %rdx, w0
- xor R32(w1), R32(w1)
- jmp L(addmul_entry_1)
- L(addmul_outer_2):
- add $1, un
- lea 0(%rip), outer_addr
- mov un, n
- mov -8(up,un,8), %rax
- mul v0
- xor R32(w0), R32(w0)
- mov %rax, w2
- xor R32(w1), R32(w1)
- mov %rdx, w3
- mov -8(up,un,8), %rax
- jmp L(addmul_entry_2)
- L(addmul_outer_3):
- add $2, un
- lea 0(%rip), outer_addr
- mov un, n
- mov -16(up,un,8), %rax
- xor R32(w3), R32(w3)
- mul v0
- mov %rax, w1
- mov -16(up,un,8), %rax
- mov %rdx, w2
- jmp L(addmul_entry_3)
- C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments
- ALIGN(16)
- L(addmul_top):
- add w3, -32(rp,n,8)
- adc %rax, w0
- mov -24(up,n,8), %rax
- adc %rdx, w1
- xor R32(w2), R32(w2)
- mul v0
- add %rax, w0
- mov -24(up,n,8), %rax
- adc %rdx, w1
- adc R32(w2), R32(w2) C adc $0, w2
- L(addmul_entry_0):
- mul v1
- xor R32(w3), R32(w3)
- add w0, -24(rp,n,8)
- adc %rax, w1
- mov -16(up,n,8), %rax
- adc %rdx, w2
- mul v0
- add %rax, w1
- mov -16(up,n,8), %rax
- adc %rdx, w2
- adc $0, R32(w3)
- L(addmul_entry_3):
- mul v1
- add w1, -16(rp,n,8)
- adc %rax, w2
- mov -8(up,n,8), %rax
- adc %rdx, w3
- mul v0
- xor R32(w0), R32(w0)
- add %rax, w2
- adc %rdx, w3
- mov $0, R32(w1)
- mov -8(up,n,8), %rax
- adc R32(w1), R32(w0) C adc $0, w0
- L(addmul_entry_2):
- mul v1
- add w2, -8(rp,n,8)
- adc %rax, w3
- adc %rdx, w0
- mov (up,n,8), %rax
- mul v0
- add %rax, w3
- mov (up,n,8), %rax
- adc %rdx, w0
- adc $0, R32(w1)
- L(addmul_entry_1):
- mul v1
- add $4, n
- js L(addmul_top)
- add w3, -8(rp)
- adc %rax, w0
- mov w0, (rp)
- adc %rdx, w1
- mov w1, 8(rp)
- add $-2, vn C vn -= 2
- jz L(ret)
- lea 16(rp), rp C rp += 2
- lea 16(vp), vp C vp += 2
- mov (vp), v0
- mov 8(vp), v1
- jmp *outer_addr
- ALIGN(16)
- L(ret): pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- ret
- EPILOGUE()