sqr_basecase.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:9k
- dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
- dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
- dnl
- dnl This file is part of the GNU MP Library.
- dnl
- dnl The GNU MP Library is free software; you can redistribute it and/or
- dnl modify it under the terms of the GNU Lesser General Public License as
- dnl published by the Free Software Foundation; either version 3 of the
- dnl License, or (at your option) any later version.
- dnl
- dnl The GNU MP Library is distributed in the hope that it will be useful,
- dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
- dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- dnl Lesser General Public License for more details.
- dnl
- dnl You should have received a copy of the GNU Lesser General Public License
- dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
- include(`../config.m4')
- C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
- C product at around 20x20 limbs.
- C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
- C
- C Calculate src,size squared, storing the result in dst,2*size.
- C
- C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
- C lot of function call overheads are avoided, especially when the size is
- C small.
- C
- C Parameters, all read from the stack through the PARAM_* names below:
- C   dst   where the 2*size limb result is written
- C   src   the size limbs to be squared
- C   size  number of limbs at src
- C
- C Limbs are handled as 32-bit words (movl/mull, 4 byte strides).  The
- C prototype is void.  eax, ecx and edx are scratch throughout; every path
- C pops each of ebx, esi, edi, ebp that it pushed before reaching its ret.
- C
- C Dispatch on size: 2 goes to L(two_limbs), above 2 goes to
- C L(three_or_more), which in turn sends 4 and up to L(four_or_more), and
- C anything else falls into the one limb code.  A size of 0 would therefore
- C be handled like 1; presumably callers always pass size >= 1 -- TODO
- C confirm.
- C
- C The four or more path works in four phases:
- C   1. L(mul1)             dst[1..size] = src[0] * src[1..size-1]
- C   2. L(outer), L(corner) add the remaining cross products src[i]*src[j],
- C                          i<j, at dst[i+j]
- C   3. L(lshift)           double dst[1..2*size-2], bit shifted out goes to
- C                          dst[2*size-1]
- C   4. L(diag)             add the squares src[i]^2 at dst[2*i]
- C
- C NOTE(review): defframe, deflit, FRAME_pushl, PROLOGUE, EPILOGUE, L, TEXT
- C and ALIGN are not defined in this file; presumably they come from the
- C included config.m4, and presumably FRAME tracks how far esp has moved so
- C that PARAM_* keeps addressing the arguments -- confirm there.
- defframe(PARAM_SIZE,12)
- defframe(PARAM_SRC, 8)
- defframe(PARAM_DST, 4)
- TEXT
- ALIGN(8)
- PROLOGUE(mpn_sqr_basecase)
- deflit(`FRAME',0)
- C Fetch the arguments and compare size with 2.  The movl instructions
- C sitting between the cmpl and the je/ja leave the flags alone, so both
- C branches test that single comparison.
- movl PARAM_SIZE, %edx
- movl PARAM_SRC, %eax
- cmpl $2, %edx
- movl PARAM_DST, %ecx
- je L(two_limbs)
- C eax changes from the src pointer to the limb src[0], which is what both
- C the one limb code and L(three_or_more) expect.
- movl (%eax), %eax
- ja L(three_or_more)
- C -----------------------------------------------------------------------------
- C one limb only
- C eax src[0]
- C ebx
- C ecx dst
- C edx
- C edx:eax = src[0]^2, stored as dst[0] and dst[1].
- mull %eax
- movl %eax, (%ecx)
- movl %edx, 4(%ecx)
- ret
- C -----------------------------------------------------------------------------
- ALIGN(8)
- L(two_limbs):
- C eax src
- C ebx
- C ecx dst
- C edx size
- C The four registers pushed here are all popped again before the ret.  src
- C is kept in ebx because each mull overwrites eax and edx.
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %ebx
- movl %eax, %ebx
- movl (%eax), %eax
- mull %eax C src[0]^2
- movl %eax, (%ecx) C dst[0]
- movl %edx, %esi C dst[1]
- movl 4(%ebx), %eax
- mull %eax C src[1]^2
- movl %eax, %edi C dst[2]
- movl %edx, %ebp C dst[3]
- movl (%ebx), %eax
- mull 4(%ebx) C src[0]*src[1]
- C edx:eax is the cross product, which must be counted twice.  The first
- C add/adc chain adds it into esi, edi, ebp (dst[1..3]).  The second chain
- C adds those sums back into eax and edx, which then hold the final dst[1]
- C and dst[2], with the carry going into ebp.  The popl instructions mixed
- C in do not change the carry flag.
- addl %eax, %esi
- popl %ebx
- adcl %edx, %edi
- adcl $0, %ebp
- addl %esi, %eax
- adcl %edi, %edx
- movl %eax, 4(%ecx)
- adcl $0, %ebp
- popl %esi
- movl %edx, 8(%ecx)
- movl %ebp, 12(%ecx)
- popl %edi
- popl %ebp
- ret
- C -----------------------------------------------------------------------------
- ALIGN(8)
- L(three_or_more):
- C eax src low limb
- C ebx
- C ecx dst
- C edx size
- C ebx is saved on both the three limb and the four or more paths, then
- C loaded with the src pointer.  pushl and movl leave the flags from the
- C cmpl intact for the jae.
- cmpl $4, %edx
- pushl %ebx
- deflit(`FRAME',4)
- movl PARAM_SRC, %ebx
- jae L(four_or_more)
- C -----------------------------------------------------------------------------
- C three limbs
- C eax src low limb
- C ebx src
- C ecx dst
- C edx size
- C First the three squares are written straight to dst[0..5].
- pushl %ebp
- pushl %edi
- mull %eax C src[0] ^ 2
- movl %eax, (%ecx)
- movl %edx, 4(%ecx)
- movl 4(%ebx), %eax
- xorl %ebp, %ebp
- mull %eax C src[1] ^ 2
- movl %eax, 8(%ecx)
- movl %edx, 12(%ecx)
- movl 8(%ebx), %eax
- pushl %esi C risk of cache bank clash
- mull %eax C src[2] ^ 2
- movl %eax, 16(%ecx)
- movl %edx, 20(%ecx)
- C Then the three cross products are summed in registers: esi, edi, ebp and
- C edx end up as the limbs destined for dst[1], dst[2], dst[3] and dst[4].
- movl (%ebx), %eax
- mull 4(%ebx) C src[0] * src[1]
- movl %eax, %esi
- movl %edx, %edi
- movl (%ebx), %eax
- mull 8(%ebx) C src[0] * src[2]
- addl %eax, %edi
- movl %edx, %ebp
- adcl $0, %ebp
- movl 4(%ebx), %eax
- mull 8(%ebx) C src[1] * src[2]
- C src is no longer needed, so ebx is cleared to collect the bit shifted
- C out of the doubling below.
- xorl %ebx, %ebx
- addl %eax, %ebp
- C eax
- C ebx zero, will be dst[5]
- C ecx dst
- C edx dst[4]
- C esi dst[1]
- C edi dst[2]
- C ebp dst[3]
- C Finish the cross product sum (carry into edx), double it with an add/adc
- C chain of each register to itself, and catch the carry out in ebx.
- adcl $0, %edx
- addl %esi, %esi
- adcl %edi, %edi
- adcl %ebp, %ebp
- adcl %edx, %edx
- movl 4(%ecx), %eax
- adcl $0, %ebx
- C Add the doubled cross products to the squares already at dst[1..5].
- C This is one carry chain; the movl and popl instructions in between do
- C not change the carry flag.
- addl %esi, %eax
- movl %eax, 4(%ecx)
- movl 8(%ecx), %eax
- adcl %edi, %eax
- movl 12(%ecx), %esi
- adcl %ebp, %esi
- movl 16(%ecx), %edi
- movl %eax, 8(%ecx)
- movl %esi, 12(%ecx)
- adcl %edx, %edi
- popl %esi
- movl 20(%ecx), %eax
- movl %edi, 16(%ecx)
- popl %edi
- popl %ebp
- adcl %ebx, %eax C no carry out of this
- popl %ebx
- movl %eax, 20(%ecx)
- ret
- C -----------------------------------------------------------------------------
- ALIGN(8)
- L(four_or_more):
- C eax src low limb
- C ebx src
- C ecx dst
- C edx size
- C esi
- C edi
- C ebp
- C
- C First multiply src[0]*src[1..size-1] and store at dst[1..size].
- C
- C ebx is already on the stack (pushed at L(three_or_more)); edi, esi and
- C ebp are saved here.  esi and edi are set to the ends of src and of this
- C first product so that the negative counter in ecx can index both and
- C also end the loop when it reaches zero.
- deflit(`FRAME',4)
- pushl %edi
- FRAME_pushl()
- pushl %esi
- FRAME_pushl()
- pushl %ebp
- FRAME_pushl()
- leal (%ecx,%edx,4), %edi C dst end of this mul1
- leal (%ebx,%edx,4), %esi C src end
- movl %ebx, %ebp C src
- negl %edx C -size
- xorl %ebx, %ebx C clear carry limb and carry flag
- leal 1(%edx), %ecx C -(size-1)
- L(mul1):
- C eax scratch
- C ebx carry
- C ecx counter, negative
- C edx scratch
- C esi &src[size]
- C edi &dst[size]
- C ebp src
- C The carry flag from the addl is still live when the loop comes back round
- C (movl and incl do not change it) and is folded into the new carry limb
- C by the adcl at the top.
- adcl $0, %ebx
- movl (%esi,%ecx,4), %eax
- mull (%ebp)
- addl %eax, %ebx
- movl %ebx, (%edi,%ecx,4)
- incl %ecx
- movl %edx, %ebx
- jnz L(mul1)
- C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
- C n=1..size-2.
- C
- C The last two products, which are the end corner of the product
- C triangle, are handled separately to save looping overhead. These
- C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
- C If size is 4 then it's only these that need to be done.
- C
- C In the outer loop %esi is a constant, and %edi just advances by 1
- C limb each time. The size of the operation decreases by 1 limb
- C each time.
- C eax
- C ebx carry (needing carry flag added)
- C ecx
- C edx
- C esi &src[size]
- C edi &dst[size]
- C ebp
- C The final mul1 carry limb goes to dst[size].  edx becomes -(size-4), the
- C number of outer iterations negated; zero means size is 4 and only the
- C corner remains.
- adcl $0, %ebx
- movl PARAM_SIZE, %edx
- movl %ebx, (%edi)
- subl $4, %edx
- negl %edx
- jz L(corner)
- L(outer):
- C ebx previous carry limb to store
- C edx outer loop counter (negative)
- C esi &src[size]
- C edi dst, pointing at stored carry limb of previous loop
- C The counter is parked on the stack because mull overwrites edx.  ecx
- C starts two below it, ebp steps to the next multiplier limb, and edi
- C steps to the end of this row.
- pushl %edx C new outer loop counter
- leal -2(%edx), %ecx
- movl %ebx, (%edi)
- addl $4, %edi
- addl $4, %ebp
- xorl %ebx, %ebx C initial carry limb, clear carry flag
- L(inner):
- C eax scratch
- C ebx carry (needing carry flag added)
- C ecx counter, negative
- C edx scratch
- C esi &src[size]
- C edi dst end of this addmul
- C ebp &src[j]
- C Multiply-accumulate step: the low product plus the incoming carry limb
- C is added to the dst limb, and the high product plus carries becomes the
- C next carry limb.
- adcl $0, %ebx
- movl (%esi,%ecx,4), %eax
- mull (%ebp)
- addl %ebx, %eax
- movl (%edi,%ecx,4), %ebx
- adcl $0, %edx
- addl %eax, %ebx
- movl %ebx, (%edi,%ecx,4)
- incl %ecx
- movl %edx, %ebx
- jnz L(inner)
- adcl $0, %ebx
- popl %edx C outer loop counter
- incl %edx
- jnz L(outer)
- C Carry limb of the last row; earlier rows have theirs stored at the top
- C of the following outer iteration.
- movl %ebx, (%edi)
- L(corner):
- C esi &src[size]
- C edi &dst[2*size-4]
- C The three remaining cross products are added at -4(%edi), (%edi) and
- C 4(%edi), with the top limb stored to 8(%edi).
- movl -8(%esi), %eax
- movl -4(%edi), %ebx C risk of data cache bank clash here
- mull -12(%esi) C src[size-2]*src[size-3]
- addl %eax, %ebx
- movl %edx, %ecx
- adcl $0, %ecx
- movl -4(%esi), %eax
- mull -12(%esi) C src[size-1]*src[size-3]
- addl %ecx, %eax
- movl (%edi), %ecx
- adcl $0, %edx
- movl %ebx, -4(%edi)
- addl %eax, %ecx
- movl %edx, %ebx
- adcl $0, %ebx
- movl -4(%esi), %eax
- mull -8(%esi) C src[size-1]*src[size-2]
- movl %ecx, (%edi)
- addl %eax, %ebx
- adcl $0, %edx
- movl PARAM_SIZE, %eax
- negl %eax
- movl %ebx, 4(%edi)
- addl $1, %eax C -(size-1) and clear carry
- movl %edx, 8(%edi)
- C -----------------------------------------------------------------------------
- C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
- C
- C Two limbs are rotated left through the carry flag per iteration.  The
- C movl and incl instructions leave the carry flag alone, so the bit shifted
- C out of one limb enters the next, including across iterations.
- L(lshift):
- C eax counter, negative
- C ebx next limb
- C ecx
- C edx
- C esi
- C edi &dst[2*size-4]
- C ebp
- movl 12(%edi,%eax,8), %ebx
- rcll %ebx
- movl 16(%edi,%eax,8), %ecx
- rcll %ecx
- movl %ebx, 12(%edi,%eax,8)
- movl %ecx, 16(%edi,%eax,8)
- incl %eax
- jnz L(lshift)
- C eax is zero here, so the adcl leaves just the final carry bit in it.
- adcl %eax, %eax C high bit out
- movl PARAM_SRC, %esi
- movl PARAM_SIZE, %ecx C risk of cache bank clash
- movl %eax, 12(%edi) C dst most significant limb
- C -----------------------------------------------------------------------------
- C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
- C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the
- C low limb of src[0]^2.
- movl (%esi), %eax C src[0]
- leal (%esi,%ecx,4), %esi C src end
- negl %ecx
- mull %eax
- movl %eax, 16(%edi,%ecx,8) C dst[0]
- movl %edx, %ebx
- addl $1, %ecx C -(size-1) and clear carry
- L(diag):
- C eax scratch (low product)
- C ebx carry limb
- C ecx counter, negative
- C edx scratch (high product)
- C esi &src[size]
- C edi &dst[2*size-4]
- C ebp scratch (fetched dst limbs)
- C Each pass handles two dst limbs: the lower one receives the carry limb
- C (the high half of the previous square plus the carry flag), and the upper
- C one receives the low half of this square.
- movl (%esi,%ecx,4), %eax
- adcl $0, %ebx
- mull %eax
- movl 16-4(%edi,%ecx,8), %ebp
- addl %ebp, %ebx
- movl 16(%edi,%ecx,8), %ebp
- adcl %eax, %ebp
- movl %ebx, 16-4(%edi,%ecx,8)
- movl %ebp, 16(%edi,%ecx,8)
- incl %ecx
- movl %edx, %ebx
- jnz L(diag)
- C The high half of the last square plus the outstanding carry is added
- C into the most significant limb, then the saved registers are restored in
- C reverse order of their pushes.
- adcl $0, %edx
- movl 16-4(%edi), %eax C dst most significant limb
- addl %eax, %edx
- popl %ebp
- movl %edx, 16-4(%edi)
- popl %esi C risk of cache bank clash
- popl %edi
- popl %ebx
- ret
- EPILOGUE()