copyd.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:4k
- dnl Intel P6 mpn_copyd -- copy limb vector backwards.
- dnl Copyright 2001, 2002 Free Software Foundation, Inc.
- dnl
- dnl This file is part of the GNU MP Library.
- dnl
- dnl The GNU MP Library is free software; you can redistribute it and/or
- dnl modify it under the terms of the GNU Lesser General Public License as
- dnl published by the Free Software Foundation; either version 3 of the
- dnl License, or (at your option) any later version.
- dnl
- dnl The GNU MP Library is distributed in the hope that it will be useful,
- dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
- dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- dnl Lesser General Public License for more details.
- dnl
- dnl You should have received a copy of the GNU Lesser General Public License
- dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
- include(`../config.m4')
- C P6: 1.75 cycles/limb, or 0.75 if no overlap
- C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
- C
- C An explicit loop is used because a decrementing rep movsl is a bit slow at
- C 2.4 c/l. That rep movsl also has about a 40 cycle startup time, and the
- C code here stands a chance of being faster if the branches predict well.
- C
- C The slightly strange loop form seems necessary for the claimed speed.
- C Maybe load/store ordering affects it.
- C
- C The source and destination are checked to see if they're actually
- C overlapping, since it might be possible to use an incrementing rep movsl
- C at 0.75 c/l. (It doesn't suffer the bad startup time of the decrementing
- C version.)
- C
- C Enhancements:
- C
- C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
- C one store each cycle. Unrolling the loop below would approach 1.0, but
- C it'd be good to know why something like store/load/subl + store/load/jnz
- C doesn't already run at 1.0 c/l. It looks like it should decode in 2
- C cycles, but doesn't run that way.
- defframe(PARAM_SIZE,12)
- defframe(PARAM_SRC, 8)
- defframe(PARAM_DST, 4)
- dnl re-using parameter space
- define(SAVE_ESI,`PARAM_SIZE')
- define(SAVE_EDI,`PARAM_SRC')
- TEXT
- ALIGN(16)
- PROLOGUE(mpn_copyd)
- deflit(`FRAME',0)
- movl PARAM_SIZE, %ecx
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
- subl $1, %ecx
- jb L(zero)
- movl (%esi,%ecx,4), %eax C src[size-1]
- jz L(one)
- movl -4(%esi,%ecx,4), %edx C src[size-2]
- subl $2, %ecx
- jbe L(done_loop) C 2 or 3 limbs only
- C The usual overlap is
- C
- C high low
- C +------------------+
- C | dst|
- C +------------------+
- C +------------------+
- C | src|
- C +------------------+
- C
- C We can use an incrementing copy in the following circumstances.
- C
- C src+4*size<=dst, since then the regions are disjoint
- C
- C src==dst, clearly (though this shouldn't occur normally)
- C
- C src>dst, since in that case it's a requirement of the
- C parameters that src>=dst+size*4, and hence the
- C regions are disjoint
- C
- leal (%edi,%ecx,4), %edx
- cmpl %edi, %esi
- jae L(use_movsl) C src >= dst
- cmpl %edi, %edx
- movl 4(%esi,%ecx,4), %edx C src[size-2] again
- jbe L(use_movsl) C src+4*size <= dst
- L(top):
- C eax prev high limb
- C ebx
- C ecx counter, size-3 down to 0 or -1, inclusive, by 2s
- C edx prev low limb
- C esi src
- C edi dst
- C ebp
- movl %eax, 8(%edi,%ecx,4)
- movl (%esi,%ecx,4), %eax
- movl %edx, 4(%edi,%ecx,4)
- movl -4(%esi,%ecx,4), %edx
- subl $2, %ecx
- jnbe L(top)
- L(done_loop):
- movl %eax, 8(%edi,%ecx,4)
- movl %edx, 4(%edi,%ecx,4)
- C copy low limb (needed if size was odd, but will already have been
- C done in the loop if size was even)
- movl (%esi), %eax
- L(one):
- movl %eax, (%edi)
- movl SAVE_EDI, %edi
- movl SAVE_ESI, %esi
- ret
- L(use_movsl):
- C eax
- C ebx
- C ecx size-3
- C edx
- C esi src
- C edi dst
- C ebp
- addl $3, %ecx
- cld C better safe than sorry, see mpn/x86/README
- rep
- movsl
- L(zero):
- movl SAVE_ESI, %esi
- movl SAVE_EDI, %edi
- ret
- EPILOGUE()