mode1o.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:5k
- dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
- dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
- dnl
- dnl This file is part of the GNU MP Library.
- dnl
- dnl The GNU MP Library is free software; you can redistribute it and/or
- dnl modify it under the terms of the GNU Lesser General Public License as
- dnl published by the Free Software Foundation; either version 3 of the
- dnl License, or (at your option) any later version.
- dnl
- dnl The GNU MP Library is distributed in the hope that it will be useful,
- dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
- dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- dnl Lesser General Public License for more details.
- dnl
- dnl You should have received a copy of the GNU Lesser General Public License
- dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
- include(`../config.m4')
- C P5: 23.0 cycles/limb
- C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
- C mp_limb_t divisor);
- C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
- C mp_limb_t divisor, mp_limb_t carry);
- C
- C There seems no way to pair up the two lone instructions in the main loop.
- C
- C The special case for size==1 saves about 20 cycles (non-PIC), making it
- C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
- C all sizes.
- C
- C Alternatives:
- C
- C Using mmx for the multiplies might be possible, with pmullw and pmulhw
- C having just 3 cycle latencies, but carry bit handling would probably be
- C complicated.
- defframe(PARAM_CARRY, 16)
- defframe(PARAM_DIVISOR,12)
- defframe(PARAM_SIZE, 8)
- defframe(PARAM_SRC, 4)
- dnl re-using parameter space
- define(VAR_INVERSE,`PARAM_SIZE')
- TEXT
- ALIGN(16)
- PROLOGUE(mpn_modexact_1c_odd)
- deflit(`FRAME',0)
- movl PARAM_DIVISOR, %eax
- movl PARAM_CARRY, %edx
- jmp L(start_1c)
- EPILOGUE()
- ALIGN(16)
- PROLOGUE(mpn_modexact_1_odd)
- deflit(`FRAME',0)
- movl PARAM_DIVISOR, %eax
- xorl %edx, %edx C carry
- L(start_1c):
- ifdef(`PIC',`
- call L(here) FRAME_pushl()
- L(here):
- shrl %eax C d/2
- movl (%esp), %ecx C eip
- addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
- movl %ebx, (%esp) C push ebx
- andl $127, %eax
- movl PARAM_SIZE, %ebx
- movl binvert_limb_table@GOT(%ecx), %ecx
- subl $2, %ebx
- movb (%eax,%ecx), %cl C inv 8 bits
- jc L(one_limb)
- ',`
- dnl non-PIC
- shrl %eax C d/2
- pushl %ebx FRAME_pushl()
- movl PARAM_SIZE, %ebx
- andl $127, %eax
- subl $2, %ebx
- jc L(one_limb)
- movb binvert_limb_table(%eax), %cl C inv 8 bits
- ')
- movl %ecx, %eax
- addl %ecx, %ecx C 2*inv
- imull %eax, %eax C inv*inv
- imull PARAM_DIVISOR, %eax C inv*inv*d
- subl %eax, %ecx C inv = 2*inv - inv*inv*d
- movl %ecx, %eax
- addl %ecx, %ecx C 2*inv
- imull %eax, %eax C inv*inv
- imull PARAM_DIVISOR, %eax C inv*inv*d
- subl %eax, %ecx C inv = 2*inv - inv*inv*d
- pushl %esi FRAME_pushl()
- ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
- movl %ecx, %eax
- imull PARAM_DIVISOR, %eax
- cmpl $1, %eax')
- movl PARAM_SRC, %esi
- movl %ecx, VAR_INVERSE
- movl (%esi), %eax C src[0]
- leal 4(%esi,%ebx,4), %esi C &src[size-1]
- xorl $-1, %ebx C -(size-1)
- ASSERT(nz)
- jmp L(entry)
- C The use of VAR_INVERSE means only a store is needed for that value, rather
- C than a push and pop of say %edi.
- ALIGN(16)
- L(top):
- C eax scratch, low product
- C ebx counter, limbs, negative
- C ecx carry bit
- C edx scratch, high product
- C esi &src[size-1]
- C edi
- C ebp
- mull PARAM_DIVISOR C h:dummy = q*d
- movl (%esi,%ebx,4), %eax C src[i]
- subl %ecx, %edx C h -= -c
- L(entry):
- subl %edx, %eax C s = src[i] - h
- sbbl %ecx, %ecx C new -c (0 or -1)
- imull VAR_INVERSE, %eax C q = s*i
- incl %ebx
- jnz L(top)
- mull PARAM_DIVISOR
- movl (%esi), %eax C src high
- subl %ecx, %edx C h -= -c
- cmpl PARAM_DIVISOR, %eax
- jbe L(skip_last)
- deflit(FRAME_LAST,FRAME)
- subl %edx, %eax C s = src[i] - h
- popl %esi FRAME_popl()
- sbbl %ecx, %ecx C c (0 or -1)
- popl %ebx FRAME_popl()
- imull VAR_INVERSE, %eax C q = s*i
- mull PARAM_DIVISOR C h:dummy = q*d
- movl %edx, %eax
- subl %ecx, %eax
- ret
- C When high<divisor can skip last step.
- L(skip_last):
- deflit(`FRAME',FRAME_LAST)
- C eax src high
- C ebx
- C ecx
- C edx r
- C esi
- subl %eax, %edx C r-s
- popl %esi FRAME_popl()
- sbbl %eax, %eax C -1 if underflow
- movl PARAM_DIVISOR, %ebx
- andl %ebx, %eax C divisor if underflow
- popl %ebx FRAME_popl()
- addl %edx, %eax C addback if underflow
- ret
- C Special case for size==1 using a division for r = c-a mod d.
- C Could look for a-c<d and save a division sometimes, but that doesn't seem
- C worth bothering about.
- L(one_limb):
- deflit(`FRAME',4)
- C eax
- C ebx size-2 (==-1)
- C ecx
- C edx carry
- C esi src end
- C edi
- C ebp
- movl %edx, %eax
- movl PARAM_SRC, %edx
- movl PARAM_DIVISOR, %ecx
- popl %ebx FRAME_popl()
- subl (%edx), %eax C c-a
- sbbl %edx, %edx
- decl %ecx C d-1
- andl %ecx, %edx C b*d+c-a if c<a, or c-a if c>=a
- divl PARAM_DIVISOR
- movl %edx, %eax
- ret
- EPILOGUE()