数学计算

开发平台：
Unix_Linux

gcd_1.asm：源码内容
							dnl  AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
dnl  Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C K6: 9.5 cycles/bit (approx)   1x1 gcd
C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
C
C This code is nothing very special, but offers a speedup over what gcc 2.95
C can do with mpn/generic/gcd_1.c.
C
C Future:
C
C Using a lookup table to count trailing zeros seems a touch quicker, but
C after a slightly longer startup.  Might be worthwhile if an mpn_gcd_2 used
C it too.
dnl  If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
dnl  bigger than y, then a division x%y is done to reduce it.
dnl
dnl  A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
dnl  there should be an advantage in the divl at about 4 or 5 bits, which is
dnl  what's found.
deflit(DIV_THRESHOLD, 5)
defframe(PARAM_LIMB, 12)
defframe(PARAM_SIZE,  8)
defframe(PARAM_SRC,   4)
	TEXT
	ALIGN(16)
PROLOGUE(mpn_gcd_1)
deflit(`FRAME',0)
	ASSERT(ne, `cmpl $0, PARAM_LIMB')
	ASSERT(ae, `cmpl $1, PARAM_SIZE')
	movl	PARAM_SRC, %eax
	pushl	%ebx			FRAME_pushl()
	movl	PARAM_LIMB, %edx
	movl	$-1, %ecx
	movl	(%eax), %ebx		C src low limb
	movl	%ebx, %eax		C src low limb
	orl	%edx, %ebx
L(common_twos):
	shrl	%ebx
	incl	%ecx
	jnc	L(common_twos)		C 1/4 chance on random data
	shrl	%cl, %edx		C y
	cmpl	$1, PARAM_SIZE
	ja	L(size_two_or_more)
	ASSERT(nz, `orl	%eax, %eax')	C should have src limb != 0
	shrl	%cl, %eax		C x
	C Swap if necessary to make x>=y.  Measures a touch quicker as a
	C jump than a branch free calculation.
	C
	C eax	x
	C ebx
	C ecx	common twos
	C edx	y
	movl	%eax, %ebx
	cmpl	%eax, %edx
	jb	L(noswap)
	movl	%edx, %eax
	movl	%ebx, %edx
	movl	%eax, %ebx
L(noswap):
	C See if it's worth reducing x with a divl.
	C
	C eax	x
	C ebx	x
	C ecx	common twos
	C edx	y
	shrl	$DIV_THRESHOLD, %ebx
	cmpl	%ebx, %edx
	ja	L(nodiv)
	C Reduce x to x%y.
	C
	C eax	x
	C ebx
	C ecx	common twos
	C edx	y
	movl	%edx, %ebx
	xorl	%edx, %edx
	divl	%ebx
	orl	%edx, %edx	C y
	nop	C code alignment
	movl	%ebx, %eax	C x
	jz	L(done_shll)
L(nodiv):
	C eax	x
	C ebx
	C ecx	common twos
	C edx	y
	C esi
	C edi
	C ebp
L(strip_y):
	shrl	%edx
	jnc	L(strip_y)
	leal	1(%edx,%edx), %edx
	movl	%ecx, %ebx	C common twos
	leal	1(%eax), %ecx
	jmp	L(strip_x_and)
C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
C on a 50/50 chance of 0 or 1.  The chance of the next bit also being 0 is
C only 1/4.
C
C A second computed %cl shift was tried, but that measured a touch slower
C than branching back.
C
C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
C measured about 1 cycle/bit slower.
	C eax	x
	C ebx	common twos
	C ecx	scratch
	C edx	y
	ALIGN(4)
L(swap):
	addl	%eax, %edx	C x-y+y = x
	negl	%eax		C -(x-y) = y-x
L(strip_x):
	shrl	%eax		C odd-odd = even, so always one to strip
	ASSERT(nz)
L(strip_x_leal):
	leal	1(%eax), %ecx
L(strip_x_and):
	andl	$1, %ecx	C (x^1)&1
	shrl	%cl, %eax	C shift if x even
	testb	$1, %al
	jz	L(strip_x)
	ASSERT(nz,`testl $1, %eax')	C x, y odd
	ASSERT(nz,`testl $1, %edx')
	subl	%edx, %eax
	jb	L(swap)
	ja	L(strip_x)
	movl	%edx, %eax
	movl	%ebx, %ecx
L(done_shll):
	shll	%cl, %eax
	popl	%ebx
	ret
C -----------------------------------------------------------------------------
C Two or more limbs.
C
C x={src,size} is reduced modulo y using either a plain mod_1 style
C remainder, or a modexact_1 style exact division.
deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
	ALIGN(8)
L(size_two_or_more):
	C eax
	C ebx
	C ecx	common twos
	C edx	y, without common twos
	C esi
	C edi
	C ebp
deflit(FRAME_TWO_OR_MORE, FRAME)
	pushl	%edi		defframe_pushl(SAVE_EDI)
	movl	PARAM_SRC, %ebx
L(y_twos):
	shrl	%edx
	jnc	L(y_twos)
	movl	%ecx, %edi		C common twos
	movl	PARAM_SIZE, %ecx
	pushl	%esi		defframe_pushl(SAVE_ESI)
	leal	1(%edx,%edx), %esi	C y (odd)
	movl	-4(%ebx,%ecx,4), %eax	C src high limb
	cmpl	%edx, %eax		C carry if high<divisor
	sbbl	%edx, %edx		C -1 if high<divisor
	addl	%edx, %ecx		C skip one limb if high<divisor
	andl	%eax, %edx
	cmpl	$MODEXACT_THRESHOLD, %ecx
	jae	L(modexact)
L(divide_top):
	C eax	scratch (quotient)
	C ebx	src
	C ecx	counter, size-1 to 1
	C edx	carry (remainder)
	C esi	divisor (odd)
	C edi
	C ebp
	movl	-4(%ebx,%ecx,4), %eax
	divl	%esi
	loop	L(divide_top)
	movl	%edx, %eax	C x
	movl	%esi, %edx	C y (odd)
	movl	%edi, %ebx	C common twos
	popl	%esi
	popl	%edi
	leal	1(%eax), %ecx
	orl	%eax, %eax
	jnz	L(strip_x_and)
	movl	%ebx, %ecx
	movl	%edx, %eax
	shll	%cl, %eax
	popl	%ebx
	ret
	ALIGN(8)
L(modexact):
	C eax
	C ebx	src ptr
	C ecx	size or size-1
	C edx
	C esi	y odd
	C edi	common twos
	C ebp
	movl	PARAM_SIZE, %eax
	pushl	%esi		FRAME_pushl()
	pushl	%eax		FRAME_pushl()
	pushl	%ebx		FRAME_pushl()
ifdef(`PIC',`
	nop	C code alignment
	call	L(movl_eip_ebx)
L(here):
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
	call	GSYM_PREFIX`'mpn_modexact_1_odd@PLT
',`
	call	GSYM_PREFIX`'mpn_modexact_1_odd
')
	movl	%esi, %edx		C y odd
	movl	SAVE_ESI, %esi
	movl	%edi, %ebx		C common twos
	movl	SAVE_EDI, %edi
	addl	$eval(FRAME - FRAME_TWO_OR_MORE), %esp
	orl	%eax, %eax
	leal	1(%eax), %ecx
	jnz	L(strip_x_and)
	movl	%ebx, %ecx
	movl	%edx, %eax
	shll	%cl, %eax
	popl	%ebx
	ret
ifdef(`PIC',`
L(movl_eip_ebx):
	movl	(%esp), %ebx
	ret_internal
')
EPILOGUE()