数学计算

开发平台：
Unix_Linux

divrem_1.asm：源码内容
							dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
C
C Algorithm:
C
C The method and nomenclature follow part 8 of "Division by Invariant
C Integers using Multiplication" by Granlund and Montgomery, reference in
C gmp.texi.
C
C "m" is written for what is m' in the paper, and "d" for d_norm, which
C won't cause any confusion since it's only the normalized divisor that's of
C any use in the code.  "b" is written for 2^N, the size of a limb, N being
C 32 here.
C
C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets
C us have just a psubq on the dependent chain.
C
C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
C
C Notes:
C
C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
C carry, since in normal circumstances that will be a very rare event.
C
C The test for skipping a division is branch free (once size>=1 is tested).
C The store to the destination high limb is 0 when a divide is skipped, or
C if it's not skipped then a copy of the src high limb is stored.  The
C latter is in case src==dst.
C
C There's a small bias towards expecting xsize==0, by having code for
C xsize==0 in a straight line and xsize!=0 under forward jumps.
C
C Enhancements:
C
C The loop measures 32 cycles, but the dependent chain would suggest it
C could be done with 30.  Not sure where to start looking for the extras.
C
C Alternatives:
C
C If the divisor is normalized (high bit set) then a division step can
C always be skipped, since the high destination limb is always 0 or 1 in
C that case.  It doesn't seem worth checking for this though, since it
C probably occurs infrequently.
dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The inverse takes about 80-90 cycles to calculate, but after that the
dnl  multiply is 32 c/l versus division at about 58 c/l.
dnl
dnl  At 4 limbs the div is a touch faster than the mul (and of course
dnl  simpler), so start the mul from 5 limbs.
deflit(MUL_THRESHOLD, 5)
defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)
dnl  re-use parameter space
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_SRC')
define(SAVE_EDI,`PARAM_DIVISOR')
define(SAVE_EBX,`PARAM_DST')
	TEXT
	ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)
	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C carry if can't skip a div
	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi
	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp
	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi
	movl	-4(%esi,%ecx,4), %eax	C src high limb
	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx
	movd	PARAM_PREINV_INVERSE, %mm4
	movd	PARAM_PREINV_SHIFT, %mm7  C l
	cmpl	%ebp, %eax		C high cmp divisor
	cmovc(	%eax, %edx)		C high is carry if high<divisor
	movd	%edx, %mm0		C carry
	movd	%edx, %mm1		C carry
	movl	$0, %edx
	movd	%ebp, %mm5		C d
	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
					C (the latter in case src==dst)
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	movl	%edx, (%edi,%ecx,4)	C dst high limb
	sbbl	$0, %ecx		C skip one division if high<divisor
	movl	$32, %eax
	subl	PARAM_PREINV_SHIFT, %eax
	psllq	%mm7, %mm5		C d normalized
	leal	(%edi,%ecx,4), %edi	C &dst[xsize+size-1]
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
	movd	%eax, %mm6		C 32-l
	jmp	L(start_preinv)
EPILOGUE()
	ALIGN(16)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
	movl	PARAM_CARRY, %edx
	movl	PARAM_SIZE, %ecx
	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi
	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp
	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi
	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jmp	L(start_1c)
EPILOGUE()
	ALIGN(16)
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)
	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C initial carry (if can't skip a div)
	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi
	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp
	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi
	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	orl	%ecx, %ecx		C size
	jz	L(no_skip_div)		C if size==0
	movl	-4(%esi,%ecx,4), %eax	C src high limb
	cmpl	%ebp, %eax		C high cmp divisor
	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
	movl	%edx, (%edi,%ecx,4)	C dst high limb
	movl	$0, %edx
	cmovc(	%eax, %edx)		C high is carry if high<divisor
	sbbl	$0, %ecx		C size-1 if high<divisor
L(no_skip_div):
L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor
	leal	(%ebx,%ecx), %eax	C size+xsize
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
	leal	(%edi,%ecx,4), %edi	C &dst[size+xsize-1]
	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)
	orl	%ecx, %ecx
	jz	L(divide_no_integer)	C if size==0
L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	carry
	C esi	src, decrementing
	C edi	dst, decrementing
	C ebp	divisor
	movl	(%esi), %eax
	subl	$4, %esi
	divl	%ebp
	movl	%eax, (%edi)
	subl	$4, %edi
	subl	$1, %ecx
	jnz	L(divide_integer)
L(divide_no_integer):
	orl	%ebx, %ebx
	jnz	L(divide_fraction)	C if xsize!=0
L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp
	movl	%edx, %eax
	ret
L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	carry
	C esi
	C edi	dst, decrementing
	C ebp	divisor
	movl	$0, %eax
	divl	%ebp
	movl	%eax, (%edi)
	subl	$4, %edi
	subl	$1, %ebx
	jnz	L(divide_fraction)
	jmp	L(divide_done)
C -----------------------------------------------------------------------------
L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp	divisor
	bsrl	%ebp, %eax		C 31-l
	movd	%edx, %mm0		C carry
	movd	%edx, %mm1		C carry
	movl	%ecx, %edx		C size
	movl	$31, %ecx
	C
	xorl	%eax, %ecx		C l = leading zeros on d
	addl	$1, %eax
	shll	%cl, %ebp		C d normalized
	movd	%ecx, %mm7		C l
	movl	%edx, %ecx		C size
	movd	%eax, %mm6		C 32-l
	movl	$-1, %edx
	movl	$-1, %eax
	C
	subl	%ebp, %edx		C (b-d)-1 so  edx:eax = b*(b-d)-1
	divl	%ebp			C floor (b*(b-d)-1 / d)
	movd	%ebp, %mm5		C d
	C
	movd	%eax, %mm4		C m
L(start_preinv):
	C eax	inverse
	C ebx	xsize
	C ecx	size
	C edx
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp
	C
	C mm0	carry
	C mm1	carry
	C mm2
	C mm4	m
	C mm5	d
	C mm6	31-l
	C mm7	l
	psllq	%mm7, %mm0		C n2 = carry << l, for size==0
	subl	$1, %ecx
	jb	L(integer_none)
	movd	(%esi), %mm0		C src high limb
	punpckldq %mm1, %mm0
	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l)
	jz	L(integer_last)
C The dependent chain here consists of
C
C	2   paddd    n1+n2
C	8   pmuludq  m*(n1+n2)
C	2   paddq    n2:nadj + m*(n1+n2)
C	2   psrlq    q1
C	8   pmuludq  d*q1
C	2   psubq    (n-d)-q1*d
C	2   psrlq    high n-(q1+1)*d mask
C	2   pand     d masked
C	2   paddd    n2+d addback
C	--
C	30
C
C But it seems to run at 32 cycles, so presumably there's something else
C going on.
	ALIGN(16)
L(integer_top):
	C eax
	C ebx
	C ecx	counter, size-1 to 0
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l
	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')
	movd	-4(%esi), %mm1		C next src limbs
	movd	(%esi), %mm2
	leal	-4(%esi), %esi
	punpckldq %mm2, %mm1
	psrlq	%mm6, %mm1		C n10
	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow
	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj
	pmuludq	%mm4, %mm2		C m*(n2+n1)
	C
	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pxor	%mm2, %mm2		C break dependency, saves 4 cycles
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1
	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
	paddd	%mm1, %mm2		C q1+1
	pmuludq	%mm5, %mm1		C q1*d
	punpckldq %mm0, %mm3		C n = n2:n10
	pxor	%mm0, %mm0
	psubq	%mm5, %mm3		C n - d
	C
	psubq	%mm1, %mm3		C n - (q1+1)*d
	por	%mm3, %mm0		C copy remainder -> new n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1
	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')
	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d
	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi
	subl	$1, %ecx
	ja	L(integer_top)
L(integer_last):
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi	&src[0]
	C edi	&dst[xsize]
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6
	C mm7	l
	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')
	movd	(%esi), %mm1		C src[0]
	psllq	%mm7, %mm1		C n10
	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow
	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj
	pmuludq	%mm4, %mm2		C m*(n2+n1)
	C
	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1
	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
	paddd	%mm1, %mm2		C q1
	pmuludq	%mm5, %mm1		C q1*d
	punpckldq %mm0, %mm3		C n
	psubq	%mm5, %mm3		C n - d
	pxor	%mm0, %mm0
	C
	psubq	%mm1, %mm3		C n - (q1+1)*d
	por	%mm3, %mm0		C remainder -> n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1
	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')
	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d
	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi
L(integer_none):
	C eax
	C ebx	xsize
	orl	%ebx, %ebx
	jnz	L(fraction_some)	C if xsize!=0
L(fraction_done):
	movl	SAVE_EBP, %ebp
	psrld	%mm7, %mm0		C remainder
	movl	SAVE_EDI, %edi
	movd	%mm0, %eax
	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx
	emms
	ret
C -----------------------------------------------------------------------------
C
L(fraction_some):
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi
	C edi	&dst[xsize-1]
	C ebp
L(fraction_top):
	C eax
	C ebx	counter, xsize iterations
	C ecx
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l
	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')
	movq	%mm0, %mm1		C n2
	pmuludq	%mm4, %mm0		C m*n2
	pcmpeqd	%mm2, %mm2
	psrlq	$63, %mm2
	C
	psrlq	$32, %mm0		C high(m*n2)
	paddd	%mm1, %mm0		C q1 = high(n2:0 + m*n2)
	paddd	%mm0, %mm2		C q1+1
	pmuludq	%mm5, %mm0		C q1*d
	psllq	$32, %mm1		C n = n2:0
	psubq	%mm5, %mm1		C n - d
	C
	psubq	%mm0, %mm1		C r = n - (q1+1)*d
	pxor	%mm0, %mm0
	por	%mm1, %mm0		C r -> n2
	psrlq	$32, %mm1		C high n - (q1+1)*d, 0 or -1
	ASSERT(be,`C 0 or -1
	 movd	%mm1, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')
	paddd	%mm1, %mm2		C q
	pand	%mm5, %mm1		C mask & d
	paddd	%mm1, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi
	subl	$1, %ebx
	jne	L(fraction_top)
	jmp	L(fraction_done)
EPILOGUE()