dnl  aorsmul_1.asm
dnl  (download-site metadata) uploader: qaz666999, upload date: 2022-08-06,
dnl  archive size: 2570k, file size: 3k
dnl  AMD64 mpn_addmul_1 and mpn_submul_1.

dnl  Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
include(`../config.m4')

C Measured cycles/limb on various x86-64 cores:
C K8,K9:	 2.5
C K10:		 2.5
C P4:		14.9
C P6 core2:	 5.09
C P6 corei7:
C P6 atom:	21.3

C The inner loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C TODO:
C  * The inner loop is great, but the prologue and epilogue code was
C    quickly written.  Tune it!

C INPUT PARAMETERS (SysV AMD64 argument registers)
define(`rp',      `%rdi')	C result (destination) limb pointer
define(`up',      `%rsi')	C source limb pointer
define(`n_param', `%rdx')	C limb count as passed in (rdx is clobbered by mul)
define(`vl',      `%rcx')	C the single multiplier limb
define(`n',       `%r11')	C working copy of the limb count

C One source provides both entry points: the build defines exactly one of
C OPERATION_addmul_1 / OPERATION_submul_1, which fixes `ADDSUB' to `add'
C or `sub' and picks the exported function name.
ifdef(`OPERATION_addmul_1',`
  define(`ADDSUB', `add')
  define(`func',   `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
  define(`ADDSUB', `sub')
  define(`func',   `mpn_submul_1')
')
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C mp_limb_t func (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
C   rp[0..n-1] ADDSUB= up[0..n-1] * vl; the carry-out (addmul) or
C   borrow-out (submul) limb is returned in %rax.
C Only callee-saved register touched is %rbx (saved/restored here).
C The 4-way unrolled loop is entered at one of four points chosen by
C n mod 4; the index n is negated so it counts up toward zero.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	mov	(up), %rax		C read first u limb early
	push	%rbx
	mov	n_param, %rbx		C move away n from rdx, mul uses it
	mul	vl			C rdx:rax = up[0] * vl
	mov	%rbx, %r11

	and	$3, R32(%rbx)		C n mod 4 selects the loop entry
	jz	L(b0)
	cmp	$2, R32(%rbx)
	jz	L(b2)
	jg	L(b3)

L(b1):	dec	n
	jne	L(gt1)			C n == 1: single product already done
	ADDSUB	%rax, (rp)
	jmp	L(ret)
L(gt1):	lea	8(up,n,8), up		C bias pointers so the negative
	lea	-8(rp,n,8), rp		C   index n walks both arrays
	neg	n
	xor	%r10, %r10
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r9
	mov	(up,n,8), %rax
	mov	%rdx, %r8
	jmp	L(L1)

L(b0):	lea	(up,n,8), up
	lea	-16(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	mov	%rax, %r8
	mov	%rdx, %rbx
	jmp	L(L0)

L(b3):	lea	-8(up,n,8), up
	lea	-24(rp,n,8), rp
	neg	n
	mov	%rax, %rbx
	mov	%rdx, %r10
	jmp	L(L3)

L(b2):	lea	-16(up,n,8), up
	lea	-32(rp,n,8), rp
	neg	n
	xor	%r8, %r8
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%rdx, %r9
	jmp	L(L2)

	C Core loop: 4 limbs per iteration, carry rotating through
	C r9/r8/rbx/r10.  Note the mid-loop zeroing uses mov $0 / reg
	C copies, never xor: the carry flag set by ADDSUB is still live
	C across them and xor would destroy it.
	ALIGN(16)
L(top):	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	mov	(up,n,8), %rax
	adc	%rdx, %r8
	mov	$0, %r10d
L(L1):	mul	vl
	ADDSUB	%r9, 8(rp,n,8)
	adc	%rax, %r8
	adc	%rdx, %rbx
L(L0):	mov	8(up,n,8), %rax
	mul	vl
	ADDSUB	%r8, 16(rp,n,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(L3):	mov	16(up,n,8), %rax
	mul	vl
	ADDSUB	%rbx, 24(rp,n,8)
	mov	$0, %r8d		C zero
	mov	%r8, %rbx		C zero
	adc	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%r8, %r9		C zero
	adc	%rdx, %r9
L(L2):	mul	vl
	add	$4, n
	js	L(top)			C loop while index still negative

	C Wind down: commit the last two pending limbs, then fold the
	C final carry into rdx and return it.
	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	adc	%r8, %rdx
	ADDSUB	%r9, 8(rp,n,8)
L(ret):	adc	$0, %rdx
	mov	%rdx, %rax		C return carry/borrow limb
	pop	%rbx
	ret
EPILOGUE()