mul_basecase.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:18k
- dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
- dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
- dnl
- dnl This file is part of the GNU MP Library.
- dnl
- dnl The GNU MP Library is free software; you can redistribute it and/or modify
- dnl it under the terms of the GNU Lesser General Public License as published
- dnl by the Free Software Foundation; either version 3 of the License, or (at
- dnl your option) any later version.
- dnl
- dnl The GNU MP Library is distributed in the hope that it will be useful, but
- dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- dnl License for more details.
- dnl
- dnl You should have received a copy of the GNU Lesser General Public License
- dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
- include(`../config.m4')
- C TODO:
- C * Improve ad-hoc outer loop code and register handling. Some feed-in
- C scheduling could improve things by several cycles per outer iteration.
- C * In code for un <= 3, try keeping accumulation operands in registers,
- C without storing intermediates to rp.
- C * We might want to keep 32 in a free mm register, since the register form is
- C 3 bytes and the immediate form is 4 bytes. About 70 bytes to save.
- C * Look into different loop alignment, we now expand the code about 50 bytes
- C with possibly needless alignment.
- C * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
- C * Use OSP, should solve feed-in latency problems.
- C * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
- C * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
- C so that they can share feed-in code, and changing the branch targets from
- C L<n> to Lm<nn>.
- C cycles/limb
- C P6 model 9 (Banias) ?
- C P6 model 13 (Dothan) 5.24
- C P6 model 14 (Yonah) ?
- C P4 model 0-1 (Willamette): 5
- C P4 model 2 (Northwood): 4.60 at 32 limbs
- C P4 model 3-4 (Prescott): 4.94 at 32 limbs
- C INPUT PARAMETERS (stack offsets at function entry, before any push)
- C rp sp + 4
- C up sp + 8
- C un sp + 12
- C vp sp + 16
- C vn sp + 20
- C -----------------------------------------------------------------------
- C mpn_mul_basecase entry: save esi and ebx, then load the five arguments.
- C Register roles established here:
- C   edx = rp, eax = up, ecx = un, esi = vp, ebx = vn
- C   mm7 = the limb loaded from (vp), used as multiplier for the first pass
- C The two pushes below move every argument 8 bytes further from esp, which
- C is why rp is read from 12(%esp) rather than from sp + 4.
- C -----------------------------------------------------------------------
- TEXT
- ALIGN(16)
- PROLOGUE(mpn_mul_basecase)
- push %esi
- push %ebx
- mov 12(%esp), %edx C rp
- mov 16(%esp), %eax C up
- mov 20(%esp), %ecx C un
- mov 24(%esp), %esi C vp
- mov 28(%esp), %ebx C vn
- movd (%esi), %mm7 C mm7 = first limb at vp
- C Size dispatch on un (ecx).  un > 3 (unsigned compare) goes to L(big).
- C For un <= 3 the first product, (up) times mm7, is started in mm6 before
- C the remaining branches are resolved.  movd and pmuludq do not modify
- C EFLAGS, so the jz that follows them still tests the cmp $3 result and is
- C taken exactly when un == 3.
- L(ent): cmp $3, %ecx
- ja L(big)
- movd (%eax), %mm6
- pmuludq %mm7, %mm6
- jz L(un3)
- cmp $2, %ecx
- jz L(un2)
- C un=1: store the 64-bit product in mm6 as two 32-bit limbs at rp[0] and
- C rp[1], then return.  vn (ebx) is never examined on this path.
- C NOTE(review): presumably callers guarantee vn <= un -- confirm.
- L(un1): movd %mm6, (%edx) C un=1
- psrlq $32, %mm6 C un=1
- movd %mm6, 4(%edx) C un=1
- jmp L(rtr) C un=1
- C un=2, first row: mm6 already holds up[0] times the first v limb.
- C mm6 is used as a 64-bit accumulator: its low 32 bits are stored as the
- C next result limb, it is shifted right by 32, and the next product is
- C added in.  This row writes rp[0], rp[1] and rp[2].
- L(un2): movd 4(%eax), %mm1 C un=2
- pmuludq %mm7, %mm1 C un=2
- movd %mm6, (%edx) C un=2
- psrlq $32, %mm6 C un=2
- paddq %mm1, %mm6 C un=2
- movd %mm6, 4(%edx) C un=2
- psrlq $32, %mm6 C un=2
- movd %mm6, 8(%edx) C un=2
- C One v limb consumed.  If ebx reaches zero (vn was 1) we are done.
- dec %ebx C un=2
- jz L(rtr) C un=2
- C un=2, second row: multiply up[0], up[1] by the limb at 4(vp) and add the
- C products to rp[1] and rp[2] (reloaded from memory into mm4 and mm5).
- C rp[3] receives the final carry-out.  No further rows are handled here;
- C execution falls through into L(rtr).
- movd 4(%esi), %mm7 C un=2
- movd (%eax), %mm6 C un=2
- pmuludq %mm7, %mm6 C un=2
- movd 4(%eax), %mm1 C un=2
- movd 4(%edx), %mm4 C un=2
- pmuludq %mm7, %mm1 C un=2
- movd 8(%edx), %mm5 C un=2
- paddq %mm4, %mm6 C un=2
- paddq %mm1, %mm5 C un=2
- movd %mm6, 4(%edx) C un=2
- psrlq $32, %mm6 C un=2
- paddq %mm5, %mm6 C un=2
- movd %mm6, 8(%edx) C un=2
- psrlq $32, %mm6 C un=2
- movd %mm6, 12(%edx) C un=2
- C Shared return for the un <= 3 paths: leave MMX state with emms and
- C restore ebx and esi in the reverse order of the entry pushes.
- L(rtr): emms
- pop %ebx
- pop %esi
- ret
- C un=3, first row: mm6 already holds up[0] times the first v limb.
- C Products of up[1] and up[2] are formed in mm1 and mm2 and folded into
- C the mm6 accumulator one limb at a time.  This row writes rp[0]..rp[3].
- L(un3): movd 4(%eax), %mm1 C un=3
- pmuludq %mm7, %mm1 C un=3
- movd 8(%eax), %mm2 C un=3
- pmuludq %mm7, %mm2 C un=3
- movd %mm6, (%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm1, %mm6 C un=3
- movd %mm6, 4(%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm2, %mm6 C un=3
- movd %mm6, 8(%edx) C un=3
- psrlq $32, %mm6 C un=3
- movd %mm6, 12(%edx) C un=3
- C Return if that was the only v limb (ebx reaches zero).
- dec %ebx C un=3
- jz L(rtr) C un=3
- C un=3, second row: multiplier is the limb at 4(vp).  The products are
- C added to rp[1]..rp[3], which are reloaded into mm4/mm5, and the final
- C carry-out is written to rp[4].
- movd 4(%esi), %mm7 C un=3
- movd (%eax), %mm6 C un=3
- pmuludq %mm7, %mm6 C un=3
- movd 4(%eax), %mm1 C un=3
- movd 4(%edx), %mm4 C un=3
- pmuludq %mm7, %mm1 C un=3
- movd 8(%eax), %mm2 C un=3
- movd 8(%edx), %mm5 C un=3
- pmuludq %mm7, %mm2 C un=3
- paddq %mm4, %mm6 C un=3
- paddq %mm1, %mm5 C un=3
- movd 12(%edx), %mm4 C un=3
- movd %mm6, 4(%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm5, %mm6 C un=3
- paddq %mm2, %mm4 C un=3
- movd %mm6, 8(%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm4, %mm6 C un=3
- movd %mm6, 12(%edx) C un=3
- psrlq $32, %mm6 C un=3
- movd %mm6, 16(%edx) C un=3
- C Return if there is no third v limb.
- dec %ebx C un=3
- jz L(rtr) C un=3
- C un=3, third row: multiplier is the limb at 8(vp).  The products are
- C added to rp[2]..rp[4] and the final carry-out is written to rp[5].
- C This is the last row handled on this path.
- movd 8(%esi), %mm7 C un=3
- movd (%eax), %mm6 C un=3
- pmuludq %mm7, %mm6 C un=3
- movd 4(%eax), %mm1 C un=3
- movd 8(%edx), %mm4 C un=3
- pmuludq %mm7, %mm1 C un=3
- movd 8(%eax), %mm2 C un=3
- movd 12(%edx), %mm5 C un=3
- pmuludq %mm7, %mm2 C un=3
- paddq %mm4, %mm6 C un=3
- paddq %mm1, %mm5 C un=3
- movd 16(%edx), %mm4 C un=3
- movd %mm6, 8(%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm5, %mm6 C un=3
- paddq %mm2, %mm4 C un=3
- movd %mm6, 12(%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm4, %mm6 C un=3
- movd %mm6, 16(%edx) C un=3
- psrlq $32, %mm6 C un=3
- movd %mm6, 20(%edx) C un=3
- jmp L(rtr)
- C General case, un > 3.  edi is saved as a third scratch register; with
- C three registers pushed the arguments now sit at rp = 16(%esp),
- C up = 20(%esp), un = 24(%esp).  mm6 (the running accumulator) is cleared
- C and esi is stepped past the v limb that is already in mm7.
- C The code then dispatches on un mod 4 to one of four variants, L(0) to
- C L(3), each of which enters its 4-way unrolled loop at a different point.
- L(big): push %edi
- pxor %mm6, %mm6
- lea 4(%esi), %esi
- and $3, %ecx
- jz L(0)
- C ecx is 1, 2 or 3 here: below 2 selects L(1), equal selects L(2).
- cmp $2, %ecx
- jc L(1)
- jz L(2)
- jmp L(3) C FIXME: one case should fall through
- C -----------------------------------------------------------------------
- C Variant for un mod 4 == 0.
- C "m 0" is the first pass, which writes rp with up times the first v limb.
- C "am 0" is the pass run for each further v limb, which adds up times that
- C limb into rp starting one limb higher each time.
- C Common register use in both passes:
- C   eax = up pointer, edx = rp pointer, both advanced 16 bytes per iteration
- C   ecx = negative loop counter, stepped by 4 until it reaches zero
- C   mm7 = current v limb, mm6 = 64-bit accumulator (store low half, shift)
- C -----------------------------------------------------------------------
- C Feed-in for the first pass.  ecx becomes (un mod 4) - un, and that value
- C overwrites the un stack slot so later passes can reload it.
- L(0): movd (%eax), %mm3 C m 0
- sub 24(%esp), %ecx C inner loop count m 0
- mov %ecx, 24(%esp) C update loop count for later m 0
- pmuludq %mm7, %mm3 C m 0
- movd 4(%eax), %mm0 C m 0
- pmuludq %mm7, %mm0 C m 0
- movd 8(%eax), %mm1 C m 0
- jmp L(m00) C m 0
- ALIGN(16) C m 0
- C First-pass loop.  Products rotate through mm3, mm0, mm1, mm4; each one
- C is loaded and multiplied ahead of the point where it is added to mm6.
- L(lpm0):
- pmuludq %mm7, %mm4 C m 0
- paddq %mm0, %mm6 C m 0
- movd (%eax), %mm3 C m 0
- movd %mm6, -12(%edx) C m 0
- psrlq $32, %mm6 C m 0
- pmuludq %mm7, %mm3 C m 0
- paddq %mm1, %mm6 C m 0
- movd 4(%eax), %mm0 C m 0
- movd %mm6, -8(%edx) C m 0
- psrlq $32, %mm6 C m 0
- pmuludq %mm7, %mm0 C m 0
- paddq %mm4, %mm6 C m 0
- movd 8(%eax), %mm1 C m 0
- movd %mm6, -4(%edx) C m 0
- psrlq $32, %mm6 C m 0
- C Loop entry point for the first pass (reached by the jmp above).
- L(m00): pmuludq %mm7, %mm1 C m 0
- paddq %mm3, %mm6 C m 0
- movd 12(%eax), %mm4 C m 0
- movd %mm6, (%edx) C m 0
- psrlq $32, %mm6 C m 0
- lea 16(%eax), %eax C m 0
- lea 16(%edx), %edx C m 0
- add $4, %ecx C m 0
- ja L(lpm0) C m 0
- C Wind-down of the first pass: finish the products still in flight, load
- C the original rp into edi for the outer loop, and join the shared tail.
- pmuludq %mm7, %mm4 C m 0
- paddq %mm0, %mm6 C m 0
- movd %mm6, -12(%edx) C m 0
- psrlq $32, %mm6 C m 0
- paddq %mm1, %mm6 C m 0
- mov 16(%esp), %edi C rp 0
- jmp L(x0)
- C Outer loop, one iteration per remaining v limb: advance the rp base in
- C edi by one limb, fetch the next v limb into mm7, reload up and the loop
- C counter from the stack, and clear the accumulator.
- L(olp0):
- lea 4(%edi), %edi C am 0
- movd (%esi), %mm7 C am 0
- lea 4(%esi), %esi C am 0
- mov %edi, %edx C rp am 0
- mov 20(%esp), %eax C up am 0
- movd (%eax), %mm3 C am 0
- mov 24(%esp), %ecx C inner loop count am 0
- pxor %mm6, %mm6 C am 0
- pmuludq %mm7, %mm3 C am 0
- movd 4(%eax), %mm0 C am 0
- movd (%edx), %mm5 C am 0
- pmuludq %mm7, %mm0 C am 0
- movd 8(%eax), %mm1 C am 0
- paddq %mm3, %mm5 C am 0
- movd 4(%edx), %mm4 C am 0
- jmp L(am00) C am 0
- ALIGN(16) C am 0
- C Add-multiply loop.  Products rotate through mm3, mm0, mm1, mm2; existing
- C rp limbs are loaded alternately into mm5 and mm4, summed with a product,
- C and then folded into the mm6 accumulator.
- L(lam0):
- pmuludq %mm7, %mm2 C am 0
- paddq %mm4, %mm6 C am 0
- movd (%eax), %mm3 C am 0
- paddq %mm1, %mm5 C am 0
- movd -4(%edx), %mm4 C am 0
- movd %mm6, -12(%edx) C am 0
- psrlq $32, %mm6 C am 0
- pmuludq %mm7, %mm3 C am 0
- paddq %mm5, %mm6 C am 0
- movd 4(%eax), %mm0 C am 0
- paddq %mm2, %mm4 C am 0
- movd (%edx), %mm5 C am 0
- movd %mm6, -8(%edx) C am 0
- psrlq $32, %mm6 C am 0
- pmuludq %mm7, %mm0 C am 0
- paddq %mm4, %mm6 C am 0
- movd 8(%eax), %mm1 C am 0
- paddq %mm3, %mm5 C am 0
- movd 4(%edx), %mm4 C am 0
- movd %mm6, -4(%edx) C am 0
- psrlq $32, %mm6 C am 0
- C Loop entry point for the add-multiply pass.
- L(am00):
- pmuludq %mm7, %mm1 C am 0
- paddq %mm5, %mm6 C am 0
- movd 12(%eax), %mm2 C am 0
- paddq %mm0, %mm4 C am 0
- movd 8(%edx), %mm5 C am 0
- movd %mm6, (%edx) C am 0
- psrlq $32, %mm6 C am 0
- lea 16(%eax), %eax C am 0
- lea 16(%edx), %edx C am 0
- add $4, %ecx C am 0
- jnz L(lam0) C am 0
- C Wind-down of the add-multiply pass.
- pmuludq %mm7, %mm2 C am 0
- paddq %mm4, %mm6 C am 0
- paddq %mm1, %mm5 C am 0
- movd -4(%edx), %mm4 C am 0
- movd %mm6, -12(%edx) C am 0
- psrlq $32, %mm6 C am 0
- paddq %mm5, %mm6 C am 0
- paddq %mm2, %mm4 C am 0
- C Tail shared by both passes: store the last two limbs and the final
- C carry-out limb, then count down the remaining v limbs in ebx.
- L(x0): movd %mm6, -8(%edx) C am 0
- psrlq $32, %mm6 C am 0
- paddq %mm4, %mm6 C am 0
- movd %mm6, -4(%edx) C am 0
- psrlq $32, %mm6 C am 0
- movd %mm6, (%edx) C am 0
- dec %ebx C am 0
- jnz L(olp0) C am 0
- C Exit: leave MMX state and restore edi, ebx, esi in reverse push order.
- L(oel0):
- emms C 0
- pop %edi C 0
- pop %ebx C 0
- pop %esi C 0
- ret C 0
- C -----------------------------------------------------------------------
- C Variant for un mod 4 == 1.
- C "m 1" is the first pass, which writes rp with up times the first v limb.
- C "am 1" is the pass run for each further v limb, which adds up times that
- C limb into rp starting one limb higher each time.
- C Register use matches the other variants:
- C   eax = up pointer, edx = rp pointer, both advanced 16 bytes per iteration
- C   ecx = negative loop counter, stepped by 4 until it reaches zero
- C   mm7 = current v limb, mm6 = 64-bit accumulator (store low half, shift)
- C -----------------------------------------------------------------------
- C Feed-in for the first pass.  ecx becomes (un mod 4) - un, and that value
- C overwrites the un stack slot so later passes can reload it.
- L(1): movd (%eax), %mm4 C m 1
- sub 24(%esp), %ecx C m 1
- mov %ecx, 24(%esp) C update loop count for later m 1
- pmuludq %mm7, %mm4 C m 1
- movd 4(%eax), %mm3 C m 1
- pmuludq %mm7, %mm3 C m 1
- movd 8(%eax), %mm0 C m 1
- jmp L(m01) C m 1
- ALIGN(16) C m 1
- C First-pass loop.  Products rotate through mm4, mm3, mm0, mm1; each one
- C is loaded and multiplied ahead of the point where it is added to mm6.
- L(lpm1):
- pmuludq %mm7, %mm4 C m 1
- paddq %mm0, %mm6 C m 1
- movd 4(%eax), %mm3 C m 1
- movd %mm6, -8(%edx) C m 1
- psrlq $32, %mm6 C m 1
- pmuludq %mm7, %mm3 C m 1
- paddq %mm1, %mm6 C m 1
- movd 8(%eax), %mm0 C m 1
- movd %mm6, -4(%edx) C m 1
- psrlq $32, %mm6 C m 1
- C Loop entry point for the first pass (reached by the jmp above).
- L(m01): pmuludq %mm7, %mm0 C m 1
- paddq %mm4, %mm6 C m 1
- movd 12(%eax), %mm1 C m 1
- movd %mm6, (%edx) C m 1
- psrlq $32, %mm6 C m 1
- pmuludq %mm7, %mm1 C m 1
- paddq %mm3, %mm6 C m 1
- movd 16(%eax), %mm4 C m 1
- movd %mm6, 4(%edx) C m 1
- psrlq $32, %mm6 C m 1
- lea 16(%eax), %eax C m 1
- lea 16(%edx), %edx C m 1
- add $4, %ecx C m 1
- ja L(lpm1) C m 1
- C Wind-down of the first pass: finish the products still in flight, load
- C the original rp into edi for the outer loop, and join the shared tail.
- pmuludq %mm7, %mm4 C m 1
- paddq %mm0, %mm6 C m 1
- movd %mm6, -8(%edx) C m 1
- psrlq $32, %mm6 C m 1
- paddq %mm1, %mm6 C m 1
- mov 16(%esp), %edi C rp 1
- jmp L(x1)
- C Outer loop, one iteration per remaining v limb: advance the rp base in
- C edi by one limb, fetch the next v limb into mm7, reload up and the loop
- C counter from the stack, and clear the accumulator.
- L(olp1):
- lea 4(%edi), %edi C am 1
- movd (%esi), %mm7 C am 1
- lea 4(%esi), %esi C am 1
- mov %edi, %edx C rp am 1
- mov 20(%esp), %eax C up am 1
- movd (%eax), %mm2 C am 1
- mov 24(%esp), %ecx C inner loop count am 1
- pxor %mm6, %mm6 C am 1
- pmuludq %mm7, %mm2 C am 1
- movd 4(%eax), %mm3 C am 1
- movd (%edx), %mm4 C am 1
- pmuludq %mm7, %mm3 C am 1
- movd 8(%eax), %mm0 C am 1
- paddq %mm2, %mm4 C am 1
- movd 4(%edx), %mm5 C am 1
- jmp L(am01) C am 1
- ALIGN(16) C am 1
- C Add-multiply loop.  Products rotate through mm2, mm3, mm0, mm1; existing
- C rp limbs are loaded alternately into mm4 and mm5, summed with a product,
- C and then folded into the mm6 accumulator.
- L(lam1):
- pmuludq %mm7, %mm2 C am 1
- paddq %mm4, %mm6 C am 1
- movd 4(%eax), %mm3 C am 1
- paddq %mm1, %mm5 C am 1
- movd (%edx), %mm4 C am 1
- movd %mm6, -8(%edx) C am 1
- psrlq $32, %mm6 C am 1
- pmuludq %mm7, %mm3 C am 1
- paddq %mm5, %mm6 C am 1
- movd 8(%eax), %mm0 C am 1
- paddq %mm2, %mm4 C am 1
- movd 4(%edx), %mm5 C am 1
- movd %mm6, -4(%edx) C am 1
- psrlq $32, %mm6 C am 1
- C Loop entry point for the add-multiply pass.
- L(am01):
- pmuludq %mm7, %mm0 C am 1
- paddq %mm4, %mm6 C am 1
- movd 12(%eax), %mm1 C am 1
- paddq %mm3, %mm5 C am 1
- movd 8(%edx), %mm4 C am 1
- movd %mm6, (%edx) C am 1
- psrlq $32, %mm6 C am 1
- pmuludq %mm7, %mm1 C am 1
- paddq %mm5, %mm6 C am 1
- movd 16(%eax), %mm2 C am 1
- paddq %mm0, %mm4 C am 1
- movd 12(%edx), %mm5 C am 1
- movd %mm6, 4(%edx) C am 1
- psrlq $32, %mm6 C am 1
- lea 16(%eax), %eax C am 1
- lea 16(%edx), %edx C am 1
- add $4, %ecx C am 1
- jnz L(lam1) C am 1
- C Wind-down of the add-multiply pass.
- pmuludq %mm7, %mm2 C am 1
- paddq %mm4, %mm6 C am 1
- paddq %mm1, %mm5 C am 1
- movd (%edx), %mm4 C am 1
- movd %mm6, -8(%edx) C am 1
- psrlq $32, %mm6 C am 1
- paddq %mm5, %mm6 C am 1
- paddq %mm2, %mm4 C am 1
- C Tail shared by both passes: store the last two limbs and the final
- C carry-out limb, then count down the remaining v limbs in ebx.
- L(x1): movd %mm6, -4(%edx) C am 1
- psrlq $32, %mm6 C am 1
- paddq %mm4, %mm6 C am 1
- movd %mm6, (%edx) C am 1
- psrlq $32, %mm6 C am 1
- movd %mm6, 4(%edx) C am 1
- dec %ebx C am 1
- jnz L(olp1) C am 1
- C Exit: leave MMX state and restore edi, ebx, esi in reverse push order.
- L(oel1):
- emms C 1
- pop %edi C 1
- pop %ebx C 1
- pop %esi C 1
- ret C 1
- C -----------------------------------------------------------------------
- C Variant for un mod 4 == 2.
- C "m 2" is the first pass, which writes rp with up times the first v limb.
- C "am 2" is the pass run for each further v limb, which adds up times that
- C limb into rp starting one limb higher each time.
- C Register use matches the other variants:
- C   eax = up pointer, edx = rp pointer, both advanced 16 bytes per iteration
- C   ecx = negative loop counter, stepped by 4 until it reaches zero
- C   mm7 = current v limb, mm6 = 64-bit accumulator (store low half, shift)
- C -----------------------------------------------------------------------
- C Feed-in for the first pass.  ecx becomes (un mod 4) - un, and that value
- C overwrites the un stack slot so later passes can reload it.
- L(2): movd (%eax), %mm1 C m 2
- sub 24(%esp), %ecx C m 2
- mov %ecx, 24(%esp) C update loop count for later m 2
- pmuludq %mm7, %mm1 C m 2
- movd 4(%eax), %mm4 C m 2
- pmuludq %mm7, %mm4 C m 2
- movd 8(%eax), %mm3 C m 2
- jmp L(m10) C m 2
- ALIGN(16) C m 2
- C First-pass loop.  Products rotate through mm1, mm4, mm3, mm0; each one
- C is loaded and multiplied ahead of the point where it is added to mm6.
- L(lpm2):
- pmuludq %mm7, %mm4 C m 2
- paddq %mm0, %mm6 C m 2
- movd 8(%eax), %mm3 C m 2
- movd %mm6, -4(%edx) C m 2
- psrlq $32, %mm6 C m 2
- C Loop entry point for the first pass (reached by the jmp above).
- L(m10): pmuludq %mm7, %mm3 C m 2
- paddq %mm1, %mm6 C m 2
- movd 12(%eax), %mm0 C m 2
- movd %mm6, (%edx) C m 2
- psrlq $32, %mm6 C m 2
- pmuludq %mm7, %mm0 C m 2
- paddq %mm4, %mm6 C m 2
- movd 16(%eax), %mm1 C m 2
- movd %mm6, 4(%edx) C m 2
- psrlq $32, %mm6 C m 2
- pmuludq %mm7, %mm1 C m 2
- paddq %mm3, %mm6 C m 2
- movd 20(%eax), %mm4 C m 2
- movd %mm6, 8(%edx) C m 2
- psrlq $32, %mm6 C m 2
- lea 16(%eax), %eax C m 2
- lea 16(%edx), %edx C m 2
- add $4, %ecx C m 2
- ja L(lpm2) C m 2
- C Wind-down of the first pass: finish the products still in flight, load
- C the original rp into edi for the outer loop, and join the shared tail.
- pmuludq %mm7, %mm4 C m 2
- paddq %mm0, %mm6 C m 2
- movd %mm6, -4(%edx) C m 2
- psrlq $32, %mm6 C m 2
- paddq %mm1, %mm6 C m 2
- mov 16(%esp), %edi C rp 2
- jmp L(x2)
- C Outer loop, one iteration per remaining v limb: advance the rp base in
- C edi by one limb, fetch the next v limb into mm7, reload up and the loop
- C counter from the stack, and clear the accumulator.
- L(olp2):
- lea 4(%edi), %edi C am 2
- movd (%esi), %mm7 C am 2
- lea 4(%esi), %esi C am 2
- mov %edi, %edx C rp am 2
- mov 20(%esp), %eax C up am 2
- movd (%eax), %mm1 C am 2
- mov 24(%esp), %ecx C inner loop count am 2
- pxor %mm6, %mm6 C am 2
- pmuludq %mm7, %mm1 C am 2
- movd 4(%eax), %mm2 C am 2
- movd (%edx), %mm5 C am 2
- pmuludq %mm7, %mm2 C am 2
- movd 8(%eax), %mm3 C am 2
- paddq %mm1, %mm5 C am 2
- movd 4(%edx), %mm4 C am 2
- jmp L(am10) C am 2
- ALIGN(16) C am 2
- C Add-multiply loop.  Products rotate through mm1, mm2, mm3, mm0; existing
- C rp limbs are loaded alternately into mm5 and mm4, summed with a product,
- C and then folded into the mm6 accumulator.
- L(lam2):
- pmuludq %mm7, %mm2 C am 2
- paddq %mm4, %mm6 C am 2
- movd 8(%eax), %mm3 C am 2
- paddq %mm1, %mm5 C am 2
- movd 4(%edx), %mm4 C am 2
- movd %mm6, -4(%edx) C am 2
- psrlq $32, %mm6 C am 2
- C Loop entry point for the add-multiply pass.
- L(am10):
- pmuludq %mm7, %mm3 C am 2
- paddq %mm5, %mm6 C am 2
- movd 12(%eax), %mm0 C am 2
- paddq %mm2, %mm4 C am 2
- movd 8(%edx), %mm5 C am 2
- movd %mm6, (%edx) C am 2
- psrlq $32, %mm6 C am 2
- pmuludq %mm7, %mm0 C am 2
- paddq %mm4, %mm6 C am 2
- movd 16(%eax), %mm1 C am 2
- paddq %mm3, %mm5 C am 2
- movd 12(%edx), %mm4 C am 2
- movd %mm6, 4(%edx) C am 2
- psrlq $32, %mm6 C am 2
- pmuludq %mm7, %mm1 C am 2
- paddq %mm5, %mm6 C am 2
- movd 20(%eax), %mm2 C am 2
- paddq %mm0, %mm4 C am 2
- movd 16(%edx), %mm5 C am 2
- movd %mm6, 8(%edx) C am 2
- psrlq $32, %mm6 C am 2
- lea 16(%eax), %eax C am 2
- lea 16(%edx), %edx C am 2
- add $4, %ecx C am 2
- jnz L(lam2) C am 2
- C Wind-down of the add-multiply pass.
- pmuludq %mm7, %mm2 C am 2
- paddq %mm4, %mm6 C am 2
- paddq %mm1, %mm5 C am 2
- movd 4(%edx), %mm4 C am 2
- movd %mm6, -4(%edx) C am 2
- psrlq $32, %mm6 C am 2
- paddq %mm5, %mm6 C am 2
- paddq %mm2, %mm4 C am 2
- C Tail shared by both passes: store the last two limbs and the final
- C carry-out limb, then count down the remaining v limbs in ebx.
- L(x2): movd %mm6, (%edx) C am 2
- psrlq $32, %mm6 C am 2
- paddq %mm4, %mm6 C am 2
- movd %mm6, 4(%edx) C am 2
- psrlq $32, %mm6 C am 2
- movd %mm6, 8(%edx) C am 2
- dec %ebx C am 2
- jnz L(olp2) C am 2
- C Exit: leave MMX state and restore edi, ebx, esi in reverse push order.
- L(oel2):
- emms C 2
- pop %edi C 2
- pop %ebx C 2
- pop %esi C 2
- ret C 2
- C -----------------------------------------------------------------------
- C Variant for un mod 4 == 3.
- C "m 3" is the first pass, which writes rp with up times the first v limb.
- C "am 3" is the pass run for each further v limb, which adds up times that
- C limb into rp starting one limb higher each time.
- C Register use matches the other variants:
- C   eax = up pointer, edx = rp pointer, both advanced 16 bytes per iteration
- C   ecx = negative loop counter, stepped by 4 until it reaches zero
- C   mm7 = current v limb, mm6 = 64-bit accumulator (store low half, shift)
- C In this variant both loops are entered at their top label; the jmp in
- C each feed-in only skips over whatever ALIGN(16) emits.
- C -----------------------------------------------------------------------
- C Feed-in for the first pass.  ecx becomes (un mod 4) - un, and that value
- C overwrites the un stack slot so later passes can reload it.
- L(3): movd (%eax), %mm0 C m 3
- sub 24(%esp), %ecx C m 3
- mov %ecx, 24(%esp) C update loop count for later m 3
- pmuludq %mm7, %mm0 C m 3
- movd 4(%eax), %mm1 C m 3
- pmuludq %mm7, %mm1 C m 3
- movd 8(%eax), %mm4 C m 3
- jmp L(lpm3) C m 3
- ALIGN(16) C m 3
- C First-pass loop.  Products rotate through mm0, mm1, mm4, mm3; each one
- C is loaded and multiplied ahead of the point where it is added to mm6.
- L(lpm3):
- pmuludq %mm7, %mm4 C m 3
- paddq %mm0, %mm6 C m 3
- movd 12(%eax), %mm3 C m 3
- movd %mm6, (%edx) C m 3
- psrlq $32, %mm6 C m 3
- pmuludq %mm7, %mm3 C m 3
- paddq %mm1, %mm6 C m 3
- movd 16(%eax), %mm0 C m 3
- movd %mm6, 4(%edx) C m 3
- psrlq $32, %mm6 C m 3
- pmuludq %mm7, %mm0 C m 3
- paddq %mm4, %mm6 C m 3
- movd 20(%eax), %mm1 C m 3
- movd %mm6, 8(%edx) C m 3
- psrlq $32, %mm6 C m 3
- pmuludq %mm7, %mm1 C m 3
- paddq %mm3, %mm6 C m 3
- movd 24(%eax), %mm4 C m 3
- movd %mm6, 12(%edx) C m 3
- psrlq $32, %mm6 C m 3
- lea 16(%eax), %eax C m 3
- lea 16(%edx), %edx C m 3
- add $4, %ecx C m 3
- ja L(lpm3) C m 3
- C Wind-down of the first pass: finish the products still in flight, load
- C the original rp into edi for the outer loop, and join the shared tail.
- pmuludq %mm7, %mm4 C m 3
- paddq %mm0, %mm6 C m 3
- movd %mm6, (%edx) C m 3
- psrlq $32, %mm6 C m 3
- paddq %mm1, %mm6 C m 3
- mov 16(%esp), %edi C rp 3
- jmp L(x3)
- C Outer loop, one iteration per remaining v limb: advance the rp base in
- C edi by one limb, fetch the next v limb into mm7, reload up and the loop
- C counter from the stack, and clear the accumulator.
- L(olp3):
- lea 4(%edi), %edi C am 3
- movd (%esi), %mm7 C am 3
- lea 4(%esi), %esi C am 3
- mov %edi, %edx C rp am 3
- mov 20(%esp), %eax C up am 3
- movd (%eax), %mm0 C am 3
- mov 24(%esp), %ecx C inner loop count am 3
- pxor %mm6, %mm6 C am 3
- pmuludq %mm7, %mm0 C am 3
- movd 4(%eax), %mm1 C am 3
- movd (%edx), %mm4 C am 3
- pmuludq %mm7, %mm1 C am 3
- movd 8(%eax), %mm2 C am 3
- paddq %mm0, %mm4 C am 3
- movd 4(%edx), %mm5 C am 3
- jmp L(lam3) C am 3
- ALIGN(16) C am 3
- C Add-multiply loop.  Products rotate through mm0, mm1, mm2, mm3; existing
- C rp limbs are loaded alternately into mm4 and mm5, summed with a product,
- C and then folded into the mm6 accumulator.
- L(lam3):
- pmuludq %mm7, %mm2 C am 3
- paddq %mm4, %mm6 C am 3
- movd 12(%eax), %mm3 C am 3
- paddq %mm1, %mm5 C am 3
- movd 8(%edx), %mm4 C am 3
- movd %mm6, (%edx) C am 3
- psrlq $32, %mm6 C am 3
- pmuludq %mm7, %mm3 C am 3
- paddq %mm5, %mm6 C am 3
- movd 16(%eax), %mm0 C am 3
- paddq %mm2, %mm4 C am 3
- movd 12(%edx), %mm5 C am 3
- movd %mm6, 4(%edx) C am 3
- psrlq $32, %mm6 C am 3
- pmuludq %mm7, %mm0 C am 3
- paddq %mm4, %mm6 C am 3
- movd 20(%eax), %mm1 C am 3
- paddq %mm3, %mm5 C am 3
- movd 16(%edx), %mm4 C am 3
- movd %mm6, 8(%edx) C am 3
- psrlq $32, %mm6 C am 3
- pmuludq %mm7, %mm1 C am 3
- paddq %mm5, %mm6 C am 3
- movd 24(%eax), %mm2 C am 3
- paddq %mm0, %mm4 C am 3
- movd 20(%edx), %mm5 C am 3
- movd %mm6, 12(%edx) C am 3
- psrlq $32, %mm6 C am 3
- lea 16(%eax), %eax C am 3
- lea 16(%edx), %edx C am 3
- add $4, %ecx C am 3
- jnz L(lam3) C am 3
- C Wind-down of the add-multiply pass.
- pmuludq %mm7, %mm2 C am 3
- paddq %mm4, %mm6 C am 3
- paddq %mm1, %mm5 C am 3
- movd 8(%edx), %mm4 C am 3
- movd %mm6, (%edx) C am 3
- psrlq $32, %mm6 C am 3
- paddq %mm5, %mm6 C am 3
- paddq %mm2, %mm4 C am 3
- C Tail shared by both passes: store the last two limbs and the final
- C carry-out limb, then count down the remaining v limbs in ebx.
- L(x3): movd %mm6, 4(%edx) C am 3
- psrlq $32, %mm6 C am 3
- paddq %mm4, %mm6 C am 3
- movd %mm6, 8(%edx) C am 3
- psrlq $32, %mm6 C am 3
- movd %mm6, 12(%edx) C am 3
- dec %ebx C am 3
- jnz L(olp3) C am 3
- C Exit: leave MMX state and restore edi, ebx, esi in reverse push order.
- L(oel3):
- emms C 3
- pop %edi C 3
- pop %ebx C 3
- pop %esi C 3
- ret C 3
- EPILOGUE()