popcount.asm
dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')
C 32-bit                        popcount      hamdist
C                              cycles/limb   cycles/limb
C P5:                              -
C P6 model 0-8,10-12               -
C P6 model 9  (Banias)             ?
C P6 model 13 (Dothan)             4
C P4 model 0  (Willamette)         ?
C P4 model 1  (?)                  ?
C P4 model 2  (Northwood)          3.9
C P4 model 3  (Prescott)           ?
C P4 model 4  (Nocona)             ?
C K6:                              -
C K7:                              -
C K8:                              ?
C
C 64-bit                        popcount      hamdist
C                              cycles/limb   cycles/limb
C P4 model 4 (Nocona):             8
C K8:                              7.5
C K10:                             3.5
C P6 core2:                        3.68
C P6 corei7:                       3.15
C TODO
C  * Make an mpn_hamdist based on this (see the sketch after this list).
C    Alignment could either be handled by using movdqu for one operand and
C    movdqa for the other, or by painfully shifting as we go.  Unfortunately,
C    there seems to be no usable shift instruction, except for one that takes
C    an immediate count.
C  * It would probably be possible to cut a few cycles/limb using software
C    pipelining.
C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
C    integer based popcount.  Such a combined loop would handle 6 limbs in
C    about 30 cycles on K8.
C  * We could save a byte or two by using 32-bit operations on areg.
C  * Check if using movdqa to a temp and then a register-based pand is faster.
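C
C The mpn_hamdist item above amounts to computing popcount(x XOR y) per
C 16-byte chunk.  A minimal C sketch of the movdqu/movdqa idea (the helper
C name is hypothetical, not a GMP API; the XOR result would then go through
C the same bit-slicing reduction as the loop below):
C
C   #include <emmintrin.h>
C
C   /* a is assumed 16-byte aligned (movdqa); b may be unaligned (movdqu) */
C   static __m128i hamdist_chunk(const void *a, const void *b)
C   {
C       __m128i x = _mm_load_si128((const __m128i *) a);
C       __m128i y = _mm_loadu_si128((const __m128i *) b);
C       return _mm_xor_si128(x, y);   /* popcount of this gives hamdist */
C   }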
ifelse(GMP_LIMB_BITS,`32',
`	define(`up',  `%edx')
	define(`n',   `%ecx')
	define(`areg',`%eax')
	define(`breg',`%ebx')
	define(`zero',`%xmm4')
	define(`LIMB32',` $1')
	define(`LIMB64',`dnl')
',`
	define(`up',  `%rdi')
	define(`n',   `%rsi')
	define(`areg',`%rax')
	define(`breg',`%rdx')
	define(`zero',`%xmm8')
	define(`LIMB32',`dnl')
	define(`LIMB64',` $1')
')

define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
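
C For reference: with 32-bit limbs, GMP_LIMB_BYTES = 4, LIMBS_PER_XMM = 4
C and LIMBS_PER_2XMM = 8; with 64-bit limbs they are 8, 2 and 4.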
undefine(`psadbw')			C override inherited m4 version

ASM_START()

C Make cnsts global to work around Apple relocation bug.
ifdef(`DARWIN',`
	define(`cnsts', MPN(popccnsts))
	GLOBL	cnsts')

	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)

LIMB32(`mov	4(%esp), up	')
LIMB32(`mov	8(%esp), n	')
LIMB32(`push	%ebx		')

	pxor	%xmm3, %xmm3		C zero grand total count
LIMB64(`pxor	zero, zero	')
ifdef(`PIC',`
	LEA(	cnsts, breg)
',`
LIMB32(`mov	$cnsts, breg	')
LIMB64(`movabs	$cnsts, breg	')
')
	movdqa	-48(breg), mm01010101
	movdqa	-32(breg), mm00110011
	movdqa	-16(breg), mm00001111

	mov	up, areg
	and	$-16, up		C round `up' down to 128-bit boundary
	and	$12, areg		C 32:areg = 0, 4, 8, 12
					C 64:areg = 0, 8
	movdqa	(up), %xmm0
	pand	64(breg,areg,4), %xmm0
	shr	$m4_log2(GMP_LIMB_BYTES), %eax
	add	areg, n			C compensate n for rounded down `up'
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)

	sub	$LIMBS_PER_XMM, n
	ja	L(ent)
	jmp	L(lsum)
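
C The loop below is the classic SSE2 bit-slicing popcount: 1-bit fields are
C folded into 2-bit, 4-bit and then 8-bit sums using the 0x55/0x33/0x0f
C masks, after which psadbw sums the 16 byte counts into two 64-bit halves.
C A C-intrinsics sketch of what one 16-byte chunk contributes (an
C illustration, not the GMP code; _mm_cvtsi128_si64 assumes x86-64):
C
C   #include <emmintrin.h>
C   #include <stdint.h>
C
C   static uint64_t popcount_chunk(__m128i x)
C   {
C       const __m128i c55 = _mm_set1_epi8(0x55);
C       const __m128i c33 = _mm_set1_epi8(0x33);
C       const __m128i c0f = _mm_set1_epi8(0x0f);
C       /* 1-bit fields -> 2-bit sums: x - ((x >> 1) & 0x55..) */
C       x = _mm_sub_epi32(x, _mm_and_si128(_mm_srli_epi32(x, 1), c55));
C       /* 2-bit sums -> 4-bit sums */
C       x = _mm_add_epi64(_mm_and_si128(x, c33),
C                         _mm_and_si128(_mm_srli_epi64(x, 2), c33));
C       /* 4-bit sums -> 8-bit sums (per-byte popcounts) */
C       x = _mm_add_epi64(_mm_and_si128(x, c0f),
C                         _mm_and_si128(_mm_srli_epi64(x, 4), c0f));
C       /* psadbw against zero: sum the 16 bytes into two 64-bit halves */
C       x = _mm_sad_epu8(x, _mm_setzero_si128());
C       return (uint64_t) _mm_cvtsi128_si64(x)
C            + (uint64_t) _mm_cvtsi128_si64(_mm_srli_si128(x, 8));
C   }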
	ALIGN(16)
L(top):	movdqa	(up), %xmm0
L(ent):	movdqa	16(up), %xmm4

	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	add	$32, up
	sub	$LIMBS_PER_2XMM, n

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total
	jnc	L(top)
L(end):
	add	$LIMBS_PER_2XMM, n
	jz	L(rt)
	movdqa	(up), %xmm0
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)
L(lsum):
	movdqa	%xmm0, %xmm4
	movdqa	16(up), %xmm0
L(sum):
	shl	$m4_log2(GMP_LIMB_BYTES), n
	and	$12, n
	pand	(breg,n,4), %xmm0

	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total

C Add the two 64-bit halves of the grand total counter
L(rt):	movdqa	%xmm3, %xmm0
	psrldq	$8, %xmm3
	paddq	%xmm3, %xmm0
	movd	%xmm0, areg		C movq avoided due to gas bug

LIMB32(`pop	%ebx		')
	ret
EPILOGUE()
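
C From C, this routine is reached as mpn_popcount (declared in gmp.h,
C taking a limb pointer and a limb count).  A minimal caller:
C
C   #include <gmp.h>
C   #include <stdio.h>
C
C   int main(void)
C   {
C       mp_limb_t limbs[2] = { 0xff, 0xf0f0 };   /* 8 + 8 set bits */
C       printf("%lu\n", (unsigned long) mpn_popcount(limbs, 2));  /* 16 */
C       return 0;
C   }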
DEF_OBJECT(dummy,16)
C Three magic constants used for masking out bits
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
cnsts:
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(dummy)
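
C The mask tables above are addressed with a scale of 4 from the byte
C offset held in areg (first, possibly misaligned chunk) or n (last,
C possibly partial chunk); each mask row is 16 bytes, so offsets step by
C 16.  A C sketch of the selection, with hypothetical names:
C
C   /* off, rem are byte offsets within a 16-byte block: 0, 4, 8 or 12 */
C   const unsigned char *low_end_mask  = cnsts + 64 + 4 * off;  /* pand 64(breg,areg,4) */
C   const unsigned char *high_end_mask = cnsts +      4 * rem;  /* pand (breg,n,4)      */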