sad-a.asm
- ;*****************************************************************************
- ;* sad-a.asm: h264 encoder library
- ;*****************************************************************************
- ;* Copyright (C) 2003-2008 x264 project
- ;*
- ;* Authors: Loren Merritt <lorenm@u.washington.edu>
- ;* Laurent Aimar <fenrir@via.ecp.fr>
- ;* Alex Izvorski <aizvorski@gmail.com>
- ;*
- ;* This program is free software; you can redistribute it and/or modify
- ;* it under the terms of the GNU General Public License as published by
- ;* the Free Software Foundation; either version 2 of the License, or
- ;* (at your option) any later version.
- ;*
- ;* This program is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ;* GNU General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU General Public License
- ;* along with this program; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- ;*****************************************************************************
- %include "x86inc.asm"
- %include "x86util.asm"
- SECTION_RODATA
- pb_3: times 16 db 3
- sw_64: dd 64
- SECTION .text
- ;=============================================================================
- ; SAD MMX
- ;=============================================================================
- %macro SAD_INC_2x16P 0
- movq mm1, [r0]
- movq mm2, [r0+8]
- movq mm3, [r0+r1]
- movq mm4, [r0+r1+8]
- psadbw mm1, [r2]
- psadbw mm2, [r2+8]
- psadbw mm3, [r2+r3]
- psadbw mm4, [r2+r3+8]
- lea r0, [r0+2*r1]
- paddw mm1, mm2
- paddw mm3, mm4
- lea r2, [r2+2*r3]
- paddw mm0, mm1
- paddw mm0, mm3
- %endmacro
- %macro SAD_INC_2x8P 0
- movq mm1, [r0]
- movq mm2, [r0+r1]
- psadbw mm1, [r2]
- psadbw mm2, [r2+r3]
- lea r0, [r0+2*r1]
- paddw mm0, mm1
- paddw mm0, mm2
- lea r2, [r2+2*r3]
- %endmacro
- %macro SAD_INC_2x4P 0
- movd mm1, [r0]
- movd mm2, [r2]
- punpckldq mm1, [r0+r1]
- punpckldq mm2, [r2+r3]
- psadbw mm1, mm2
- paddw mm0, mm1
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- %endmacro
- ;-----------------------------------------------------------------------------
- ; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
- ;-----------------------------------------------------------------------------
- %macro SAD 2
- cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
- pxor mm0, mm0
- %rep %2/2
- SAD_INC_2x%1P
- %endrep
- movd eax, mm0
- RET
- %endmacro
- SAD 16, 16
- SAD 16, 8
- SAD 8, 16
- SAD 8, 8
- SAD 8, 4
- SAD 4, 8
- SAD 4, 4
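- ; For reference, a plain-C model of what the SAD kernels above compute (a
- ; readability sketch, not x264's actual C fallback; needs <stdint.h> and
- ; <stdlib.h> for abs):
- ;
- ;     static int sad_wxh( uint8_t *pix1, int stride1,
- ;                         uint8_t *pix2, int stride2, int w, int h )
- ;     {
- ;         int sum = 0;
- ;         for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
- ;             for( int x = 0; x < w; x++ )
- ;                 sum += abs( pix1[x] - pix2[x] );     /* psadbw does 8/16 of these at once */
- ;         return sum;
- ;     }
- ;
- ;     /* e.g. x264_pixel_sad_8x4_mmxext(p1,s1,p2,s2) corresponds to sad_wxh(p1,s1,p2,s2,8,4) */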
- ;=============================================================================
- ; SAD XMM
- ;=============================================================================
- %macro SAD_END_SSE2 0
- movhlps xmm1, xmm0
- paddw xmm0, xmm1
- movd eax, xmm0
- RET
- %endmacro
- %macro SAD_W16 1
- ;-----------------------------------------------------------------------------
- ; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
- ;-----------------------------------------------------------------------------
- cglobal x264_pixel_sad_16x16_%1, 4,4
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- lea r2, [r2+2*r3]
- movdqu xmm2, [r2]
- movdqu xmm3, [r2+r3]
- lea r2, [r2+2*r3]
- psadbw xmm0, [r0]
- psadbw xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm4, [r2]
- paddw xmm0, xmm1
- psadbw xmm2, [r0]
- psadbw xmm3, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm5, [r2+r3]
- lea r2, [r2+2*r3]
- paddw xmm2, xmm3
- movdqu xmm6, [r2]
- movdqu xmm7, [r2+r3]
- lea r2, [r2+2*r3]
- paddw xmm0, xmm2
- psadbw xmm4, [r0]
- psadbw xmm5, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm1, [r2]
- paddw xmm4, xmm5
- psadbw xmm6, [r0]
- psadbw xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm2, [r2+r3]
- lea r2, [r2+2*r3]
- paddw xmm6, xmm7
- movdqu xmm3, [r2]
- paddw xmm0, xmm4
- movdqu xmm4, [r2+r3]
- lea r2, [r2+2*r3]
- paddw xmm0, xmm6
- psadbw xmm1, [r0]
- psadbw xmm2, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm5, [r2]
- paddw xmm1, xmm2
- psadbw xmm3, [r0]
- psadbw xmm4, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm6, [r2+r3]
- lea r2, [r2+2*r3]
- paddw xmm3, xmm4
- movdqu xmm7, [r2]
- paddw xmm0, xmm1
- movdqu xmm1, [r2+r3]
- paddw xmm0, xmm3
- psadbw xmm5, [r0]
- psadbw xmm6, [r0+r1]
- lea r0, [r0+2*r1]
- paddw xmm5, xmm6
- psadbw xmm7, [r0]
- psadbw xmm1, [r0+r1]
- paddw xmm7, xmm1
- paddw xmm0, xmm5
- paddw xmm0, xmm7
- SAD_END_SSE2
- ;-----------------------------------------------------------------------------
- ; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
- ;-----------------------------------------------------------------------------
- cglobal x264_pixel_sad_16x8_%1, 4,4
- movdqu xmm0, [r2]
- movdqu xmm2, [r2+r3]
- lea r2, [r2+2*r3]
- movdqu xmm3, [r2]
- movdqu xmm4, [r2+r3]
- psadbw xmm0, [r0]
- psadbw xmm2, [r0+r1]
- lea r0, [r0+2*r1]
- psadbw xmm3, [r0]
- psadbw xmm4, [r0+r1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- paddw xmm0, xmm2
- paddw xmm3, xmm4
- paddw xmm0, xmm3
- movdqu xmm1, [r2]
- movdqu xmm2, [r2+r3]
- lea r2, [r2+2*r3]
- movdqu xmm3, [r2]
- movdqu xmm4, [r2+r3]
- psadbw xmm1, [r0]
- psadbw xmm2, [r0+r1]
- lea r0, [r0+2*r1]
- psadbw xmm3, [r0]
- psadbw xmm4, [r0+r1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- paddw xmm1, xmm2
- paddw xmm3, xmm4
- paddw xmm0, xmm1
- paddw xmm0, xmm3
- SAD_END_SSE2
- %endmacro
- SAD_W16 sse2
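- ; The remaining variants are generated by re-expanding SAD_W16 with movdqu
- ; aliased to other load instructions: lddqu for sse3 and movdqa for the
- ; aligned version. The final movups alias restores unaligned loads for the
- ; macros further down.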
- %define movdqu lddqu
- SAD_W16 sse3
- %define movdqu movdqa
- SAD_W16 sse2_aligned
- %define movdqu movups
- ;-----------------------------------------------------------------------------
- ; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
- ;-----------------------------------------------------------------------------
- ;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
- ;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
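- ; Roughly equivalent C (a sketch; the result order res[0]=V, res[1]=H,
- ; res[2]=DC and the use of FENC_STRIDE/FDEC_STRIDE follow from the code below):
- ;
- ;     void intra_sad_x3_16x16_c( uint8_t *fenc, uint8_t *fdec, int res[3] )
- ;     {
- ;         int dc = 16;                                     /* rounding term */
- ;         for( int i = 0; i < 16; i++ )
- ;             dc += fdec[i-FDEC_STRIDE] + fdec[i*FDEC_STRIDE-1];
- ;         dc >>= 5;                                        /* average of the 32 neighbours */
- ;         res[0] = res[1] = res[2] = 0;
- ;         for( int y = 0; y < 16; y++ )
- ;             for( int x = 0; x < 16; x++ )
- ;             {
- ;                 int p = fenc[y*FENC_STRIDE+x];
- ;                 res[0] += abs( p - fdec[x-FDEC_STRIDE] );   /* V: top row     */
- ;                 res[1] += abs( p - fdec[y*FDEC_STRIDE-1] ); /* H: left column */
- ;                 res[2] += abs( p - dc );                    /* DC             */
- ;             }
- ;     }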
- %macro INTRA_SAD16 1
- cglobal x264_intra_sad_x3_16x16_%1,3,5
- pxor mm0, mm0
- pxor mm1, mm1
- psadbw mm0, [r1-FDEC_STRIDE+0]
- psadbw mm1, [r1-FDEC_STRIDE+8]
- paddw mm0, mm1
- movd r3d, mm0
- %ifidn %1, ssse3
- mova m1, [pb_3 GLOBAL]
- %endif
- %assign n 0
- %rep 16
- movzx r4d, byte [r1-1+FDEC_STRIDE*n]
- add r3d, r4d
- %assign n n+1
- %endrep
- add r3d, 16
- shr r3d, 5
- imul r3d, 0x01010101
- movd m7, r3d
- mova m5, [r1-FDEC_STRIDE]
- %if mmsize==16
- pshufd m7, m7, 0
- %else
- mova m1, [r1-FDEC_STRIDE+8]
- punpckldq m7, m7
- %endif
- pxor m4, m4
- pxor m3, m3
- pxor m2, m2
- mov r3d, 15*FENC_STRIDE
- .vloop:
- SPLATB m6, r1+r3*2-1, m1
- mova m0, [r0+r3]
- psadbw m0, m7
- paddw m4, m0
- mova m0, [r0+r3]
- psadbw m0, m5
- paddw m2, m0
- %if mmsize==8
- mova m0, [r0+r3]
- psadbw m0, m6
- paddw m3, m0
- mova m0, [r0+r3+8]
- psadbw m0, m7
- paddw m4, m0
- mova m0, [r0+r3+8]
- psadbw m0, m1
- paddw m2, m0
- psadbw m6, [r0+r3+8]
- paddw m3, m6
- %else
- psadbw m6, [r0+r3]
- paddw m3, m6
- %endif
- add r3d, -FENC_STRIDE
- jge .vloop
- %if mmsize==16
- pslldq m3, 4
- por m3, m2
- movhlps m1, m3
- paddw m3, m1
- movq [r2+0], m3
- movhlps m1, m4
- paddw m4, m1
- %else
- movd [r2+0], m2
- movd [r2+4], m3
- %endif
- movd [r2+8], m4
- RET
- %endmacro
- INIT_MMX
- %define SPLATB SPLATB_MMX
- INTRA_SAD16 mmxext
- INIT_XMM
- INTRA_SAD16 sse2
- %define SPLATB SPLATB_SSSE3
- INTRA_SAD16 ssse3
- ;=============================================================================
- ; SAD x3/x4 MMX
- ;=============================================================================
- %macro SAD_X3_START_1x8P 0
- movq mm3, [r0]
- movq mm0, [r1]
- movq mm1, [r2]
- movq mm2, [r3]
- psadbw mm0, mm3
- psadbw mm1, mm3
- psadbw mm2, mm3
- %endmacro
- %macro SAD_X3_1x8P 2
- movq mm3, [r0+%1]
- movq mm4, [r1+%2]
- movq mm5, [r2+%2]
- movq mm6, [r3+%2]
- psadbw mm4, mm3
- psadbw mm5, mm3
- psadbw mm6, mm3
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
- %endmacro
- %macro SAD_X3_START_2x4P 3
- movd mm3, [r0]
- movd %1, [r1]
- movd %2, [r2]
- movd %3, [r3]
- punpckldq mm3, [r0+FENC_STRIDE]
- punpckldq %1, [r1+r4]
- punpckldq %2, [r2+r4]
- punpckldq %3, [r3+r4]
- psadbw %1, mm3
- psadbw %2, mm3
- psadbw %3, mm3
- %endmacro
- %macro SAD_X3_2x16P 1
- %if %1
- SAD_X3_START_1x8P
- %else
- SAD_X3_1x8P 0, 0
- %endif
- SAD_X3_1x8P 8, 8
- SAD_X3_1x8P FENC_STRIDE, r4
- SAD_X3_1x8P FENC_STRIDE+8, r4+8
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
- %endmacro
- %macro SAD_X3_2x8P 1
- %if %1
- SAD_X3_START_1x8P
- %else
- SAD_X3_1x8P 0, 0
- %endif
- SAD_X3_1x8P FENC_STRIDE, r4
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
- %endmacro
- %macro SAD_X3_2x4P 1
- %if %1
- SAD_X3_START_2x4P mm0, mm1, mm2
- %else
- SAD_X3_START_2x4P mm4, mm5, mm6
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
- %endif
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
- %endmacro
- %macro SAD_X4_START_1x8P 0
- movq mm7, [r0]
- movq mm0, [r1]
- movq mm1, [r2]
- movq mm2, [r3]
- movq mm3, [r4]
- psadbw mm0, mm7
- psadbw mm1, mm7
- psadbw mm2, mm7
- psadbw mm3, mm7
- %endmacro
- %macro SAD_X4_1x8P 2
- movq mm7, [r0+%1]
- movq mm4, [r1+%2]
- movq mm5, [r2+%2]
- movq mm6, [r3+%2]
- psadbw mm4, mm7
- psadbw mm5, mm7
- psadbw mm6, mm7
- psadbw mm7, [r4+%2]
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
- paddw mm3, mm7
- %endmacro
- %macro SAD_X4_START_2x4P 0
- movd mm7, [r0]
- movd mm0, [r1]
- movd mm1, [r2]
- movd mm2, [r3]
- movd mm3, [r4]
- punpckldq mm7, [r0+FENC_STRIDE]
- punpckldq mm0, [r1+r5]
- punpckldq mm1, [r2+r5]
- punpckldq mm2, [r3+r5]
- punpckldq mm3, [r4+r5]
- psadbw mm0, mm7
- psadbw mm1, mm7
- psadbw mm2, mm7
- psadbw mm3, mm7
- %endmacro
- %macro SAD_X4_INC_2x4P 0
- movd mm7, [r0]
- movd mm4, [r1]
- movd mm5, [r2]
- punpckldq mm7, [r0+FENC_STRIDE]
- punpckldq mm4, [r1+r5]
- punpckldq mm5, [r2+r5]
- psadbw mm4, mm7
- psadbw mm5, mm7
- paddw mm0, mm4
- paddw mm1, mm5
- movd mm4, [r3]
- movd mm5, [r4]
- punpckldq mm4, [r3+r5]
- punpckldq mm5, [r4+r5]
- psadbw mm4, mm7
- psadbw mm5, mm7
- paddw mm2, mm4
- paddw mm3, mm5
- %endmacro
- %macro SAD_X4_2x16P 1
- %if %1
- SAD_X4_START_1x8P
- %else
- SAD_X4_1x8P 0, 0
- %endif
- SAD_X4_1x8P 8, 8
- SAD_X4_1x8P FENC_STRIDE, r5
- SAD_X4_1x8P FENC_STRIDE+8, r5+8
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
- %endmacro
- %macro SAD_X4_2x8P 1
- %if %1
- SAD_X4_START_1x8P
- %else
- SAD_X4_1x8P 0, 0
- %endif
- SAD_X4_1x8P FENC_STRIDE, r5
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
- %endmacro
- %macro SAD_X4_2x4P 1
- %if %1
- SAD_X4_START_2x4P
- %else
- SAD_X4_INC_2x4P
- %endif
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
- %endmacro
- %macro SAD_X3_END 0
- %ifdef ARCH_X86_64
- movd [r5+0], mm0
- movd [r5+4], mm1
- movd [r5+8], mm2
- %else
- mov r0, r5m
- movd [r0+0], mm0
- movd [r0+4], mm1
- movd [r0+8], mm2
- %endif
- RET
- %endmacro
- %macro SAD_X4_END 0
- mov r0, r6m
- movd [r0+0], mm0
- movd [r0+4], mm1
- movd [r0+8], mm2
- movd [r0+12], mm3
- RET
- %endmacro
- ;-----------------------------------------------------------------------------
- ; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
- ; uint8_t *pix2, int i_stride, int scores[3] )
- ;-----------------------------------------------------------------------------
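- ; Roughly equivalent C (a sketch, reusing sad_wxh from the comment earlier in
- ; this file): the fenc block always uses the constant FENC_STRIDE, while the
- ; candidate references share i_stride; sad_x4 is identical with a fourth
- ; reference and scores[4].
- ;
- ;     void sad_x3_wxh( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,
- ;                      int i_stride, int scores[3], int w, int h )
- ;     {
- ;         uint8_t *pix[3] = { pix0, pix1, pix2 };
- ;         for( int i = 0; i < 3; i++ )
- ;             scores[i] = sad_wxh( fenc, FENC_STRIDE, pix[i], i_stride, w, h );
- ;     }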
- %macro SAD_X 3
- cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
- SAD_X%1_2x%2P 1
- %rep %3/2-1
- SAD_X%1_2x%2P 0
- %endrep
- SAD_X%1_END
- %endmacro
- SAD_X 3, 16, 16
- SAD_X 3, 16, 8
- SAD_X 3, 8, 16
- SAD_X 3, 8, 8
- SAD_X 3, 8, 4
- SAD_X 3, 4, 8
- SAD_X 3, 4, 4
- SAD_X 4, 16, 16
- SAD_X 4, 16, 8
- SAD_X 4, 8, 16
- SAD_X 4, 8, 8
- SAD_X 4, 8, 4
- SAD_X 4, 4, 8
- SAD_X 4, 4, 4
- ;=============================================================================
- ; SAD x3/x4 XMM
- ;=============================================================================
- %macro SAD_X3_START_1x16P_SSE2 0
- movdqa xmm3, [r0]
- movdqu xmm0, [r1]
- movdqu xmm1, [r2]
- movdqu xmm2, [r3]
- psadbw xmm0, xmm3
- psadbw xmm1, xmm3
- psadbw xmm2, xmm3
- %endmacro
- %macro SAD_X3_1x16P_SSE2 2
- movdqa xmm3, [r0+%1]
- movdqu xmm4, [r1+%2]
- movdqu xmm5, [r2+%2]
- movdqu xmm6, [r3+%2]
- psadbw xmm4, xmm3
- psadbw xmm5, xmm3
- psadbw xmm6, xmm3
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- %endmacro
- %macro SAD_X3_2x16P_SSE2 1
- %if %1
- SAD_X3_START_1x16P_SSE2
- %else
- SAD_X3_1x16P_SSE2 0, 0
- %endif
- SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
- %endmacro
- %macro SAD_X4_START_1x16P_SSE2 0
- movdqa xmm7, [r0]
- movdqu xmm0, [r1]
- movdqu xmm1, [r2]
- movdqu xmm2, [r3]
- movdqu xmm3, [r4]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
- psadbw xmm3, xmm7
- %endmacro
- %macro SAD_X4_1x16P_SSE2 2
- movdqa xmm7, [r0+%1]
- movdqu xmm4, [r1+%2]
- movdqu xmm5, [r2+%2]
- movdqu xmm6, [r3+%2]
- %ifdef ARCH_X86_64
- movdqu xmm8, [r4+%2]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm8, xmm7
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm8
- %else
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm4
- psadbw xmm6, xmm7
- movdqu xmm4, [r4+%2]
- paddw xmm1, xmm5
- psadbw xmm4, xmm7
- paddw xmm2, xmm6
- paddw xmm3, xmm4
- %endif
- %endmacro
- %macro SAD_X4_2x16P_SSE2 1
- %if %1
- SAD_X4_START_1x16P_SSE2
- %else
- SAD_X4_1x16P_SSE2 0, 0
- %endif
- SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
- %endmacro
- %macro SAD_X3_END_SSE2 0
- movhlps xmm4, xmm0
- movhlps xmm5, xmm1
- movhlps xmm6, xmm2
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- %ifdef ARCH_X86_64
- movd [r5+0], xmm0
- movd [r5+4], xmm1
- movd [r5+8], xmm2
- %else
- mov r0, r5m
- movd [r0+0], xmm0
- movd [r0+4], xmm1
- movd [r0+8], xmm2
- %endif
- RET
- %endmacro
- %macro SAD_X4_END_SSE2 0
- mov r0, r6m
- psllq xmm1, 32
- psllq xmm3, 32
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- movhlps xmm1, xmm0
- movhlps xmm3, xmm2
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- movq [r0+0], xmm0
- movq [r0+8], xmm2
- RET
- %endmacro
- ;-----------------------------------------------------------------------------
- ; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
- ; uint8_t *pix2, int i_stride, int scores[3] )
- ;-----------------------------------------------------------------------------
- %macro SAD_X_SSE2 4
- cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1
- SAD_X%1_2x%2P_SSE2 1
- %rep %3/2-1
- SAD_X%1_2x%2P_SSE2 0
- %endrep
- SAD_X%1_END_SSE2
- %endmacro
- SAD_X_SSE2 3, 16, 16, sse2
- SAD_X_SSE2 3, 16, 8, sse2
- SAD_X_SSE2 4, 16, 16, sse2
- SAD_X_SSE2 4, 16, 8, sse2
- %define movdqu lddqu
- SAD_X_SSE2 3, 16, 16, sse3
- SAD_X_SSE2 3, 16, 8, sse3
- SAD_X_SSE2 4, 16, 16, sse3
- SAD_X_SSE2 4, 16, 8, sse3
- %undef movdqu
- ;=============================================================================
- ; SAD cacheline split
- ;=============================================================================
- ; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
- ; unless the unaligned data spans the border between 2 cachelines, in which
- ; case it's really slow. The exact numbers may differ, but all Intel cpus
- ; have a large penalty for cacheline splits.
- ; (8-byte alignment exactly half way between two cachelines is ok though.)
- ; LDDQU was supposed to fix this, but it only works on Pentium 4.
- ; So in the split case we load aligned data and explicitly perform the
- ; alignment between registers, much as on architectures that have only
- ; aligned loads, except complicated by the fact that PALIGNR takes only an
- ; immediate, not a variable alignment.
- ; It is also possible to hoist the realignment to the macroblock level (keep
- ; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
- ; needed for that method often makes it slower.
- ; sad 16x16 costs on Core2:
- ; good offsets: 49 cycles (50/64 of all mvs)
- ; cacheline split: 234 cycles (14/64 of all mvs; amortized: +40 cycles)
- ; page split: 3600 cycles (14/4096 of all mvs; amortized: +11.5 cycles)
- ; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
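- ; A scalar model of the register-level realignment used below (a sketch with a
- ; hypothetical helper name; lo is the aligned 16-byte chunk containing the
- ; start of the row, hi = lo+16, shift = the misalignment, 0 < shift < 16):
- ;
- ;     static void load_split16( uint8_t *line, const uint8_t *lo,
- ;                               const uint8_t *hi, int shift )
- ;     {
- ;         /* equivalent to an unaligned 16-byte load from lo+shift, built from
- ;          * two aligned loads -- what psrldq/pslldq/por (or palignr) do below */
- ;         for( int i = 0; i < 16; i++ )
- ;             line[i] = (i + shift < 16) ? lo[i + shift] : hi[i + shift - 16];
- ;     }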
- ; computed jump assumes this loop is exactly 80 bytes
- %macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
- ALIGN 16
- sad_w16_align%1_sse2:
- movdqa xmm1, [r2+16]
- movdqa xmm2, [r2+r3+16]
- movdqa xmm3, [r2]
- movdqa xmm4, [r2+r3]
- pslldq xmm1, 16-%1
- pslldq xmm2, 16-%1
- psrldq xmm3, %1
- psrldq xmm4, %1
- por xmm1, xmm3
- por xmm2, xmm4
- psadbw xmm1, [r0]
- psadbw xmm2, [r0+r1]
- paddw xmm0, xmm1
- paddw xmm0, xmm2
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- dec r4
- jg sad_w16_align%1_sse2
- ret
- %endmacro
- ; computed jump assumes this loop is exactly 64 bytes
- %macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
- ALIGN 16
- sad_w16_align%1_ssse3:
- movdqa xmm1, [r2+16]
- movdqa xmm2, [r2+r3+16]
- palignr xmm1, [r2], %1
- palignr xmm2, [r2+r3], %1
- psadbw xmm1, [r0]
- psadbw xmm2, [r0+r1]
- paddw xmm0, xmm1
- paddw xmm0, xmm2
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- dec r4
- jg sad_w16_align%1_ssse3
- ret
- %endmacro
- %macro SAD16_CACHELINE_FUNC 2 ; cpu, height
- cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0
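- ; eax below = position of the reference pointer within its 64-byte cacheline,
- ; with bit 3 masked off so that a load split exactly in half on an 8-byte
- ; boundary still counts as cheap (see the note above); <= 0x30 means the
- ; 16-byte loads never split badly, so the plain sse2 version is used.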
- mov eax, r2m
- and eax, 0x37
- cmp eax, 0x30
- jle x264_pixel_sad_16x%2_sse2
- PROLOGUE 4,6
- mov r4d, r2d
- and r4d, 15
- %ifidn %1, ssse3
- shl r4d, 6 ; code size = 64
- %else
- lea r4, [r4*5]
- shl r4d, 4 ; code size = 80
- %endif
- %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
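- ; The 15 alignment-specialized loops are emitted back to back with a fixed
- ; code size, so sad_w16_align2 = sad_w16_align1 + size and sad_w16_addr works
- ; out to sad_w16_align1 - size, i.e. the address a hypothetical "align0" loop
- ; would have; adding r4 = misalignment*size then selects the matching loop.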
- %ifdef PIC
- lea r5, [sad_w16_addr GLOBAL]
- add r5, r4
- %else
- lea r5, [sad_w16_addr + r4 GLOBAL]
- %endif
- and r2, ~15
- mov r4d, %2/2
- pxor xmm0, xmm0
- call r5
- movhlps xmm1, xmm0
- paddw xmm0, xmm1
- movd eax, xmm0
- RET
- %endmacro
- %macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
- mov eax, r2m
- and eax, 0x17|%1|(%4>>1)
- cmp eax, 0x10|%1|(%4>>1)
- jle x264_pixel_sad_%1x%2_mmxext
- and eax, 7
- shl eax, 3
- movd mm6, [sw_64 GLOBAL]
- movd mm7, eax
- psubw mm6, mm7
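- ; mm7 = 8*(offset&7) = right shift (in bits) applied to each aligned qword,
- ; mm6 = 64-mm7 = left shift for the following qword; or'ing the two shifted
- ; halves reconstructs the unaligned data from aligned loads only.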
- PROLOGUE 4,5
- and r2, ~7
- mov r4d, %3
- pxor mm0, mm0
- %endmacro
- %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
- cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0
- SAD_CACHELINE_START_MMX2 16, %1, %1, %2
- .loop:
- movq mm1, [r2]
- movq mm2, [r2+8]
- movq mm3, [r2+16]
- movq mm4, mm2
- psrlq mm1, mm7
- psllq mm2, mm6
- psllq mm3, mm6
- psrlq mm4, mm7
- por mm1, mm2
- por mm3, mm4
- psadbw mm1, [r0]
- psadbw mm3, [r0+8]
- paddw mm0, mm1
- paddw mm0, mm3
- add r2, r3
- add r0, r1
- dec r4
- jg .loop
- movd eax, mm0
- RET
- %endmacro
- %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
- cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0
- SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
- .loop:
- movq mm1, [r2+8]
- movq mm2, [r2+r3+8]
- movq mm3, [r2]
- movq mm4, [r2+r3]
- psllq mm1, mm6
- psllq mm2, mm6
- psrlq mm3, mm7
- psrlq mm4, mm7
- por mm1, mm3
- por mm2, mm4
- psadbw mm1, [r0]
- psadbw mm2, [r0+r1]
- paddw mm0, mm1
- paddw mm0, mm2
- lea r2, [r2+2*r3]
- lea r0, [r0+2*r1]
- dec r4
- jg .loop
- movd eax, mm0
- RET
- %endmacro
- ; sad_x3/x4_cache64: check each mv.
- ; if they're all within a cacheline, use normal sad_x3/x4.
- ; otherwise, send them individually to sad_cache64.
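- ; In C terms (a sketch with hypothetical helper names, not x264's dispatch code):
- ;
- ;     void sad_x3_cache64( uint8_t *fenc, uint8_t *p0, uint8_t *p1, uint8_t *p2,
- ;                          int i_stride, int scores[3] )
- ;     {
- ;         uint8_t *pix[3] = { p0, p1, p2 };
- ;         if( splits_cacheline(p0) || splits_cacheline(p1) || splits_cacheline(p2) )
- ;             for( int i = 0; i < 3; i++ )
- ;                 scores[i] = sad_cache64( fenc, FENC_STRIDE, pix[i], i_stride );
- ;         else
- ;             sad_x3_plain( fenc, p0, p1, p2, i_stride, scores );
- ;     }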
- %macro CHECK_SPLIT 3 ; pix, width, cacheline
- mov eax, %1
- and eax, 0x17|%2|(%3>>1)
- cmp eax, 0x10|%2|(%3>>1)
- jg .split
- %endmacro
- %macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
- cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0
- CHECK_SPLIT r1m, %1, %3
- CHECK_SPLIT r2m, %1, %3
- CHECK_SPLIT r3m, %1, %3
- jmp x264_pixel_sad_x3_%1x%2_%4
- .split:
- %ifdef ARCH_X86_64
- push r3
- push r2
- mov r2, r1
- mov r1, FENC_STRIDE
- mov r3, r4
- mov r10, r0
- mov r11, r5
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11], eax
- pop r2
- mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11+4], eax
- pop r2
- mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11+8], eax
- %else
- push edi
- mov edi, [esp+28]
- push dword [esp+24]
- push dword [esp+16]
- push dword 16
- push dword [esp+20]
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+32]
- mov [edi], eax
- mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+36]
- mov [edi+4], eax
- mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [edi+8], eax
- add esp, 16
- pop edi
- %endif
- ret
- %endmacro
- %macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
- cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0
- CHECK_SPLIT r1m, %1, %3
- CHECK_SPLIT r2m, %1, %3
- CHECK_SPLIT r3m, %1, %3
- CHECK_SPLIT r4m, %1, %3
- jmp x264_pixel_sad_x4_%1x%2_%4
- .split:
- %ifdef ARCH_X86_64
- mov r11, r6m
- push r4
- push r3
- push r2
- mov r2, r1
- mov r1, FENC_STRIDE
- mov r3, r5
- mov r10, r0
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11], eax
- pop r2
- mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11+4], eax
- pop r2
- mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11+8], eax
- pop r2
- mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11+12], eax
- %else
- push edi
- mov edi, [esp+32]
- push dword [esp+28]
- push dword [esp+16]
- push dword 16
- push dword [esp+20]
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+32]
- mov [edi], eax
- mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+36]
- mov [edi+4], eax
- mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+40]
- mov [edi+8], eax
- mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [edi+12], eax
- add esp, 16
- pop edi
- %endif
- ret
- %endmacro
- %macro SADX34_CACHELINE_FUNC 5
- SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
- SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
- %endmacro
- ; instantiate the aligned sads
- %ifndef ARCH_X86_64
- SAD16_CACHELINE_FUNC_MMX2 8, 32
- SAD16_CACHELINE_FUNC_MMX2 16, 32
- SAD8_CACHELINE_FUNC_MMX2 4, 32
- SAD8_CACHELINE_FUNC_MMX2 8, 32
- SAD8_CACHELINE_FUNC_MMX2 16, 32
- SAD16_CACHELINE_FUNC_MMX2 8, 64
- SAD16_CACHELINE_FUNC_MMX2 16, 64
- %endif ; !ARCH_X86_64
- SAD8_CACHELINE_FUNC_MMX2 4, 64
- SAD8_CACHELINE_FUNC_MMX2 8, 64
- SAD8_CACHELINE_FUNC_MMX2 16, 64
- %ifndef ARCH_X86_64
- SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
- SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
- SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
- SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
- SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
- SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
- %endif ; !ARCH_X86_64
- SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
- SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
- %ifndef ARCH_X86_64
- SAD16_CACHELINE_FUNC sse2, 8
- SAD16_CACHELINE_FUNC sse2, 16
- %assign i 1
- %rep 15
- SAD16_CACHELINE_LOOP_SSE2 i
- %assign i i+1
- %endrep
- SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
- SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
- %endif ; !ARCH_X86_64
- SAD16_CACHELINE_FUNC ssse3, 8
- SAD16_CACHELINE_FUNC ssse3, 16
- %assign i 1
- %rep 15
- SAD16_CACHELINE_LOOP_SSSE3 i
- %assign i i+1
- %endrep
- SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
- SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3