deblock-a.asm
资源名称:chapter15.rar [点击查看]
上传用户:hjq518
上传日期:2021-12-09
资源大小:5084k
文件大小:13k
源码类别:
Audio
开发平台:
Visual C++
- ;*****************************************************************************
- ;* deblock-a.asm: h264 encoder library
- ;*****************************************************************************
- ;* Copyright (C) 2005 x264 project
- ;*
- ;* Authors: Loren Merritt <lorenm@u.washington.edu>
- ;*
- ;* This program is free software; you can redistribute it and/or modify
- ;* it under the terms of the GNU General Public License as published by
- ;* the Free Software Foundation; either version 2 of the License, or
- ;* (at your option) any later version.
- ;*
- ;* This program is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ;* GNU General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU General Public License
- ;* along with this program; if not, write to the Free Software
- ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
- ;*****************************************************************************
- BITS 64
- %include "amd64inc.asm"
- SECTION .rodata align=16
- pb_01: times 16 db 0x01
- pb_3f: times 16 db 0x3f
- pb_ff: times 16 db 0xff
- SECTION .text
- cglobal x264_deblock_v_luma_sse2
- cglobal x264_deblock_h_luma_sse2
- cglobal x264_deblock_v_chroma_mmxext
- cglobal x264_deblock_h_chroma_mmxext
- cglobal x264_deblock_v_chroma_intra_mmxext
- cglobal x264_deblock_h_chroma_intra_mmxext
- ; expands to [base],...,[base+7*stride]
- %define PASS8ROWS(base, base3, stride, stride3)
- [base], [base+stride], [base+stride*2], [base3],
- [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
- ; in: 8 rows of 4 bytes in %1..%8
- ; out: 4 rows of 8 bytes in mm0..mm3
- %macro TRANSPOSE4x8_LOAD 8
- movd mm0, %1
- movd mm2, %2
- movd mm1, %3
- movd mm3, %4
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
- movq mm2, mm0
- punpcklwd mm0, mm1
- punpckhwd mm2, mm1
- movd mm4, %5
- movd mm6, %6
- movd mm5, %7
- movd mm7, %8
- punpcklbw mm4, mm6
- punpcklbw mm5, mm7
- movq mm6, mm4
- punpcklwd mm4, mm5
- punpckhwd mm6, mm5
- movq mm1, mm0
- movq mm3, mm2
- punpckldq mm0, mm4
- punpckhdq mm1, mm4
- punpckldq mm2, mm6
- punpckhdq mm3, mm6
- %endmacro
- ; in: 4 rows of 8 bytes in mm0..mm3
- ; out: 8 rows of 4 bytes in %1..%8
- %macro TRANSPOSE8x4_STORE 8
- movq mm4, mm0
- movq mm5, mm1
- movq mm6, mm2
- punpckhdq mm4, mm4
- punpckhdq mm5, mm5
- punpckhdq mm6, mm6
- punpcklbw mm0, mm1
- punpcklbw mm2, mm3
- movq mm1, mm0
- punpcklwd mm0, mm2
- punpckhwd mm1, mm2
- movd %1, mm0
- punpckhdq mm0, mm0
- movd %2, mm0
- movd %3, mm1
- punpckhdq mm1, mm1
- movd %4, mm1
- punpckhdq mm3, mm3
- punpcklbw mm4, mm5
- punpcklbw mm6, mm3
- movq mm5, mm4
- punpcklwd mm4, mm6
- punpckhwd mm5, mm6
- movd %5, mm4
- punpckhdq mm4, mm4
- movd %6, mm4
- movd %7, mm5
- punpckhdq mm5, mm5
- movd %8, mm5
- %endmacro
- %macro SBUTTERFLY 4
- movq %4, %2
- punpckl%1 %2, %3
- punpckh%1 %4, %3
- %endmacro
- ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
- ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
- %macro TRANSPOSE6x8_MEM 9
- movq mm0, %1
- movq mm1, %3
- movq mm2, %5
- movq mm3, %7
- SBUTTERFLY bw, mm0, %2, mm4
- SBUTTERFLY bw, mm1, %4, mm5
- SBUTTERFLY bw, mm2, %6, mm6
- movq [%9+0x10], mm5
- SBUTTERFLY bw, mm3, %8, mm7
- SBUTTERFLY wd, mm0, mm1, mm5
- SBUTTERFLY wd, mm2, mm3, mm1
- punpckhdq mm0, mm2
- movq [%9+0x00], mm0
- SBUTTERFLY wd, mm4, [%9+0x10], mm3
- SBUTTERFLY wd, mm6, mm7, mm2
- SBUTTERFLY dq, mm4, mm6, mm0
- SBUTTERFLY dq, mm5, mm1, mm7
- punpckldq mm3, mm2
- movq [%9+0x10], mm5
- movq [%9+0x20], mm7
- movq [%9+0x30], mm4
- movq [%9+0x40], mm0
- movq [%9+0x50], mm3
- %endmacro
- ; out: %4 = |%1-%2|>%3
- ; clobbers: %5
- %macro DIFF_GT 6
- mov%1 %6, %3
- mov%1 %5, %2
- psubusb %6, %2
- psubusb %5, %3
- por %5, %6
- psubusb %5, %4
- %endmacro
- %macro DIFF_GT_MMX 5
- DIFF_GT q, %1, %2, %3, %4, %5
- %endmacro
- %macro DIFF_GT_SSE2 5
- DIFF_GT dqa, %1, %2, %3, %4, %5
- %endmacro
- ; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1
- ; out: mm5=beta-1, mm7=mask
- ; clobbers: mm4,mm6
- %macro LOAD_MASK_MMX 2
- movd mm4, %1
- movd mm5, %2
- pshufw mm4, mm4, 0
- pshufw mm5, mm5, 0
- packuswb mm4, mm4 ; 8x alpha-1
- packuswb mm5, mm5 ; 8x beta-1
- DIFF_GT_MMX mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1
- DIFF_GT_MMX mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1
- por mm7, mm4
- DIFF_GT_MMX mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1
- por mm7, mm4
- pxor mm6, mm6
- pcmpeqb mm7, mm6
- %endmacro
- %macro LOAD_MASK_SSE2 2
- movd xmm4, %1
- movd xmm5, %2
- pshuflw xmm4, xmm4, 0
- pshuflw xmm5, xmm5, 0
- punpcklqdq xmm4, xmm4
- punpcklqdq xmm5, xmm5
- packuswb xmm4, xmm4 ; 16x alpha-1
- packuswb xmm5, xmm5 ; 16x beta-1
- DIFF_GT_SSE2 xmm1, xmm2, xmm4, xmm7, xmm6 ; |p0-q0| > alpha-1
- DIFF_GT_SSE2 xmm0, xmm1, xmm5, xmm4, xmm6 ; |p1-p0| > beta-1
- por xmm7, xmm4
- DIFF_GT_SSE2 xmm3, xmm2, xmm5, xmm4, xmm6 ; |q1-q0| > beta-1
- por xmm7, xmm4
- pxor xmm6, xmm6
- pcmpeqb xmm7, xmm6
- %endmacro
- ; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
- ; out: mm1=p0' mm2=q0'
- ; clobbers: mm0,3-6
- %macro DEBLOCK_P0_Q0 2
- ; a = q0^p0^((p1-q1)>>2)
- mov%1 %2m4, %2m0
- psubb %2m4, %2m3
- psrlw %2m4, 2
- pxor %2m4, %2m1
- pxor %2m4, %2m2
- ; b = p0^(q1>>2)
- psrlw %2m3, 2
- pand %2m3, [pb_3f GLOBAL]
- mov%1 %2m5, %2m1
- pxor %2m5, %2m3
- ; c = q0^(p1>>2)
- psrlw %2m0, 2
- pand %2m0, [pb_3f GLOBAL]
- mov%1 %2m6, %2m2
- pxor %2m6, %2m0
- ; d = (c^b) & ~(b^a) & 1
- pxor %2m6, %2m5
- pxor %2m5, %2m4
- pandn %2m5, %2m6
- pand %2m5, [pb_01 GLOBAL]
- ; delta = (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3
- ; = (avg(q0, p1>>2) + (d&a))
- ; - (avg(p0, q1>>2) + (d^(d&a)))
- pavgb %2m0, %2m2
- pand %2m4, %2m5
- paddusb %2m0, %2m4
- pavgb %2m3, %2m1
- pxor %2m4, %2m5
- paddusb %2m3, %2m4
- ; p0 += clip(delta, -tc0, tc0)
- ; q0 -= clip(delta, -tc0, tc0)
- mov%1 %2m4, %2m0
- psubusb %2m0, %2m3
- psubusb %2m3, %2m4
- pminub %2m0, %2m7
- pminub %2m3, %2m7
- paddusb %2m1, %2m0
- paddusb %2m2, %2m3
- psubusb %2m1, %2m3
- psubusb %2m2, %2m0
- %endmacro
- %macro DEBLOCK_P0_Q0_MMX 0
- DEBLOCK_P0_Q0 q, m
- %endmacro
- %macro DEBLOCK_P0_Q0_SSE2 0
- DEBLOCK_P0_Q0 dqa, xm
- %endmacro
- ; in: mm1=p0 mm2=q0
- ; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
- ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
- ; clobbers: q2, tmp, tc0
- %macro LUMA_Q1_SSE2 6
- movdqa %6, xmm1
- pavgb %6, xmm2
- pavgb %2, %6 ; avg(p2,avg(p0,q0))
- pxor %6, %3
- pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
- psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
- movdqa %6, %1
- psubusb %6, %5
- paddusb %5, %1
- pmaxub %2, %6
- pminub %2, %5
- movdqa %4, %2
- %endmacro
- SECTION .text
- ALIGN 16
- ;-----------------------------------------------------------------------------
- ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
- ;-----------------------------------------------------------------------------
- x264_deblock_v_luma_sse2:
- ; rdi = pix
- movsxd rsi, esi ; stride
- dec edx ; alpha-1
- dec ecx ; beta-1
- movd xmm8, [r8] ; tc0
- mov r8, rdi
- sub r8, rsi
- sub r8, rsi
- sub r8, rsi ; pix-3*stride
- movdqa xmm0, [r8+rsi] ; p1
- movdqa xmm1, [r8+2*rsi] ; p0
- movdqa xmm2, [rdi] ; q0
- movdqa xmm3, [rdi+rsi] ; q1
- LOAD_MASK_SSE2 edx, ecx
- punpcklbw xmm8, xmm8
- punpcklbw xmm8, xmm8 ; xmm8 = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
- movdqa xmm9, [pb_ff GLOBAL]
- pcmpeqb xmm9, xmm8
- pandn xmm9, xmm7
- pand xmm8, xmm9
- movdqa xmm3, [r8] ; p2
- DIFF_GT_SSE2 xmm1, xmm3, xmm5, xmm6, xmm7 ; |p2-p0| > beta-1
- pandn xmm6, xmm9
- pcmpeqb xmm6, xmm9
- pand xmm6, xmm9
- movdqa xmm7, [pb_01 GLOBAL]
- pand xmm7, xmm6
- pand xmm6, xmm8
- paddb xmm7, xmm8
- LUMA_Q1_SSE2 xmm0, xmm3, [r8], [r8+rsi], xmm6, xmm4
- movdqa xmm4, [rdi+2*rsi] ; q2
- DIFF_GT_SSE2 xmm2, xmm4, xmm5, xmm6, xmm3 ; |q2-q0| > beta-1
- pandn xmm6, xmm9
- pcmpeqb xmm6, xmm9
- pand xmm6, xmm9
- pand xmm8, xmm6
- pand xmm6, [pb_01 GLOBAL]
- paddb xmm7, xmm6
- movdqa xmm3, [rdi+rsi]
- LUMA_Q1_SSE2 xmm3, xmm4, [rdi+2*rsi], [rdi+rsi], xmm8, xmm6
- DEBLOCK_P0_Q0_SSE2
- movdqa [r8+2*rsi], xmm1
- movdqa [rdi], xmm2
- ret
- ALIGN 16
- ;-----------------------------------------------------------------------------
- ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
- ;-----------------------------------------------------------------------------
- x264_deblock_h_luma_sse2:
- movsxd r10, esi
- lea r11, [r10+r10*2]
- lea rax, [rdi-4]
- lea r9, [rdi-4+r11]
- %define pix_tmp rsp-104 ; 16x6 for the buffer + 8 for x264_deblock_v_luma_sse2's return address
- ; transpose 6x16 -> tmp space
- TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
- lea rax, [rax+r10*8]
- lea r9, [r9 +r10*8]
- TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
- ; vertical filter
- ; alpha, beta, tc0 are still in edx, ecx, r8
- ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
- lea rdi, [pix_tmp+0x30]
- mov esi, 0x10
- call x264_deblock_v_luma_sse2
- ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
- add rax, 2
- add r9, 2
- movq mm0, [pix_tmp+0x18]
- movq mm1, [pix_tmp+0x28]
- movq mm2, [pix_tmp+0x38]
- movq mm3, [pix_tmp+0x48]
- TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
- shl r10, 3
- sub rax, r10
- sub r9, r10
- shr r10, 3
- movq mm0, [pix_tmp+0x10]
- movq mm1, [pix_tmp+0x20]
- movq mm2, [pix_tmp+0x30]
- movq mm3, [pix_tmp+0x40]
- TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
- ret
- %macro CHROMA_V_START 0
- ; rdi = pix
- movsxd rsi, esi ; stride
- dec edx ; alpha-1
- dec ecx ; beta-1
- mov rax, rdi
- sub rax, rsi
- sub rax, rsi
- %endmacro
- %macro CHROMA_H_START 0
- movsxd rsi, esi
- dec edx
- dec ecx
- sub rdi, 2
- lea r9, [rsi+rsi*2]
- mov rax, rdi
- add rdi, r9
- %endmacro
- ALIGN 16
- ;-----------------------------------------------------------------------------
- ; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
- ;-----------------------------------------------------------------------------
- x264_deblock_v_chroma_mmxext:
- CHROMA_V_START
- movq mm0, [rax]
- movq mm1, [rax+rsi]
- movq mm2, [rdi]
- movq mm3, [rdi+rsi]
- LOAD_MASK_MMX edx, ecx
- movd mm6, [r8] ; tc0
- punpcklbw mm6, mm6
- pand mm7, mm6
- DEBLOCK_P0_Q0_MMX
- movq [rax+rsi], mm1
- movq [rdi], mm2
- ret
- ALIGN 16
- ;-----------------------------------------------------------------------------
- ; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
- ;-----------------------------------------------------------------------------
- x264_deblock_h_chroma_mmxext:
- CHROMA_H_START
- TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
- movq [rsp-8], mm0
- movq [rsp-16], mm3
- LOAD_MASK_MMX edx, ecx
- movd mm6, [r8] ; tc0
- punpcklbw mm6, mm6
- pand mm7, mm6
- DEBLOCK_P0_Q0_MMX
- movq mm0, [rsp-8]
- movq mm3, [rsp-16]
- TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)
- ret
- ; in: %1=p0 %2=p1 %3=q1
- ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
- %macro CHROMA_INTRA_P0 3
- movq mm4, %1
- pxor mm4, %3
- pand mm4, [pb_01 GLOBAL] ; mm4 = (p0^q1)&1
- pavgb %1, %3
- psubusb %1, mm4
- pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
- %endmacro
- %macro CHROMA_INTRA_BODY 0
- LOAD_MASK_MMX edx, ecx
- movq mm5, mm1
- movq mm6, mm2
- CHROMA_INTRA_P0 mm1, mm0, mm3
- CHROMA_INTRA_P0 mm2, mm3, mm0
- psubb mm1, mm5
- psubb mm2, mm6
- pand mm1, mm7
- pand mm2, mm7
- paddb mm1, mm5
- paddb mm2, mm6
- %endmacro
- ALIGN 16
- ;-----------------------------------------------------------------------------
- ; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
- ;-----------------------------------------------------------------------------
- x264_deblock_v_chroma_intra_mmxext:
- CHROMA_V_START
- movq mm0, [rax]
- movq mm1, [rax+rsi]
- movq mm2, [rdi]
- movq mm3, [rdi+rsi]
- CHROMA_INTRA_BODY
- movq [rax+rsi], mm1
- movq [rdi], mm2
- ret
- ALIGN 16
- ;-----------------------------------------------------------------------------
- ; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
- ;-----------------------------------------------------------------------------
- x264_deblock_h_chroma_intra_mmxext:
- CHROMA_H_START
- TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
- CHROMA_INTRA_BODY
- TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)
- ret