dct_sse2.asm
资源名称:h264src.zip [点击查看]
上传用户:sunbaby
上传日期:2013-05-31
资源大小:242k
文件大小:13k
源码类别:
mpeg/mp3
开发平台:
Visual C++
- ;/*****************************************************************************
- ; *
- ; * T264 AVC CODEC
- ; *
- ; * Copyright(C) 2004-2005 llcc <lcgate1@yahoo.com.cn>
- ; * 2004-2005 visionany <visionany@yahoo.com.cn>
- ; *
- ; * This program is free software ; you can redistribute it and/or modify
- ; * it under the terms of the GNU General Public License as published by
- ; * the Free Software Foundation ; either version 2 of the License, or
- ; * (at your option) any later version.
- ; *
- ; * This program is distributed in the hope that it will be useful,
- ; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
- ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ; * GNU General Public License for more details.
- ; *
- ; * You should have received a copy of the GNU General Public License
- ; * along with this program ; if not, write to the Free Software
- ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- ; *
- ; ****************************************************************************/
- bits 32 ; assemble everything below as 32-bit code
- ; idea borrowed from xvid
- ; modified by Thomascatlee@163.com
- ; for GCC
- ; cglobal <name>
- ; Declares <name> as a global symbol. Unless NOPREFIX is defined, the symbol
- ; that is exported is _<name>, and <name> is %define'd to _<name> so that later
- ; uses of the bare name in this file resolve to the underscore-prefixed symbol.
- %macro cglobal 1
-     %ifndef NOPREFIX
-         global _%1
-         %define %1 _%1
-     %else
-         global %1
-     %endif
- %endmacro
- ; cextern <name>
- ; Declares <name> as an external symbol. Unless NOPREFIX is defined, the symbol
- ; that is imported is _<name>, and <name> is %define'd to _<name> so that later
- ; uses of the bare name in this file resolve to the underscore-prefixed symbol.
- %macro cextern 1
-     %ifndef NOPREFIX
-         extern _%1
-         %define %1 _%1
-     %else
-         extern %1
-     %endif
- %endmacro
- ; transpose: 4x4 word transpose. Rows come in as %1 %2 %3 %4 (%5 = scratch); transposed rows 0..3 end up in %1 %2 %5 %4 (0-based args: input 0 1 2 3, output 0 1 4 3); %3 is clobbered. Element numbers below are 1..16 in row-major order, written high word first.
- %macro transpose 5
- movq %5, %1 ; %5 = copy of row 0 (4 3 2 1)
- punpckhwd %5, %2 ; %5 = 8 4 7 3
- punpcklwd %1, %2 ; %1 = 6 2 5 1
- movq %2, %3 ; %2 = copy of row 2 (12 11 10 9)
- punpckhwd %2, %4 ; %2 = 16 12 15 11
- punpcklwd %3, %4 ; %3 = 14 10 13 9
- movq %4, %5 ; %4 = 8 4 7 3
- punpckhdq %4, %2 ; %4 = 16 12 8 4 (transposed row 3)
- punpckldq %5, %2 ; %5 = 15 11 7 3 (transposed row 2)
- movq %2, %1 ; %2 = 6 2 5 1
- punpckhdq %2, %3 ; %2 = 14 10 6 2 (transposed row 1)
- punpckldq %1, %3 ; %1 = 13 9 5 1 (transposed row 0)
- %endmacro
- %macro addsub 5 ; word-wise butterfly: args 1-4 = a b c d, arg 5 = scratch; results a+d in arg 1, b+c in arg 2, b-c in arg 4, a-d in arg 5 (arg 3 is left unchanged)
- movq %5, %1 ; %5 = a
- paddw %1, %4 ; %1 = a + d
- psubw %5, %4 ; %5 = a - d
- movq %4, %2 ; %4 = b
- paddw %2, %3 ; %2 = b + c
- psubw %4, %3 ; %4 = b - c
- %endmacro
- %macro addsub2 5 ; args 1-4 = s[0] s[1] s[2] s[3], arg 5 = scratch; results d[0] in arg 1, d[1] in arg 2, d[2] in arg 5, d[3] in arg 4 (arg 3 ends up holding 2 * s[2])
- movq %5, %1 ; %5 = s[0]
- paddw %1, %2 ; %1 = d[0] = s[0] + s[1]
- psubw %5, %2 ; %5 = d[2] = s[0] - s[1]
- movq %2, %4 ; %2 = s[3]
- paddw %2, %2 ; %2 = 2 * s[3]
- paddw %2, %3 ; %2 = d[1] = 2 * s[3] + s[2]
- paddw %3, %3 ; %3 = 2 * s[2]
- psubw %4, %3 ; %4 = d[3] = s[3] - 2 * s[2]
- %endmacro
- ; idct_addsub2: %1..%4 = d[0] d[1] d[2] d[3], %5 = scratch; results s[0] in %1, s[1] in %5, s[2] in %2, s[3] in %3 (0-based args: output 0 4 1 2); %4 ends up holding d[3] >> 1
- %macro idct_addsub2 5
- movq %5, %1 ; %5 = d[0]
- paddw %1, %3 ; %1 = s[0] = d[0] + d[2]
- psubw %5, %3 ; %5 = s[1] = d[0] - d[2]
- movq %3, %2 ; %3 = d[1]
- psraw %2, 1 ; %2 = d[1] >> 1 (arithmetic shift)
- psubw %2, %4 ; %2 = s[2] = (d[1] >> 1) - d[3]
- psraw %4, 1 ; %4 = d[3] >> 1 (arithmetic shift)
- paddw %3, %4 ; %3 = s[3] = d[1] + (d[3] >> 1)
- %endmacro
- ; word2dw: spreads the four words of mmx register %1 into the four dwords of xmm register %4 by interleaving them with %3 (zero-extension when %3 is zero, as the caller is expected to arrange). %1 = mmx content, %2 = tmp mmx, %3 = zero mmx, %4 = xmm result, %5 = xmm tmp; %1 and %2 are overwritten
- %macro word2dw 5
- movq %2, %1 ; %2 = copy of the input words d c b a
- punpcklwd %2, %3 ; dcba->0b0a
- punpckhwd %1, %3 ; dcba->0d0c
- movq2dq %4, %1 ; 00 00 0d 0c
- pslldq %4, 8 ; 0d 0c 00 00 (byte shift moves the pair into the upper 64 bits)
- movq2dq %5, %2 ; 00 00 0b 0a
- por %4, %5 ; 0d 0c 0b 0a
- %endmacro
- ; Constant vectors (each starts on a 16-byte boundary) and external tables.
- section .rodata data align=16
- align 16
- sse2_neg1:  times 16 dw -1          ; sixteen words of -1
- sse2_1:     times 16 dw 1           ; sixteen words of 1
- align 16
- mmx1:       times 4 dw 1            ; four words of 1
- align 16
- mmx32:      times 4 dw 32           ; four words of 32
- ; tables defined outside this file
- cextern quant
- cextern dequant
- align 16
- section .text
- ;======================================================
- ;
- ; void dct4x4_mmx(int16_t* data)
- ;
- ; In-place transform of the 4x4 block of words at data (four rows of
- ; 8 bytes). Two identical passes are run: transpose, addsub butterfly,
- ; then addsub2; the second pass works on the output of the first.
- ;
- ; In:   [esp + 4] = data
- ; Uses: eax, mm0-mm4. No emms is executed here.
- ;
- ;======================================================
- align 16
- cglobal dct4x4_mmx
- dct4x4_mmx:
-     mov     eax, [esp + 4]              ; eax = data
-     movq    mm0, [eax + 0]              ; row 0
-     movq    mm1, [eax + 8]              ; row 1
-     movq    mm2, [eax + 16]             ; row 2
-     movq    mm3, [eax + 24]             ; row 3
-
-     ; ---- pass 1 ----
-     transpose mm0, mm1, mm2, mm3, mm4   ; rows now in mm0 mm1 mm4 mm3
-     addsub    mm0, mm1, mm4, mm3, mm2   ; s[0]=mm0 s[1]=mm1 s[2]=mm3 s[3]=mm2
-     addsub2   mm0, mm1, mm3, mm2, mm4   ; d[0]=mm0 d[1]=mm1 d[2]=mm4 d[3]=mm2
-
-     ; ---- pass 2 ----
-     transpose mm0, mm1, mm4, mm2, mm3   ; rows now in mm0 mm1 mm3 mm2
-     addsub    mm0, mm1, mm3, mm2, mm4   ; s[0]=mm0 s[1]=mm1 s[2]=mm2 s[3]=mm4
-     addsub2   mm0, mm1, mm2, mm4, mm3   ; d[0]=mm0 d[1]=mm1 d[2]=mm3 d[3]=mm4
-
-     movq    [eax + 0], mm0
-     movq    [eax + 8], mm1
-     movq    [eax + 16], mm3
-     movq    [eax + 24], mm4
-     ret
- ;======================================================
- ;
- ; void dct4x4dc_mmx(int16_t* data)
- ;
- ; In-place transform of the 4x4 block of words at data. Each of the two
- ; passes is a transpose followed by two addsub butterflies (adds and
- ; subtracts only). Every result word is then rounded as (x + 1) >> 1.
- ;
- ; In:   [esp + 4] = data
- ; Uses: eax, mm0-mm4. No emms is executed here.
- ;
- ;======================================================
- align 16
- cglobal dct4x4dc_mmx
- dct4x4dc_mmx:
-     mov     eax, [esp + 4]              ; eax = data
-     movq    mm0, [eax + 0]              ; row 0
-     movq    mm1, [eax + 8]              ; row 1
-     movq    mm2, [eax + 16]             ; row 2
-     movq    mm3, [eax + 24]             ; row 3
-
-     ; ---- pass 1 ----
-     transpose mm0, mm1, mm2, mm3, mm4   ; rows now in mm0 mm1 mm4 mm3
-     addsub    mm0, mm1, mm4, mm3, mm2   ; s[0]=mm0 s[1]=mm1 s[2]=mm3 s[3]=mm2
-     addsub    mm0, mm2, mm3, mm1, mm4   ; s0+s1=mm0 s3+s2=mm2 s0-s1=mm4 s3-s2=mm1
-
-     ; ---- pass 2 ----
-     transpose mm0, mm2, mm4, mm1, mm3   ; rows now in mm0 mm2 mm3 mm1
-     addsub    mm0, mm2, mm3, mm1, mm4   ; s[0]=mm0 s[1]=mm2 s[2]=mm1 s[3]=mm4
-     addsub    mm0, mm4, mm1, mm2, mm3   ; s0+s1=mm0 s3+s2=mm4 s0-s1=mm3 s3-s2=mm2
-
-     ; ---- (x + 1) >> 1 on every word, then store ----
-     movq    mm1, [mmx1]                 ; mm1 is free again: rounding constant
-
-     paddw   mm0, mm1
-     psraw   mm0, 1
-     movq    [eax + 0], mm0
-
-     paddw   mm4, mm1
-     psraw   mm4, 1
-     movq    [eax + 8], mm4
-
-     paddw   mm3, mm1
-     psraw   mm3, 1
-     movq    [eax + 16], mm3
-
-     paddw   mm2, mm1
-     psraw   mm2, 1
-     movq    [eax + 24], mm2
-     ret
- ;======================================================
- ;
- ; void idct4x4_mmx(int16_t* data)
- ;
- ; In-place inverse transform of the 4x4 block of words at data. Each of
- ; the two passes is transpose, idct_addsub2, addsub. Every result word is
- ; then rounded as (x + 32) >> 6.
- ;
- ; In:   [esp + 4] = data
- ; Uses: eax, mm0-mm4. No emms is executed here.
- ;
- ;======================================================
- align 16
- cglobal idct4x4_mmx
- idct4x4_mmx:
-     mov     eax, [esp + 4]              ; eax = data
-     movq    mm0, [eax + 0]              ; row 0
-     movq    mm1, [eax + 8]              ; row 1
-     movq    mm2, [eax + 16]             ; row 2
-     movq    mm3, [eax + 24]             ; row 3
-
-     ; ---- pass 1 ----
-     transpose    mm0, mm1, mm2, mm3, mm4 ; rows now in mm0 mm1 mm4 mm3
-     idct_addsub2 mm0, mm1, mm4, mm3, mm2 ; s[0]=mm0 s[1]=mm2 s[2]=mm1 s[3]=mm4
-     addsub       mm0, mm2, mm1, mm4, mm3 ; s0+s3=mm0 s1+s2=mm2 s1-s2=mm4 s0-s3=mm3
-
-     ; ---- pass 2 ----
-     transpose    mm0, mm2, mm4, mm3, mm1 ; rows now in mm0 mm2 mm1 mm3
-     idct_addsub2 mm0, mm2, mm1, mm3, mm4 ; s[0]=mm0 s[1]=mm4 s[2]=mm2 s[3]=mm1
-     addsub       mm0, mm4, mm2, mm1, mm3 ; s0+s3=mm0 s1+s2=mm4 s1-s2=mm1 s0-s3=mm3
-
-     ; ---- (x + 32) >> 6 on every word, then store ----
-     movq    mm2, [mmx32]                ; mm2 is free again: rounding constant
-
-     paddw   mm0, mm2
-     psraw   mm0, 6
-     movq    [eax + 0], mm0
-
-     paddw   mm4, mm2
-     psraw   mm4, 6
-     movq    [eax + 8], mm4
-
-     paddw   mm1, mm2
-     psraw   mm1, 6
-     movq    [eax + 16], mm1
-
-     paddw   mm3, mm2
-     psraw   mm3, 6
-     movq    [eax + 24], mm3
-     ret
- ;======================================================
- ;
- ; void idct4x4dc_mmx(int16_t* data)
- ;
- ; In-place transform of the 4x4 block of words at data. Each of the two
- ; passes is a transpose followed by two addsub butterflies (adds and
- ; subtracts only); the result is stored without rounding or scaling.
- ;
- ; In:   [esp + 4] = data
- ; Uses: eax, mm0-mm4. No emms is executed here.
- ;
- ;======================================================
- align 16
- cglobal idct4x4dc_mmx
- idct4x4dc_mmx:
-     mov     eax, [esp + 4]              ; eax = data
-     movq    mm0, [eax + 0]              ; row 0
-     movq    mm1, [eax + 8]              ; row 1
-     movq    mm2, [eax + 16]             ; row 2
-     movq    mm3, [eax + 24]             ; row 3
-
-     ; ---- pass 1 ----
-     transpose mm0, mm1, mm2, mm3, mm4   ; rows r0..r3 now in mm0 mm1 mm4 mm3
-     addsub    mm0, mm1, mm3, mm4, mm2   ; r0+r2=mm0 r1+r3=mm1 r1-r3=mm4 r0-r2=mm2
-                                         ; s[0]=mm0 s[1]=mm2 s[2]=mm4 s[3]=mm1
-     addsub    mm0, mm2, mm4, mm1, mm3   ; s0+s3=mm0 s1+s2=mm2 s1-s2=mm1 s0-s3=mm3
-
-     ; ---- pass 2 ----
-     transpose mm0, mm2, mm1, mm3, mm4   ; rows r0..r3 now in mm0 mm2 mm4 mm3
-     addsub    mm0, mm2, mm3, mm4, mm1   ; r0+r2=mm0 r1+r3=mm2 r1-r3=mm4 r0-r2=mm1
-                                         ; s[0]=mm0 s[1]=mm1 s[2]=mm4 s[3]=mm2
-     addsub    mm0, mm1, mm4, mm2, mm3   ; s0+s3=mm0 s1+s2=mm1 s1-s2=mm2 s0-s3=mm3
-
-     movq    [eax + 0], mm0
-     movq    [eax + 8], mm1
-     movq    [eax + 16], mm2
-     movq    [eax + 24], mm3
-     ret
- ;======================================================
- ;
- ; void quant4x4_sse2(int16_t* data, const int32_t Qp, int32_t is_intra)
- ;
- ; Quantizes the 16 words at data in place. With
- ;     qbits = 15 + Qp / 6
- ;     mf    = quant + (Qp % 6) * 32 bytes   (16 words per table row)
- ;     f     = (1 << qbits) / (is_intra ? 3 : 6)
- ; every coefficient c is replaced by
- ;     c > 0 :   (( c * mf[i]) + f) >> qbits
- ;     c < 0 : -(((-c * mf[i]) + f) >> qbits)
- ;     c = 0 :   0
- ; with the 32-bit intermediate packed back to a word with signed
- ; saturation (packssdw).
- ;
- ; data and the selected quant row are accessed with movdqa, so both have
- ; to be 16-byte aligned.
- ;
- ; In:   [esp + 4] = data, [esp + 8] = Qp, [esp + 12] = is_intra
- ; Uses: eax, ecx, edx, xmm0-xmm7; ebx, esi, edi, ebp are saved and restored.
- ;       Only integer and XMM registers are touched (no MMX registers), so
- ;       this routine does not leave any MMX state behind.
- ;
- ;======================================================
-
- ; quant_8coefs <byte offset>
- ; Quantizes the eight words at [edi + offset] using the factors at
- ; [esi + offset]. Expects xmm6 = f in all four dwords and xmm7 = qbits as a
- ; shift count. Positive and negative inputs are handled in two separate
- ; masked passes whose results occupy disjoint lanes and are OR-ed together.
- ; Clobbers xmm0-xmm5.
- %macro quant_8coefs 1
-     movdqa      xmm0, [edi + %1]        ; data
-     movdqa      xmm1, [esi + %1]        ; quant factors
-
-     ; ---- lanes with data > 0 ----
-     pxor        xmm4, xmm4
-     movdqa      xmm2, xmm0              ; xmm2 = untouched copy of data
-     pcmpgtw     xmm0, xmm4              ; word mask: data > 0
-     movdqa      xmm4, xmm0              ; keep the mask
-     pand        xmm0, xmm2              ; positive values, 0 elsewhere
-     movdqa      xmm3, xmm0
-     pmullw      xmm0, xmm1              ; low 16 bits of data * quant
-     pmulhw      xmm3, xmm1              ; high 16 bits of data * quant
-     movdqa      xmm5, xmm0
-     punpcklwd   xmm0, xmm3              ; words 0-3 as 32-bit products
-     punpckhwd   xmm5, xmm3              ; words 4-7 as 32-bit products
-     movdqa      xmm3, xmm4
-     punpcklwd   xmm4, xmm4              ; widen mask for words 0-3
-     pand        xmm4, xmm6              ; f only in selected lanes
-     paddd       xmm0, xmm4              ; data * quant + f
-     psrad       xmm0, xmm7              ; >> qbits
-     punpckhwd   xmm3, xmm3              ; widen mask for words 4-7
-     pand        xmm3, xmm6
-     paddd       xmm5, xmm3              ; data * quant + f
-     psrad       xmm5, xmm7              ; >> qbits
-     packssdw    xmm0, xmm5              ; xmm0 = result for positive lanes
-
-     ; ---- lanes with data < 0 ----
-     pxor        xmm4, xmm4
-     movdqa      xmm5, xmm2
-     pcmpgtw     xmm4, xmm2              ; word mask: 0 > data
-     pand        xmm5, xmm4              ; negative values, 0 elsewhere
-     pmullw      xmm5, [sse2_neg1]       ; negate -> magnitudes
-     movdqa      xmm3, xmm5
-     pmullw      xmm5, xmm1              ; low 16 bits of |data| * quant
-     pmulhw      xmm3, xmm1              ; high 16 bits of |data| * quant
-     movdqa      xmm1, xmm5
-     punpcklwd   xmm5, xmm3              ; words 0-3 as 32-bit products
-     punpckhwd   xmm1, xmm3              ; words 4-7 as 32-bit products
-     movdqa      xmm3, xmm4
-     punpcklwd   xmm4, xmm4
-     pand        xmm4, xmm6
-     paddd       xmm5, xmm4              ; |data| * quant + f
-     psrad       xmm5, xmm7
-     punpckhwd   xmm3, xmm3
-     pand        xmm3, xmm6
-     paddd       xmm1, xmm3
-     psrad       xmm1, xmm7
-     packssdw    xmm5, xmm1
-     pmullw      xmm5, [sse2_neg1]       ; restore the sign
-
-     por         xmm5, xmm0              ; merge positive and negative lanes
-     movdqa      [edi + %1], xmm5
- %endmacro
-
- align 16
- cglobal quant4x4_sse2
- quant4x4_sse2:
-     push    ebx
-     push    esi
-     push    edi
-     push    ebp
-
-     mov     edi, [esp + 4 + 16]         ; edi = data
-     mov     eax, [esp + 8 + 16]         ; eax = Qp
-     cdq
-     mov     ebp, [esp + 12 + 16]        ; ebp = is_intra
-     mov     ebx, 6
-     idiv    ebx                         ; eax = Qp / 6, edx = Qp % 6
-     add     eax, 15                     ; eax = qbits
-     mov     esi, edx
-     shl     esi, 5                      ; 32 bytes per table row
-     add     esi, quant                  ; esi = quant row for Qp % 6
-     mov     ecx, eax                    ; ecx = qbits
-
-     ; ebp = is_intra ? 3 : 6, computed without a branch
-     neg     ebp                         ; CF = (is_intra != 0)
-     sbb     ebp, ebp                    ; ebp = is_intra ? -1 : 0
-     and     ebp, 0xfffffffd             ; ebp = is_intra ? -3 : 0
-     add     ebp, 6                      ; ebp = is_intra ?  3 : 6
-
-     mov     eax, 1
-     shl     eax, cl                     ; 1 << qbits
-     cdq
-     idiv    ebp                         ; eax = f
-
-     ; Move the scalars straight into XMM registers; going through MMX
-     ; registers would require an emms before any later x87 code.
-     movd    xmm6, eax
-     pshufd  xmm6, xmm6, 0               ; xmm6 = f f f f
-     movd    xmm7, ecx                   ; xmm7 = qbits (shift count)
-
-     quant_8coefs 0                      ; coefficients 0-7
-     quant_8coefs 16                     ; coefficients 8-15
-
-     pop     ebp
-     pop     edi
-     pop     esi
-     pop     ebx
-     ret
- ;======================================================
- ;
- ; void iquant4x4_sse2(int16_t* data, const int32_t Qp)
- ;
- ; Dequantizes the 16 words at data in place:
- ;     data[i] = (data[i] * dq[i]) << (Qp / 6)
- ; where dq = dequant + (Qp % 6) * 32 bytes (16 words per table row) and
- ; only the low 16 bits of each result are kept.
- ;
- ; data and the selected dequant row are accessed as aligned 16-byte
- ; operands, so both have to be 16-byte aligned.
- ;
- ; In:   [esp + 4] = data, [esp + 8] = Qp
- ; Uses: eax, ecx, edx, xmm0, xmm2, xmm7
- ;
- ;======================================================
- align 16
- cglobal iquant4x4_sse2
- iquant4x4_sse2:
-     mov     eax, [esp + 8]              ; eax = Qp
-     cdq
-     mov     ecx, 6
-     idiv    ecx                         ; eax = qbits = Qp / 6, edx = Qp % 6
-     mov     ecx, edx
-     shl     ecx, 5                      ; 32 bytes per table row
-     add     ecx, dequant                ; ecx = dequant row for Qp % 6
-     mov     edx, [esp + 4]              ; edx = data
-
-     movd    xmm7, eax                   ; xmm7 = qbits (shift count)
-     movdqa  xmm0, [edx + 0]             ; coefficients 0-7
-     movdqa  xmm2, [edx + 16]            ; coefficients 8-15
-     pmullw  xmm0, [ecx + 0]             ; * dequant factors (low 16 bits)
-     pmullw  xmm2, [ecx + 16]
-     ; Shifting left by qbits yields the same low 16 bits as multiplying
-     ; by a vector of (1 << qbits), without the constant load and the
-     ; extra multiplies.
-     psllw   xmm0, xmm7
-     psllw   xmm2, xmm7
-     movdqa  [edx + 0], xmm0
-     movdqa  [edx + 16], xmm2
-     ret