deblock_horiz_lpf9.c
上传用户:tuheem
上传日期:2007-05-01
资源大小:21889k
文件大小:6k
- #include "postprocess_mmx.h"
/* Rounding term: the value 8 in each of the four 16-bit lanes.  It seeds both
 * accumulators in deblock_horiz_lpf9 before the final shift right by 4. */
static const uint64_t mm64_0008 = 0x0008000800080008;

/* The value 1 in each of the eight bytes.  Not referenced by
 * deblock_horiz_lpf9. */
static const uint64_t mm64_0101 = 0x0101010101010101;

/* Scratch slot used by deblock_horiz_lpf9 to park the packed p1/p2 bytes
 * while the MMX registers are reused.  Written on every row, so concurrent
 * calls would overwrite each other's value. */
static uint64_t mm64_temp;

/*
 * Filter weights for deblock_horiz_lpf9.
 *
 * Each entry packs four 16-bit weights.  An input pixel is replicated into
 * four 16-bit lanes and multiplied lane-wise by its entry:
 *   - a "left" entry gives that pixel's contribution to outputs v1..v4
 *     (lowest 16 bits -> v1, highest 16 bits -> v4);
 *   - a "right" entry gives its contribution to outputs v5..v8
 *     (lowest 16 bits -> v5, highest 16 bits -> v8).
 * Inputs whose weights for a half are all zero have no entry for that half
 * (p1 has no "right" entry, p2 has no "left" entry).
 *
 * For every output the weights over all inputs add up to 16, matching the
 * ">> 4" normalisation applied after summation.
 *
 * The assembly addresses this table by byte offset (8 * index), so the
 * order of the entries must not change.
 */
static const uint64_t mm64_coefs[18] = {
    0x0001000200040006, /* p1 left */ 0x0000000000000001, /* v1 right */
    0x0001000200020004, /* v1 left */ 0x0000000000010001, /* v2 right */
    0x0002000200040002, /* v2 left */ 0x0000000100010002, /* v3 right */
    0x0002000400020002, /* v3 left */ 0x0001000100020002, /* v4 right */
    0x0004000200020001, /* v4 left */ 0x0001000200020004, /* v5 right */
    0x0002000200010001, /* v5 left */ 0x0002000200040002, /* v6 right */
    0x0002000100010000, /* v6 left */ 0x0002000400020002, /* v7 right */
    0x0001000100000000, /* v7 left */ 0x0004000200020001, /* v8 right */
    0x0001000000000000, /* v8 left */ 0x0006000400020001  /* p2 right */
};

/* Packed boundary pixels for the current row: bytes, low to high, are
 * p1, p1, p2, p2.  Written by the C code and loaded by the assembly. */
static uint32_t mm32_p1p2;

/* Address of pixel v[-3] of the current row; handed to the assembly through
 * memory because the inline-assembly block loads it by name. */
static uint8_t *pmm1;
- INLINE void deblock_horiz_lpf9(uint8_t *v, int stride, int QP) {
- int y, p1, p2;
- #ifdef PP_SELF_CHECK
- uint8_t selfcheck[9];
- int psum;
- uint8_t *vv;
- int i;
- #endif
- for (y=0; y<4; y++) {
- p1 = (ABS(v[0+y*stride]-v[1+y*stride]) < QP ) ? v[0+y*stride] : v[1+y*stride];
- p2 = (ABS(v[8+y*stride]-v[9+y*stride]) < QP ) ? v[9+y*stride] : v[8+y*stride];
- mm32_p1p2 = 0x0101 * ((p2 << 16) + p1);
- #ifdef PP_SELF_CHECK
- vv = &(v[y*stride]);
- psum = p1 + p1 + p1 + vv[1] + vv[2] + vv[3] + vv[4] + 4;
- selfcheck[1] = (((psum + vv[1]) << 1) - (vv[4] - vv[5])) >> 4;
- psum += vv[5] - p1;
- selfcheck[2] = (((psum + vv[2]) << 1) - (vv[5] - vv[6])) >> 4;
- psum += vv[6] - p1;
- selfcheck[3] = (((psum + vv[3]) << 1) - (vv[6] - vv[7])) >> 4;
- psum += vv[7] - p1;
- selfcheck[4] = (((psum + vv[4]) << 1) + p1 - vv[1] - (vv[7] - vv[8])) >> 4;
- psum += vv[8] - vv[1];
- selfcheck[5] = (((psum + vv[5]) << 1) + (vv[1] - vv[2]) - vv[8] + p2) >> 4;
- psum += p2 - vv[2];
- selfcheck[6] = (((psum + vv[6]) << 1) + (vv[2] - vv[3])) >> 4;
- psum += p2 - vv[3];
- selfcheck[7] = (((psum + vv[7]) << 1) + (vv[3] - vv[4])) >> 4;
- psum += p2 - vv[4];
- selfcheck[8] = (((psum + vv[8]) << 1) + (vv[4] - vv[5])) >> 4;
- #endif
- pmm1 = (&(v[y*stride-3])); __asm {
- push eax
- push ebx
- mov eax, pmm1
- lea ebx, mm64_coefs
- #ifdef PREFETCH_ENABLE
- prefetcht0 32[ebx]
- #endif
- movd mm0, mm32_p1p2
- punpcklbw mm0, mm0
- movq mm2, qword ptr [eax]
- pxor mm7, mm7
- movq mm6, mm64_0008
- punpckhbw mm2, mm2
- movq mm64_temp, mm0
- punpcklbw mm0, mm7
- movq mm5, mm6
- pmullw mm0, [ebx]
- movq mm1, mm2
- punpcklbw mm2, mm2
- punpckhbw mm1, mm1
- #ifdef PREFETCH_ENABLE
- prefetcht0 32[ebx]
- #endif
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- paddw mm6, mm0
- movq mm0, mm2
- pmullw mm0, 8[ebx]
- movq mm4, mm3
- pmullw mm2, 16[ebx]
- pmullw mm3, 32[ebx]
- pmullw mm4, 24[ebx]
- paddw mm5, mm0
- paddw mm6, mm2
- movq mm2, mm1
- punpckhbw mm2, mm7
- paddw mm5, mm4
- punpcklbw mm1, mm7
- paddw mm6, mm3
- #ifdef PREFETCH_ENABLE
- prefetcht0 64[ebx]
- #endif
- movq mm0, mm1
- pmullw mm1, 48[ebx]
- pmullw mm0, 40[ebx]
- movq mm4, mm2
- pmullw mm2, 64[ebx]
- paddw mm6, mm1
- pmullw mm4, 56[ebx]
- pxor mm3, mm3
- movq mm1, 8[eax]
- paddw mm5, mm0
- punpcklbw mm1, mm1
- paddw mm6, mm2
- #ifdef PREFETCH_ENABLE
- prefetcht0 96[ebx]
- #endif
- movq mm2, mm1
- paddw mm5, mm4
- punpcklbw mm2, mm2
- punpckhbw mm1, mm1
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- movq mm0, mm2
- pmullw mm0, 72[ebx]
- movq mm4, mm3
- pmullw mm2, 80[ebx]
- pmullw mm3, 96[ebx]
- pmullw mm4, 88[ebx]
- paddw mm5, mm0
- paddw mm6, mm2
- movq mm2, mm1
- paddw mm6, mm3
- punpcklbw mm1, mm7
- paddw mm5, mm4
- punpckhbw mm2, mm7
- #ifdef PREFETCH_ENABLE
- prefetcht0 128[ebx]
- #endif
- movq mm3, mm64_temp
- movq mm0, mm1
- pmullw mm0, 104[ebx]
- movq mm4, mm2
- pmullw mm1, 112[ebx]
- punpckhbw mm3, mm7
- pmullw mm2, 128[ebx]
- pmullw mm4, 120[ebx]
- paddw mm5, mm0
- pmullw mm3, 136[ebx]
- paddw mm6, mm1
- paddw mm6, mm2
- paddw mm5, mm4
- psrlw mm6, 4
- paddw mm5, mm3
- psrlw mm5, 4
- packuswb mm6, mm5
- movq 4[eax], mm6
- pop ebx
- pop eax
- };
-
- #ifdef PP_SELF_CHECK
- for (i=1; i<=8; i++) {
- if (selfcheck[i] != v[i+y*stride]) {
- printf("ERROR: MMX version of horiz lpf9 is incorrect at %dn", i);
- }
- }
- #endif
- }
- }