idct_sse2.cpp
上传用户:xjjlds
上传日期:2015-12-05
资源大小:22823k
文件大小:17k
- #include "stdafx.h"
- #include "libmpeg2.h"
- // Intel's SSE2 implementation of iDCT
- // AP-945
- // http://cache-www.intel.com/cd/00/00/01/76/17680_w_idct.pdf
- #define BITS_INV_ACC 4 // 4 or 5 for IEEE
- #define SHIFT_INV_ROW 16 - BITS_INV_ACC
- #define SHIFT_INV_COL 1 + BITS_INV_ACC
- const short RND_INV_ROW = 1024 * (6 - BITS_INV_ACC); //1 << (SHIFT_INV_ROW-1)
- const short RND_INV_COL = 16 * (BITS_INV_ACC - 3); // 1 << (SHIFT_INV_COL-1)
- const short RND_INV_CORR = RND_INV_COL - 1; // correction -1.0 and round
- __declspec(align(16)) short M128_one_corr[8] = {1,1,1,1,1,1,1,1};
- __declspec(align(16)) short M128_round_inv_row[8] = {RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0};
- __declspec(align(16)) short M128_round_inv_col[8] = {RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL};
- __declspec(align(16)) short M128_round_inv_corr[8]= {RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR};
- __declspec(align(16)) short M128_tg_1_16[8] = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036}; // tg * (2<<16) + 0.5
- __declspec(align(16)) short M128_tg_2_16[8] = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146}; // tg * (2<<16) + 0.5
- __declspec(align(16)) short M128_tg_3_16[8] = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // tg * (2<<16) + 0.5
- __declspec(align(16)) short M128_cos_4_16[8] = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195};// cos * (2<<16) + 0.5
- //-----------------------------------------------------------------------------
- // Table for rows 0,4 - constants are multiplied on cos_4_16
- //movq -> w13 w12 w09 w08 w05 w04 w01 w00
- // w15 w14 w11 w10 w07 w06 w03 w02
- // w29 w28 w25 w24 w21 w20 w17 w16
- // w31 w30 w27 w26 w23 w22 w19 w18
- __declspec(align(16)) short M128_tab_i_04[] =
- {
- 16384, 21407, 16384, 8867, //movq -> w05 w04 w01 w00
- 16384, -8867, 16384, -21407, // w13 w12 w09 w08
- 16384, 8867, -16384, -21407, // w07 w06 w03 w02
- -16384, 21407, 16384, -8867, // w15 w14 w11 w10
- 22725, 19266, 19266, -4520, // w21 w20 w17 w16
- 12873, -22725, 4520, -12873, // w29 w28 w25 w24
- 12873, 4520, -22725, -12873, // w23 w22 w19 w18
- 4520, 19266, 19266, -22725 // w31 w30 w27 w26
- };
- // Table for rows 1,7 - constants are multiplied on cos_1_16
- __declspec(align(16)) short M128_tab_i_17[] =
- {
- 22725, 29692, 22725, 12299, //movq -> w05 w04 w01 w00
- 22725, -12299, 22725, -29692, // w13 w12 w09 w08
- 22725, 12299, -22725, -29692, // w07 w06 w03 w02
- -22725, 29692, 22725, -12299, // w15 w14 w11 w10
- 31521, 26722, 26722, -6270, // w21 w20 w17 w16
- 17855, -31521, 6270, -17855, // w29 w28 w25 w24
- 17855, 6270, -31521, -17855, // w23 w22 w19 w18
- 6270, 26722, 26722, -31521 // w31 w30 w27 w26
- };
- // Table for rows 2,6 - constants are multiplied on cos_2_16
- __declspec(align(16)) short M128_tab_i_26[] =
- {
- 21407, 27969, 21407, 11585, //movq -> w05 w04 w01 w00
- 21407, -11585, 21407, -27969, // w13 w12 w09 w08
- 21407, 11585, -21407, -27969, // w07 w06 w03 w02
- -21407, 27969, 21407, -11585, // w15 w14 w11 w10
- 29692, 25172, 25172, -5906, // w21 w20 w17 w16
- 16819, -29692, 5906, -16819, // w29 w28 w25 w24
- 16819, 5906, -29692, -16819, // w23 w22 w19 w18
- 5906, 25172, 25172, -29692 // w31 w30 w27 w26
- };
- // Table for rows 3,5 - constants are multiplied on cos_3_16
- __declspec(align(16)) short M128_tab_i_35[] =
- {
- 19266, 25172, 19266, 10426, //movq -> w05 w04 w01 w00
- 19266, -10426, 19266, -25172, // w13 w12 w09 w08
- 19266, 10426, -19266, -25172, // w07 w06 w03 w02
- -19266, 25172, 19266, -10426, // w15 w14 w11 w10
- 26722, 22654, 22654, -5315, // w21 w20 w17 w16
- 15137, -26722, 5315, -15137, // w29 w28 w25 w24
- 15137, 5315, -26722, -15137, // w23 w22 w19 w18
- 5315, 22654, 22654, -26722 // w31 w30 w27 w26
- };
- //-----------------------------------------------------------------------------
- /*
- ;=============================================================================
- ;=============================================================================
- ;=============================================================================
- ;
- ; Inverse DCT
- ;
- ;-----------------------------------------------------------------------------
- ;
- ; This implementation calculates iDCT-2D by a row-column method.
- ; On the first stage the iDCT-1D is calculated for each row with use
- ; direct algorithm, on the second stage the calculation is executed
- ; at once for four columns with use of scaled iDCT-1D algorithm.
- ; Base R&Y algorithm for iDCT-1D is modified for second stage.
- ;
- ;=============================================================================
- ;-----------------------------------------------------------------------------
- ;
- ; The first stage - inverse DCTs of rows
- ;
- ;-----------------------------------------------------------------------------
- ; The 8-point inverse DCT direct algorithm
- ;-----------------------------------------------------------------------------
- ;
- ; static const short w[32] = {
- ; FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
- ; FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
- ; FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
- ; FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
- ; FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
- ; FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
- ; FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
- ; FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
- ;
- ; #define DCT_8_INV_ROW(x, y)
- ; {
- ; int a0, a1, a2, a3, b0, b1, b2, b3;
- ;
- ; a0 = x[0] * w[ 0] + x[2] * w[ 1] + x[4] * w[ 2] + x[6] * w[ 3];
- ; a1 = x[0] * w[ 4] + x[2] * w[ 5] + x[4] * w[ 6] + x[6] * w[ 7];
- ; a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11];
- ; a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
- ; b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
- ; b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
- ; b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
- ; b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
- ;
- ; y[0] = SHIFT_ROUND ( a0 + b0 );
- ; y[1] = SHIFT_ROUND ( a1 + b1 );
- ; y[2] = SHIFT_ROUND ( a2 + b2 );
- ; y[3] = SHIFT_ROUND ( a3 + b3 );
- ; y[4] = SHIFT_ROUND ( a3 - b3 );
- ; y[5] = SHIFT_ROUND ( a2 - b2 );
- ; y[6] = SHIFT_ROUND ( a1 - b1 );
- ; y[7] = SHIFT_ROUND ( a0 - b0 );
- ; }
- ;
- ;-----------------------------------------------------------------------------
- ;
- ; In this implementation the outputs of the iDCT-1D are multiplied
- ; for rows 0,4 - on cos_4_16,
- ; for rows 1,7 - on cos_1_16,
- ; for rows 2,6 - on cos_2_16,
- ; for rows 3,5 - on cos_3_16
- ; and are shifted to the left for rise of accuracy
- ;
- ; For used constants
- ; FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
- ;
- ;-----------------------------------------------------------------------------
- ;-----------------------------------------------------------------------------
- ;
- ; The second stage - inverse DCTs of columns
- ;
- ; The inputs are multiplied
- ; for rows 0,4 - on cos_4_16,
- ; for rows 1,7 - on cos_1_16,
- ; for rows 2,6 - on cos_2_16,
- ; for rows 3,5 - on cos_3_16
- ; and are shifted to the left for rise of accuracy
- ;
- ;-----------------------------------------------------------------------------
- ;
- ; The 8-point scaled inverse DCT algorithm (26a8m)
- ;
- ;-----------------------------------------------------------------------------
- ;
- ; #define DCT_8_INV_COL(x, y)
- ; {
- ; short t0, t1, t2, t3, t4, t5, t6, t7;
- ; short tp03, tm03, tp12, tm12, tp65, tm65;
- ; short tp465, tm465, tp765, tm765;
- ;
- ; tp765 = x[1] + x[7] * tg_1_16;
- ; tp465 = x[1] * tg_1_16 - x[7];
- ; tm765 = x[5] * tg_3_16 + x[3];
- ; tm465 = x[5] - x[3] * tg_3_16;
- ;
- ; t7 = tp765 + tm765;
- ; tp65 = tp765 - tm765;
- ; t4 = tp465 + tm465;
- ; tm65 = tp465 - tm465;
- ;
- ; t6 = ( tp65 + tm65 ) * cos_4_16;
- ; t5 = ( tp65 - tm65 ) * cos_4_16;
- ;
- ; tp03 = x[0] + x[4];
- ; tp12 = x[0] - x[4];
- ;
- ; tm03 = x[2] + x[6] * tg_2_16;
- ; tm12 = x[2] * tg_2_16 - x[6];
- ;
- ; t0 = tp03 + tm03;
- ; t3 = tp03 - tm03;
- ; t1 = tp12 + tm12;
- ; t2 = tp12 - tm12;
- ;
- ; y[0] = SHIFT_ROUND ( t0 + t7 );
- ; y[7] = SHIFT_ROUND ( t0 - t7 );
- ; y[1] = SHIFT_ROUND ( t1 + t6 );
- ; y[6] = SHIFT_ROUND ( t1 - t6 );
- ; y[2] = SHIFT_ROUND ( t2 + t5 );
- ; y[5] = SHIFT_ROUND ( t2 - t5 );
- ; y[3] = SHIFT_ROUND ( t3 + t4 );
- ; y[4] = SHIFT_ROUND ( t3 - t4 );
- ; }
- ;
- ;-----------------------------------------------------------------------------
- */
- //xmm7 = round_inv_row
- #define DCT_8_INV_ROW __asm{
- __asm pshuflw xmm0, xmm0, 0xD8
- __asm pshufd xmm1, xmm0, 0
- __asm pmaddwd xmm1, [esi]
- __asm pshufd xmm3, xmm0, 0x55
- __asm pshufhw xmm0, xmm0, 0xD8
- __asm pmaddwd xmm3, [esi+32]
- __asm pshufd xmm2, xmm0, 0xAA
- __asm pshufd xmm0, xmm0, 0xFF
- __asm pmaddwd xmm2, [esi+16]
- __asm pshufhw xmm4, xmm4, 0xD8
- __asm paddd xmm1, M128_round_inv_row
- __asm pshuflw xmm4, xmm4, 0xD8
- __asm pmaddwd xmm0, [esi+48]
- __asm pshufd xmm5, xmm4, 0
- __asm pshufd xmm6, xmm4, 0xAA
- __asm pmaddwd xmm5, [ecx]
- __asm paddd xmm1, xmm2
- __asm movdqa xmm2, xmm1
- __asm pshufd xmm7, xmm4, 0x55
- __asm pmaddwd xmm6, [ecx+16]
- __asm paddd xmm0, xmm3
- __asm pshufd xmm4, xmm4, 0xFF
- __asm psubd xmm2, xmm0
- __asm pmaddwd xmm7, [ecx+32]
- __asm paddd xmm0, xmm1
- __asm psrad xmm2, 12
- __asm paddd xmm5, M128_round_inv_row
- __asm pmaddwd xmm4, [ecx+48]
- __asm paddd xmm5, xmm6
- __asm movdqa xmm6, xmm5
- __asm psrad xmm0, 12
- __asm pshufd xmm2, xmm2, 0x1B
- __asm packssdw xmm0, xmm2
- __asm paddd xmm4, xmm7
- __asm psubd xmm6, xmm4
- __asm paddd xmm4, xmm5
- __asm psrad xmm6, 12
- __asm psrad xmm4, 12
- __asm pshufd xmm6, xmm6, 0x1B
- __asm packssdw xmm4, xmm6
- }
- #define DCT_8_INV_COL_8 __asm{
- __asm movdqa xmm1, XMMWORD PTR M128_tg_3_16
- __asm movdqa xmm2, xmm0
- __asm movdqa xmm3, XMMWORD PTR [edx+3*16]
- __asm pmulhw xmm0, xmm1
- __asm pmulhw xmm1, xmm3
- __asm movdqa xmm5, XMMWORD PTR M128_tg_1_16
- __asm movdqa xmm6, xmm4
- __asm pmulhw xmm4, xmm5
- __asm paddsw xmm0, xmm2
- __asm pmulhw xmm5, [edx+1*16]
- __asm paddsw xmm1, xmm3
- __asm movdqa xmm7, XMMWORD PTR [edx+6*16]
- __asm paddsw xmm0, xmm3
- __asm movdqa xmm3, XMMWORD PTR M128_tg_2_16
- __asm psubsw xmm2, xmm1
- __asm pmulhw xmm7, xmm3
- __asm movdqa xmm1, xmm0
- __asm pmulhw xmm3, [edx+2*16]
- __asm psubsw xmm5, xmm6
- __asm paddsw xmm4, [edx+1*16]
- __asm paddsw xmm0, xmm4
- __asm paddsw xmm0, XMMWORD PTR M128_one_corr
- __asm psubsw xmm4, xmm1
- __asm movdqa xmm6, xmm5
- __asm psubsw xmm5, xmm2
- __asm paddsw xmm5, XMMWORD PTR M128_one_corr
- __asm paddsw xmm6, xmm2
- __asm movdqa [edx+7*16], xmm0
- __asm movdqa xmm1, xmm4
- __asm movdqa xmm0, XMMWORD PTR M128_cos_4_16
- __asm paddsw xmm4, xmm5
- __asm movdqa xmm2, XMMWORD PTR M128_cos_4_16
- __asm pmulhw xmm2, xmm4
- __asm movdqa [edx+3*16], xmm6
- __asm psubsw xmm1, xmm5
- __asm paddsw xmm7, [edx+2*16]
- __asm psubsw xmm3, [edx+6*16]
- __asm movdqa xmm6, [edx]
- __asm pmulhw xmm0, xmm1
- __asm movdqa xmm5, [edx+4*16]
- __asm paddsw xmm5, xmm6
- __asm psubsw xmm6, [edx+4*16]
- __asm paddsw xmm4, xmm2
- __asm por xmm4, XMMWORD PTR M128_one_corr
- __asm paddsw xmm0, xmm1
- __asm por xmm0, XMMWORD PTR M128_one_corr
- __asm movdqa xmm2, xmm5
- __asm paddsw xmm5, xmm7
- __asm movdqa xmm1, xmm6
- __asm paddsw xmm5, XMMWORD PTR M128_round_inv_col
- __asm psubsw xmm2, xmm7
- __asm movdqa xmm7, [edx+7*16]
- __asm paddsw xmm6, xmm3
- __asm paddsw xmm6, XMMWORD PTR M128_round_inv_col
- __asm paddsw xmm7, xmm5
- __asm psraw xmm7, SHIFT_INV_COL
- __asm psubsw xmm1, xmm3
- __asm paddsw xmm1, XMMWORD PTR M128_round_inv_corr
- __asm movdqa xmm3, xmm6
- __asm paddsw xmm2, XMMWORD PTR M128_round_inv_corr
- __asm paddsw xmm6, xmm4
- __asm movdqa [edx], xmm7
- __asm psraw xmm6, SHIFT_INV_COL
- __asm movdqa xmm7, xmm1
- __asm paddsw xmm1, xmm0
- __asm movdqa [edx+1*16], xmm6
- __asm psraw xmm1, SHIFT_INV_COL
- __asm movdqa xmm6, [edx+3*16]
- __asm psubsw xmm7, xmm0
- __asm psraw xmm7, SHIFT_INV_COL
- __asm movdqa [edx+2*16], xmm1
- __asm psubsw xmm5, [edx+7*16]
- __asm psraw xmm5, SHIFT_INV_COL
- __asm movdqa [edx+7*16], xmm5
- __asm psubsw xmm3, xmm4
- __asm paddsw xmm6, xmm2
- __asm psubsw xmm2, [edx+3*16]
- __asm psraw xmm6, SHIFT_INV_COL
- __asm psraw xmm2, SHIFT_INV_COL
- __asm movdqa [edx+3*16], xmm6
- __asm psraw xmm3, SHIFT_INV_COL
- __asm movdqa [edx+4*16], xmm2
- __asm movdqa [edx+5*16], xmm7
- __asm movdqa [edx+6*16], xmm3
- }
- //assumes src and destination are aligned on a 16-byte boundary
- static void idct_M128ASM(short* src)
- {
- ASSERT(((DWORD)src & 0xf) == 0); //aligned on 16-byte boundary
- __asm mov edx, src
- __asm movdqa xmm0, XMMWORD PTR[edx] //row 1
- __asm lea esi, M128_tab_i_04
- __asm movdqa xmm4, XMMWORD PTR[edx+16*2] //row 3
- __asm lea ecx, M128_tab_i_26
- DCT_8_INV_ROW; //Row 1, tab_i_04 and Row 3, tab_i_26
- __asm movdqa XMMWORD PTR[edx], xmm0
- __asm movdqa XMMWORD PTR[edx+16*2], xmm4
- __asm movdqa xmm0, XMMWORD PTR[edx+16*4] //row 5
- //__asm lea esi, M128_tab_i_04
- __asm movdqa xmm4, XMMWORD PTR[edx+16*6] //row 7
- //__asm lea ecx, M128_tab_i_26
- DCT_8_INV_ROW; //Row 5, tab_i_04 and Row 7, tab_i_26
- __asm movdqa XMMWORD PTR[edx+16*4], xmm0
- __asm movdqa XMMWORD PTR[edx+16*6], xmm4
- __asm movdqa xmm0, XMMWORD PTR[edx+16*3] //row 4
- __asm lea esi, M128_tab_i_35
- __asm movdqa xmm4, XMMWORD PTR[edx+16*1] //row 2
- __asm lea ecx, M128_tab_i_17
- DCT_8_INV_ROW; //Row 4, tab_i_35 and Row 2, tab_i_17
- __asm movdqa XMMWORD PTR[edx+16*3], xmm0
- __asm movdqa XMMWORD PTR[edx+16*1], xmm4
- __asm movdqa xmm0, XMMWORD PTR[edx+16*5] //row 6
- //__asm lea esi, M128_tab_i_35
- __asm movdqa xmm4, XMMWORD PTR[edx+16*7] //row 8
- //__asm lea ecx, M128_tab_i_17
- DCT_8_INV_ROW; //Row 6, tab_i_35 and Row 8, tab_i_17
- //__asm movdqa XMMWORD PTR[edx+80], xmm0
- //__asm movdqa xmm0, XMMWORD PTR [edx+80] /* 0 /* x5 */
- //__asm movdqa XMMWORD PTR[edx+16*7], xmm4
- //__asm movdqa xmm4, XMMWORD PTR [edx+7*16]/* 4 ; x7 */
- DCT_8_INV_COL_8
- // __asm emms
- }
- /////////////
- #define CLIP(x) (x < 0 ? 0 : x > 255 ? 255 : x)
- void mpeg2_idct_copy_sse2(int16_t* block, uint8_t* dest, const int stride)
- {
- idct_M128ASM(block);
- /*
- for(int i = 0; i < 8; i++)
- {
- dest[0] = CLIP(block[0]);
- dest[1] = CLIP(block[1]);
- dest[2] = CLIP(block[2]);
- dest[3] = CLIP(block[3]);
- dest[4] = CLIP(block[4]);
- dest[5] = CLIP(block[5]);
- dest[6] = CLIP(block[6]);
- dest[7] = CLIP(block[7]);
- memset(block, 0, sizeof(short)*8);
- dest += stride;
- block += 8;
- }
- */
- __asm
- {
- mov esi, block
- mov edi, dest
- mov edx, stride
- lea ecx, [edx+edx]
- movdqa xmm0, [esi+16*0]
- movdqa xmm1, [esi+16*1]
- movdqa xmm2, [esi+16*2]
- movdqa xmm3, [esi+16*3]
- movdqa xmm4, [esi+16*4]
- movdqa xmm5, [esi+16*5]
- movdqa xmm6, [esi+16*6]
- movdqa xmm7, [esi+16*7]
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- packuswb xmm4, xmm5
- packuswb xmm6, xmm7
- movlps [edi], xmm0
- movhps [edi+edx], xmm0
- add edi, ecx
- movlps [edi], xmm2
- movhps [edi+edx], xmm2
- add edi, ecx
- movlps [edi], xmm4
- movhps [edi+edx], xmm4
- add edi, ecx
- movlps [edi], xmm6
- movhps [edi+edx], xmm6
- xorps xmm7, xmm7
- movdqa [esi+16*0], xmm7
- movdqa [esi+16*1], xmm7
- movdqa [esi+16*2], xmm7
- movdqa [esi+16*3], xmm7
- movdqa [esi+16*4], xmm7
- movdqa [esi+16*5], xmm7
- movdqa [esi+16*6], xmm7
- movdqa [esi+16*7], xmm7
- }
- }
- void mpeg2_idct_add_sse2(const int last, int16_t* block, uint8_t* dest, const int stride)
- {
- idct_M128ASM(block);
- /*
- for(int i = 0; i < 8; i++)
- {
- dest[0] = CLIP(block[0] + dest[0]);
- dest[1] = CLIP(block[1] + dest[1]);
- dest[2] = CLIP(block[2] + dest[2]);
- dest[3] = CLIP(block[3] + dest[3]);
- dest[4] = CLIP(block[4] + dest[4]);
- dest[5] = CLIP(block[5] + dest[5]);
- dest[6] = CLIP(block[6] + dest[6]);
- dest[7] = CLIP(block[7] + dest[7]);
- memset(block, 0, sizeof(short)*8);
- dest += stride;
- block += 8;
- }
- */
- __asm
- {
- mov esi, block
- mov edi, dest
- mov ecx, 4
- mov edx, stride
- xorps xmm7, xmm7
- mpeg2_idct_add_sse2_loop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+16]
- movlps xmm2, [edi]
- punpcklbw xmm2, xmm7
- paddsw xmm0, xmm2
- movlps xmm2, [edi+edx]
- punpcklbw xmm2, xmm7
- paddsw xmm1, xmm2
- packuswb xmm0, xmm1
- movdqa [esi], xmm7
- movdqa [esi+16], xmm7
- movlps [edi], xmm0
- movhps [edi+edx], xmm0
- lea esi, [esi+16*2]
- lea edi, [edi+edx*2]
- dec ecx
- jnz mpeg2_idct_add_sse2_loop
- }
- }
- void mpeg2_idct_init_sse2()
- {
- }