mpeg/mp3

开发平台：
Visual C++

block.c：源码内容
							/*****************************************************************************
*
*  T264 AVC CODEC
*
*  Copyright(C) 2004-2005 llcc <lcgate1@yahoo.com.cn>
*               2004-2005 visionany <visionany@yahoo.com.cn>
*   2005.2.24 CloudWu<sywu@sohu.com>	added B-frame MB16x16 support
*   2005.3.2 CloudWu<sywu@sohu.com>	added B-frame MB16x8, MB8x16 and MB8x8 support
*
*  This program is free software ; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation ; either version 2 of the License, or
*  (at your option) any later version.
*
*  This program is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY ; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  You should have received a copy of the GNU General Public License
*  along with this program ; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
*
****************************************************************************/
#include "stdio.h"
#include "T264.h"
#include "utility.h"
#ifndef CHIP_DM642
#include "memory.h"
#endif
#include "assert.h"
#include "block.h"
/* intra */
/*
 * Build the 16x16 luma intra prediction for the current macroblock.
 *
 *   t    - decoder context; mb.mb_neighbour, edged_stride and the
 *          pred16x16[] function table are read.
 *   mode - requested Intra_16x16_* mode.  A DC request is narrowed to
 *          DC / DCLEFT / DCTOP / DC128 depending on which of the
 *          MB_LEFT / MB_TOP flags are set in mb.mb_neighbour.
 *   pred - destination handed to the predictor (second predictor argument
 *          is 16, presumably the prediction stride).
 *   src  - top-left pixel of the macroblock inside the edged frame.
 *
 * Only the border pixels the selected mode needs are copied into the
 * local caches; the plane mode additionally loads index -1 (the top-left
 * corner) into both caches.
 */
static void __inline
T264dec_mb_decode_predict_i16x16_y(T264_t* t, uint8_t mode, uint8_t* pred, uint8_t* src)
{
    DECLARE_ALIGNED_MATRIX(topcache, 1, 16 + CACHE_SIZE, uint8_t, CACHE_SIZE);
    DECLARE_ALIGNED_MATRIX(leftcache, 1, 16 + CACHE_SIZE, uint8_t, CACHE_SIZE);
    uint8_t* top  = &topcache[CACHE_SIZE];
    uint8_t* left = &leftcache[CACHE_SIZE];
    uint8_t* border;
    int32_t want_top  = 0;   /* copy the row above the macroblock    */
    int32_t want_left = 0;   /* copy the column left of the macroblock */
    int32_t first     = 0;   /* first cache index to fill: 0 or -1   */
    int32_t k;

    /* Step 1: decide which borders are needed and settle the final mode. */
    if (mode == Intra_16x16_DC)
    {
        int32_t avail = t->mb.mb_neighbour & (MB_LEFT | MB_TOP);

        if (avail == (MB_LEFT | MB_TOP))
        {
            want_top  = 1;
            want_left = 1;
        }
        else if (avail & MB_LEFT)
        {
            mode = Intra_16x16_DCLEFT;
            want_left = 1;
        }
        else if (avail & MB_TOP)
        {
            mode = Intra_16x16_DCTOP;
            want_top = 1;
        }
        else
        {
            mode = Intra_16x16_DC128;
        }
    }
    else if (mode == Intra_16x16_TOP)
    {
        want_top = 1;
    }
    else if (mode == Intra_16x16_LEFT)
    {
        want_left = 1;
    }
    else if (mode == Intra_16x16_PLANE)
    {
        want_top  = 1;
        want_left = 1;
        first     = -1;
    }
    else
    {
        assert(0);
    }

    /* Step 2: copy the requested borders out of the frame. */
    if (want_top)
    {
        border = src - t->edged_stride;
        for (k = first; k < 16; k++)
        {
            top[k] = border[k];
        }
    }
    if (want_left)
    {
        border = src - 1;
        if (first < 0)
        {
            /* start one row higher so that left[-1] is the corner pixel */
            border -= t->edged_stride;
        }
        for (k = first; k < 16; k++)
        {
            left[k] = *border;
            border += t->edged_stride;
        }
    }

    t->pred16x16[mode](pred, 16, top, left);
}
/*
 * Build the 4x4 luma intra prediction for one sub-block.
 *
 *   t    - decoder context; mb.mb_neighbour, mb.mb_x, mb_width,
 *          edged_stride and the pred4x4[] function table are read.
 *   idx  - sub-block number 0..15 used to index the two availability
 *          tables below (the caller in this file passes row * 4 + col).
 *   mode - requested Intra_4x4_* mode; DC is narrowed to
 *          DC / DCLEFT / DCTOP / DC128 from the adjusted neighbour flags.
 *   pred - destination handed to the predictor (second predictor argument
 *          is 4, presumably the prediction stride).
 *   src  - top-left pixel of the sub-block inside the edged frame.
 */
static void __inline
T264dec_mb_decode_predict_i4x4_y(T264_t* t, uint8_t idx, uint8_t mode, uint8_t* pred, uint8_t* src)
{
    DECLARE_ALIGNED_MATRIX(topcache,  8 + CACHE_SIZE, 1, uint8_t, CACHE_SIZE);
    DECLARE_ALIGNED_MATRIX(leftcache, 4 + CACHE_SIZE, 1, uint8_t, CACHE_SIZE);
    /* Flags OR-ed into the macroblock neighbour flags for each sub-block. */
    static const int32_t force_set[] =
    {
        0,                    MB_LEFT,                        MB_LEFT,                        MB_LEFT,
        MB_TOP | MB_TOPRIGHT, MB_LEFT | MB_TOP,               MB_LEFT | MB_TOP | MB_TOPRIGHT, MB_LEFT | MB_TOP,
        MB_TOP | MB_TOPRIGHT, MB_LEFT | MB_TOP | MB_TOPRIGHT, MB_LEFT | MB_TOP | MB_TOPRIGHT, MB_LEFT | MB_TOP,
        MB_TOP | MB_TOPRIGHT, MB_LEFT | MB_TOP,               MB_LEFT | MB_TOP | MB_TOPRIGHT, MB_LEFT | MB_TOP
    };
    /* Mask AND-ed afterwards; clears MB_TOPRIGHT for selected sub-blocks. */
    static const int32_t force_mask[] =
    {
        ~0, ~0,           ~0, ~0,
        ~0, ~MB_TOPRIGHT, ~0, ~MB_TOPRIGHT,
        ~0, ~0,           ~0, ~MB_TOPRIGHT,
        ~0, ~MB_TOPRIGHT, ~0, ~MB_TOPRIGHT
    };
    uint8_t* top  = &topcache[CACHE_SIZE];
    uint8_t* left = &leftcache[CACHE_SIZE];
    uint8_t* border;
    int32_t avail;
    int32_t top_count = 0;   /* pixels to copy from the row above: 0, 4 or 8 */
    int32_t pad_top   = 0;   /* replicate pixel 3 into top[4..7]             */
    int32_t want_left = 0;   /* copy the column left of the block            */
    int32_t first     = 0;   /* first cache index to fill: 0 or -1           */
    int32_t k;

    /* Step 1: decide which borders are needed and settle the final mode. */
    if (mode == Intra_4x4_DC)
    {
        avail = (t->mb.mb_neighbour | force_set[idx]) & force_mask[idx];

        if ((avail & (MB_LEFT | MB_TOP)) == (MB_LEFT | MB_TOP))
        {
            top_count = 4;
            want_left = 1;
        }
        else if (avail & MB_LEFT)
        {
            mode = Intra_4x4_DCLEFT;
            want_left = 1;
        }
        else if (avail & MB_TOP)
        {
            mode = Intra_4x4_DCTOP;
            top_count = 4;
        }
        else
        {
            mode = Intra_4x4_DC128;
        }
    }
    else
    {
        switch (mode)
        {
        case Intra_4x4_TOP:
            top_count = 4;
            break;

        case Intra_4x4_LEFT:
        case Intra_4x4_HORIZONTAL_UP:
            want_left = 1;
            break;

        case Intra_4x4_DIAGONAL_DOWNLEFT:
        case Intra_4x4_VERTICAL_LEFT:
            avail = (t->mb.mb_neighbour | force_set[idx]) & force_mask[idx];
            /* A sub-block in the right-most column of the last macroblock
               of a row has no top-right neighbour. */
            if ((idx & 3) == 3 && t->mb.mb_x == t->mb_width - 1)
            {
                avail &= ~MB_TOPRIGHT;
            }
            if (avail & MB_TOPRIGHT)
            {
                top_count = 8;
            }
            else
            {
                top_count = 4;
                pad_top   = 1;
            }
            break;

        case Intra_4x4_DIAGONAL_DOWNRIGHT:
        case Intra_4x4_VERTICAL_RIGHT:
        case Intra_4x4_HORIZONTAL_DOWN:
            top_count = 4;
            want_left = 1;
            first     = -1;
            break;

        default:
            assert(0);
            break;
        }
    }

    /* Step 2: copy the requested borders out of the frame. */
    if (top_count)
    {
        border = src - t->edged_stride;
        for (k = first; k < top_count; k++)
        {
            top[k] = border[k];
        }
        if (pad_top)
        {
            for (k = 4; k < 8; k++)
            {
                top[k] = border[3];
            }
        }
    }
    if (want_left)
    {
        border = src - 1;
        if (first < 0)
        {
            /* start one row higher so that left[-1] is the corner pixel */
            border -= t->edged_stride;
        }
        for (k = first; k < 4; k++)
        {
            left[k] = *border;
            border += t->edged_stride;
        }
    }

    t->pred4x4[mode](pred, 4, top, left);
}
/*
 * Build the 8x8 intra prediction for both chroma planes of the current
 * macroblock.  Despite the "_y" suffix, this routine works on
 * t->mb.src_u and t->mb.src_v.
 *
 *   t      - decoder context; mb.mb_neighbour, mb.src_u, mb.src_v,
 *            edged_stride_uv and the pred8x8[] function table are read.
 *   mode   - requested Intra_8x8_* mode; DC is narrowed to
 *            DC / DCLEFT / DCTOP / DC128 from the MB_LEFT / MB_TOP flags.
 *   pred_u - destination for the U prediction.
 *   pred_v - destination for the V prediction.
 *
 * The same final mode is applied to both planes.
 */
static void __inline
T264dec_mb_decode_predict_i8x8_y(T264_t* t, uint8_t mode, uint8_t* pred_u, uint8_t* pred_v)
{
    DECLARE_ALIGNED_MATRIX(topcacheu, 1, 8 + CACHE_SIZE, uint8_t, CACHE_SIZE);
    DECLARE_ALIGNED_MATRIX(leftcacheu, 1, 8 + CACHE_SIZE, uint8_t, CACHE_SIZE);
    DECLARE_ALIGNED_MATRIX(topcachev, 1, 8 + CACHE_SIZE, uint8_t, CACHE_SIZE);
    DECLARE_ALIGNED_MATRIX(leftcachev, 1, 8 + CACHE_SIZE, uint8_t, CACHE_SIZE);
    uint8_t* top_u  = &topcacheu[CACHE_SIZE];
    uint8_t* top_v  = &topcachev[CACHE_SIZE];
    uint8_t* left_u = &leftcacheu[CACHE_SIZE];
    uint8_t* left_v = &leftcachev[CACHE_SIZE];
    uint8_t* edge_u;
    uint8_t* edge_v;
    int32_t want_top  = 0;   /* copy the row above each plane        */
    int32_t want_left = 0;   /* copy the column left of each plane   */
    int32_t first     = 0;   /* first cache index to fill: 0 or -1   */
    int32_t k;

    /* Step 1: decide which borders are needed and settle the final mode. */
    if (mode == Intra_8x8_DC)
    {
        int32_t avail = t->mb.mb_neighbour & (MB_LEFT | MB_TOP);

        if (avail == (MB_LEFT | MB_TOP))
        {
            want_top  = 1;
            want_left = 1;
        }
        else if (avail & MB_LEFT)
        {
            mode = Intra_8x8_DCLEFT;
            want_left = 1;
        }
        else if (avail & MB_TOP)
        {
            mode = Intra_8x8_DCTOP;
            want_top = 1;
        }
        else
        {
            mode = Intra_8x8_DC128;
        }
    }
    else if (mode == Intra_8x8_TOP)
    {
        want_top = 1;
    }
    else if (mode == Intra_8x8_LEFT)
    {
        want_left = 1;
    }
    else if (mode == Intra_8x8_PLANE)
    {
        want_top  = 1;
        want_left = 1;
        first     = -1;
    }
    else
    {
        assert(0);
    }

    /* Step 2: copy the requested borders of both planes. */
    if (want_top)
    {
        edge_u = t->mb.src_u - t->edged_stride_uv;
        edge_v = t->mb.src_v - t->edged_stride_uv;
        for (k = first; k < 8; k++)
        {
            top_u[k] = edge_u[k];
            top_v[k] = edge_v[k];
        }
    }
    if (want_left)
    {
        edge_u = t->mb.src_u - 1;
        edge_v = t->mb.src_v - 1;
        if (first < 0)
        {
            /* start one row higher so that index -1 is the corner pixel */
            edge_u -= t->edged_stride_uv;
            edge_v -= t->edged_stride_uv;
        }
        for (k = first; k < 8; k++)
        {
            left_u[k] = *edge_u;
            left_v[k] = *edge_v;
            edge_u += t->edged_stride_uv;
            edge_v += t->edged_stride_uv;
        }
    }

    t->pred8x8[mode](pred_u, 8, top_u, left_u);
    t->pred8x8[mode](pred_v, 8, top_v, left_v);
}
/*
 * Reconstruct the luma of an Intra 16x16 macroblock.
 *
 * The scratch matrix holds 16 blocks of 16 coefficients followed by one
 * extra block (at offset 256) used for the 4x4 DC coefficients.  After the
 * DC block has been unscanned, dequantised and inverse-transformed, each
 * of its 16 values replaces coefficient 0 of the matching residual block
 * before that block is inverse-transformed.  The result is finally passed,
 * together with the prediction, to contract16to8add (presumably residual +
 * prediction, stored at mb.src_y).
 */
static void __inline
T264dec_mb_decode_i16x16_y(T264_t* t)
{
    DECLARE_ALIGNED_MATRIX(dct, 1+16, 16, int16_t, CACHE_SIZE);

    int16_t* const dc  = dct + 256;       /* DC block lives after the 16 AC blocks */
    uint8_t* const dst = t->mb.src_y;
    const int32_t quant = t->qp_y;
    int32_t blk;

    T264dec_mb_decode_predict_i16x16_y(t, t->mb.mode_i16x16, t->mb.pred_i16x16, dst);

    /* DC coefficients first: they feed every residual block below. */
    unscan_zig_4x4(t->mb.dc4x4_z, dc);
    t->iquant4x4dc(dc, quant);
    t->idct4x4dc(dc);

    for (blk = 0; blk < 16; blk++)
    {
        int16_t* coef = dct + blk * 16;

        unscan_zig_4x4(t->mb.dct_y_z[luma_index[blk]], coef);
        t->iquant4x4(coef, quant);
        coef[0] = dc[blk];
        t->idct4x4(coef);
    }

    t->contract16to8add(dct, 16 / 4, 16 / 4, t->mb.pred_i16x16, dst, t->edged_stride);
}
/*
 * Reconstruct the luma of an Intra 4x4 macroblock.
 *
 * The 16 sub-blocks are visited row by row (blk = row * 4 + col).  For
 * each one the prediction is built from the already written frame pixels,
 * the residual is unscanned, dequantised and inverse-transformed, and both
 * are handed to contract16to8add with the sub-block's frame address as
 * destination.  Mode and coefficients are looked up through luma_index[].
 */
static void __inline
T264dec_mb_decode_i4x4_y(T264_t* t)
{
    DECLARE_ALIGNED_MATRIX(pred, 4, 5, uint8_t, CACHE_SIZE);
    DECLARE_ALIGNED_MATRIX(dct, 1, 16, int16_t, 16);
    const int32_t quant = t->qp_y;
    int32_t blk;

    for (blk = 0; blk < 16; blk++)
    {
        /* blk >> 2 is the sub-block row, blk & 3 the column; 4 pixels each */
        uint8_t* dst = t->mb.src_y
                     + ((blk >> 2) * t->edged_stride << 2)
                     + ((blk & 3) << 2);

        T264dec_mb_decode_predict_i4x4_y(t, blk, t->mb.mode_i4x4[luma_index[blk]], pred, dst);

        unscan_zig_4x4(t->mb.dct_y_z[luma_index[blk]], dct);
        t->iquant4x4(dct, quant);
        t->idct4x4(dct);

        t->contract16to8add(dct, 4 / 4, 4 / 4, pred, dst, t->edged_stride);
    }
}
/*
 * Decode the luma of an intra macroblock: I_4x4 macroblocks go through the
 * per-sub-block path, every other mode through the 16x16 path.
 */
void
T264dec_mb_decode_intra_y(T264_t* t)
{
    if (t->mb.mb_mode != I_4x4)
    {
        T264dec_mb_decode_i16x16_y(t);
        return;
    }

    T264dec_mb_decode_i4x4_y(t);
}
/*
 * Reconstruct both chroma planes of the current macroblock from their
 * coefficients and the supplied predictions.
 *
 *   t      - decoder context; qp_uv, mb.dc2x2_z, mb.dct_uv_z, mb.src_u,
 *            mb.src_v and edged_stride_uv are read.
 *   pred_u - prediction for the U plane.
 *   pred_v - prediction for the V plane.
 *
 * The scratch matrix holds four blocks of 16 coefficients followed (at
 * offset 64) by the 2x2 DC values; each DC value replaces coefficient 0 of
 * its residual block before the inverse transform.  Plane 0 is U, plane 1
 * is V.
 */
void
T264dec_mb_decode_uv(T264_t* t, uint8_t* pred_u, uint8_t* pred_v)
{
    DECLARE_ALIGNED_MATRIX(dct, 10, 8, int16_t, CACHE_SIZE);
    int16_t* const dc = dct + 64;          /* 2x2 DC values live after the AC blocks */
    const int32_t quant = t->qp_uv;
    int32_t plane;
    int32_t blk;

    for (plane = 0; plane < 2; plane++)
    {
        uint8_t* pred = plane ? pred_v : pred_u;
        uint8_t* dst  = plane ? t->mb.src_v : t->mb.src_u;

        unscan_zig_2x2(t->mb.dc2x2_z[plane], dc);
        t->iquant2x2dc(dc, quant);
        t->idct2x2dc(dc);

        for (blk = 0; blk < 4; blk++)
        {
            int16_t* coef = dct + blk * 16;

            unscan_zig_4x4(t->mb.dct_uv_z[plane][blk], coef);
            t->iquant4x4(coef, quant);
            coef[0] = dc[blk];
            t->idct4x4(coef);
        }

        t->contract16to8add(dct, 8 / 4, 8 / 4, pred, dst, t->edged_stride_uv);
    }
}
/*
 * Decode the chroma of an intra macroblock: build the U/V predictions for
 * mode mb.mb_mode_uv into mb.pred_i8x8u / mb.pred_i8x8v, then reconstruct
 * both planes from those predictions.
 */
void
T264dec_mb_decode_intra_uv(T264_t* t)
{
    uint8_t* pred_u = t->mb.pred_i8x8u;
    uint8_t* pred_v = t->mb.pred_i8x8v;

    T264dec_mb_decode_predict_i8x8_y(t, t->mb.mb_mode_uv, pred_u, pred_v);
    T264dec_mb_decode_uv(t, pred_u, pred_v);
}
void
T264dec_mb_decode_interp_mc(T264_t* t, uint8_t* ref)
{
    T264_vector_t vec;
    uint8_t* tmp;
    int32_t x, y;
    int32_t i;
    int32_t list_index = 0;
    static const int8_t index[4][4][6] = 
    {
        {{0, 0, 0, 0, 0, 0}, {0, 1, 0, 0, 0, 0}, {1, 1, 0, 0, 0, 0}, {1, 0, 0, 0, 1, 0}},
        {{0, 2, 0, 0, 0, 0}, {1, 2, 0, 0, 0, 0}, {1, 3, 0, 0, 0, 0}, {1, 2, 0, 0, 1, 0}},
        {{2, 2, 0, 0, 0, 0}, {2, 3, 0, 0, 0, 0}, {3, 3, 0, 0, 0, 0}, {3, 2, 0, 0, 1, 0}},
        {{2, 0, 0, 0, 0, 1}, {2, 1, 0, 0, 0, 1}, {3, 1, 0, 0, 0, 1}, {1, 2, 0, 1, 1, 0}}
    };
    switch(t->mb.mb_part)
    {
    case MB_16x16:
        vec = t->mb.vec[0][0];
        x = (vec.x & 3);
        y = (vec.y & 3);
        if (index[y][x][0] == index[y][x][1])
        {
            tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec.y >> 2)) * t->edged_stride + 
                ((t->mb.mb_x << 4) + (vec.x >> 2));
            t->memcpy_stride_u(tmp, 16, 16, t->edged_stride, ref, 16);
        }
        else
        {
            t->pia[MB_16x16](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2], 
                t->ref[list_index][vec.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4],
                t->edged_stride, t->edged_stride, ref, 16);
        }
        break;
    case MB_16x8:
        vec = t->mb.vec[0][0];
        x = (vec.x & 3);
        y = (vec.y & 3);
        if (index[y][x][0] == index[y][x][1])
        {
            tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec.y >> 2)) * t->edged_stride + 
                ((t->mb.mb_x << 4) + (vec.x >> 2));
            t->memcpy_stride_u(tmp, 16, 8, t->edged_stride, ref, 16);
        }
        else
        {
            t->pia[MB_16x8](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2], 
                t->ref[list_index][vec.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4],
                t->edged_stride, t->edged_stride, ref, 16);
        }
        vec = t->mb.vec[0][8];
        x = (vec.x & 3);
        y = (vec.y & 3);
        if (index[y][x][0] == index[y][x][1])
        {
            tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec.y >> 2) + 8) * t->edged_stride + 
                ((t->mb.mb_x << 4) + (vec.x >> 2));
            t->memcpy_stride_u(tmp, 16, 8, t->edged_stride, ref + 16 * 8, 16);
        }
        else
        {
            t->pia[MB_16x8](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3] + 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2], 
                t->ref[list_index][vec.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5] + 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4],
                t->edged_stride, t->edged_stride, ref + 16 * 8, 16);
        }
        break;
    case MB_8x16:
        vec = t->mb.vec[0][0];
        x = (vec.x & 3);
        y = (vec.y & 3);
        if (index[y][x][0] == index[y][x][1])
        {
            tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec.y >> 2)) * t->edged_stride + 
                ((t->mb.mb_x << 4) + (vec.x >> 2));
            t->memcpy_stride_u(tmp, 8, 16, t->edged_stride, ref, 16);
        }
        else
        {
            t->pia[MB_8x16](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2], 
                t->ref[list_index][vec.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4],
                t->edged_stride, t->edged_stride, ref, 16);
        }
        vec = t->mb.vec[0][2];
        x = (vec.x & 3);
        y = (vec.y & 3);
        if (index[y][x][0] == index[y][x][1])
        {
            tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec.y >> 2)) * t->edged_stride + 
                ((t->mb.mb_x << 4) + (vec.x >> 2)) + 8;
            t->memcpy_stride_u(tmp, 8, 16, t->edged_stride, ref + 8, 16);
        }
        else
        {
            t->pia[MB_8x16](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2] + 8, 
                t->ref[list_index][vec.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4] + 8,
                t->edged_stride, t->edged_stride, ref + 8, 16);
        }
        break;
    case MB_8x8:
    case MB_8x8ref0:
        for(i = 0 ; i < 4 ; i ++)
        {
            int32_t offset1, offset2;
            switch(t->mb.submb_part[luma_index[4 * i]]) 
            {
            case MB_8x8:
                vec = t->mb.vec[0][luma_index[4 * i]];
                x = (vec.x & 3);
                y = (vec.y & 3);
                if (index[y][x][0] == index[y][x][1])
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + i / 2 * 8) * t->edged_stride + ((t->mb.mb_x << 4) + (vec.x >> 2)) + i % 2 * 8;
                    tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1;
                    t->memcpy_stride_u(tmp, 8, 8, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8, 16);
                }
                else
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2] + i % 2 * 8;
                    offset2 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4] + i % 2 * 8;
                    t->pia[MB_8x8](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1, 
                        t->ref[list_index][vec.refno]->Y[index[y][x][1]] + offset2,
                        t->edged_stride, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8, 
                        16);
                }
                break;
            case MB_8x4:
                vec = t->mb.vec[0][luma_index[4 * i]];
                x = (vec.x & 3);
                y = (vec.y & 3);
                if (index[y][x][0] == index[y][x][1])
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + i / 2 * 8) * t->edged_stride + ((t->mb.mb_x << 4) + (vec.x >> 2)) + i % 2 * 8;
                    tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1;
                    t->memcpy_stride_u(tmp, 8, 4, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8, 16);
                }
                else
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2] + i % 2 * 8;
                    offset2 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4] + i % 2 * 8;
                    t->pia[MB_8x4](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1, 
                        t->ref[list_index][vec.refno]->Y[index[y][x][1]] + offset2,
                        t->edged_stride, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8, 
                        16);
                }
                vec = t->mb.vec[0][luma_index[4 * i + 2]];
                x = (vec.x & 3);
                y = (vec.y & 3);
                if (index[y][x][0] == index[y][x][1])
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + i / 2 * 8 + 4) * t->edged_stride + ((t->mb.mb_x << 4) + (vec.x >> 2)) + i % 2 * 8;
                    tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1;
                    t->memcpy_stride_u(tmp, 8, 4, t->edged_stride, ref + i / 2  * 16 * 8 + 64 + i % 2 * 8, 16);
                }
                else
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3] + i / 2 * 8 + 4) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2] + i % 2 * 8;
                    offset2 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5] + i / 2 * 8 + 4) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4] + i % 2 * 8;
                    t->pia[MB_8x4](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1, 
                        t->ref[list_index][vec.refno]->Y[index[y][x][1]] + offset2,
                        t->edged_stride, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8 + 64, 
                        16);
                }
                break;
            case MB_4x8:
                vec = t->mb.vec[0][luma_index[4 * i]];
                x = (vec.x & 3);
                y = (vec.y & 3);
                if (index[y][x][0] == index[y][x][1])
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + i / 2 * 8) * t->edged_stride + ((t->mb.mb_x << 4) + (vec.x >> 2)) + i % 2 * 8;
                    tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1;
                    t->memcpy_stride_u(tmp, 4, 8, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8, 16);
                }
                else
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2] + i % 2 * 8;
                    offset2 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4] + i % 2 * 8;
                    t->pia[MB_4x8](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1, 
                        t->ref[list_index][vec.refno]->Y[index[y][x][1]] + offset2,
                        t->edged_stride, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8, 
                        16);
                }
                vec = t->mb.vec[0][luma_index[4 * i + 1]];
                x = (vec.x & 3);
                y = (vec.y & 3);
                if (index[y][x][0] == index[y][x][1])
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + i / 2 * 8) * t->edged_stride + ((t->mb.mb_x << 4) + (vec.x >> 2)) + i % 2 * 8 + 4;
                    tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1;
                    t->memcpy_stride_u(tmp, 4, 8, t->edged_stride, ref + i / 2  * 16 * 8 + i % 2 * 8 + 4, 16);
                }
                else
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2] + i % 2 * 8 + 4;
                    offset2 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4] + i % 2 * 8 + 4;
                    t->pia[MB_4x8](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1, 
                        t->ref[list_index][vec.refno]->Y[index[y][x][1]] + offset2,
                        t->edged_stride, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8 + 4, 
                        16);
                }
                break;
            case MB_4x4:
                vec = t->mb.vec[0][luma_index[4 * i]];
                x = (vec.x & 3);
                y = (vec.y & 3);
                if (index[y][x][0] == index[y][x][1])
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + i / 2 * 8) * t->edged_stride + ((t->mb.mb_x << 4) + (vec.x >> 2)) + i % 2 * 8;
                    tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1;
                    t->memcpy_stride_u(tmp, 4, 4, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8, 16);
                }
                else
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2] + i % 2 * 8;
                    offset2 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4] + i % 2 * 8;
                    t->pia[MB_4x4](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1, 
                        t->ref[list_index][vec.refno]->Y[index[y][x][1]] + offset2,
                        t->edged_stride, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8, 
                        16);
                }
                vec = t->mb.vec[0][luma_index[4 * i + 1]];
                x = (vec.x & 3);
                y = (vec.y & 3);
                if (index[y][x][0] == index[y][x][1])
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + i / 2 * 8) * t->edged_stride + ((t->mb.mb_x << 4) + (vec.x >> 2)) + i % 2 * 8 + 4;
                    tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1;
                    t->memcpy_stride_u(tmp, 4, 4, t->edged_stride, ref + i / 2  * 16 * 8 + i % 2 * 8 + 4, 16);
                }
                else
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2] + i % 2 * 8 + 4;
                    offset2 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4] + i % 2 * 8 + 4;
                    t->pia[MB_4x4](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1, 
                        t->ref[list_index][vec.refno]->Y[index[y][x][1]] + offset2,
                        t->edged_stride, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8 + 4, 
                        16);
                }
                vec = t->mb.vec[0][luma_index[4 * i + 2]];
                x = (vec.x & 3);
                y = (vec.y & 3);
                if (index[y][x][0] == index[y][x][1])
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + i / 2 * 8 + 4) * t->edged_stride + ((t->mb.mb_x << 4) + (vec.x >> 2)) + i % 2 * 8;
                    tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1;
                    t->memcpy_stride_u(tmp, 4, 4, t->edged_stride, ref + i / 2  * 16 * 8 + 64 + i % 2 * 8, 16);
                }
                else
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3] + i / 2 * 8 + 4) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2] + i % 2 * 8;
                    offset2 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5] + i / 2 * 8 + 4) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4] + i % 2 * 8;
                    t->pia[MB_4x4](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1, 
                        t->ref[list_index][vec.refno]->Y[index[y][x][1]] + offset2,
                        t->edged_stride, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8 + 64, 
                        16);
                }
                vec = t->mb.vec[0][luma_index[4 * i + 3]];
                x = (vec.x & 3);
                y = (vec.y & 3);
                if (index[y][x][0] == index[y][x][1])
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + i / 2 * 8 + 4) * t->edged_stride + ((t->mb.mb_x << 4) + (vec.x >> 2)) + i % 2 * 8 + 4;
                    tmp = t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1;
                    t->memcpy_stride_u(tmp, 4, 4, t->edged_stride, ref + i / 2  * 16 * 8 + 64 + i % 2 * 8 + 4, 16);
                }
                else
                {
                    offset1 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][3] + i / 2 * 8 + 4) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][2] + i % 2 * 8 + 4;
                    offset2 = ((t->mb.mb_y << 4) + (vec.y >> 2) + index[y][x][5] + i / 2 * 8 + 4) * t->edged_stride + (t->mb.mb_x << 4) + (vec.x >> 2) + index[y][x][4] + i % 2 * 8 + 4;
                    t->pia[MB_4x4](t->ref[list_index][vec.refno]->Y[index[y][x][0]] + offset1, 
                        t->ref[list_index][vec.refno]->Y[index[y][x][1]] + offset2,
                        t->edged_stride, t->edged_stride, ref + i / 2 * 16 * 8 + i % 2 * 8 + 64 + 4, 
                        16);
                }
                break;
            }
        }
        break;
    default:
        assert(0);
        break;
    }
}
/*
 * Rebuild the luma residual of an inter macroblock and combine it with the
 * prediction.
 *
 *   t   - decoder context; mb.dct_y_z, qp_y, mb.src_y and edged_stride
 *         are read.
 *   ref - 16x16 luma prediction of the macroblock.
 *
 * Each of the 16 coefficient blocks (looked up through luma_index[]) is
 * unscanned, dequantised and inverse-transformed into its own 16-entry
 * slot of the scratch matrix, which is then handed to contract16to8add
 * with mb.src_y as destination.
 */
void
T264dec_mb_decode_interp_transform(T264_t* t, uint8_t* ref)
{
    DECLARE_ALIGNED_MATRIX(dct, 16, 16, int16_t, 16);
 
    int32_t blk;

    for (blk = 0; blk < 16; blk++)
    {
        int16_t* coef = dct + (blk << 4);

        unscan_zig_4x4(t->mb.dct_y_z[luma_index[blk]], coef);
        t->iquant4x4(coef, t->qp_y);
        t->idct4x4(coef);
    }

    t->contract16to8add(dct, 16 / 4, 16 / 4, ref, t->mb.src_y, t->edged_stride);
}
/*
 * Decode the luma of an inter macroblock: motion-compensate into
 * mb.pred_p16x16, then add the decoded residual on top of it.
 */
void 
T264dec_mb_decode_interp_y(T264_t* t)
{
    uint8_t* pred = t->mb.pred_p16x16;

    T264dec_mb_decode_interp_mc(t, pred);
    T264dec_mb_decode_interp_transform(t, pred);
}
void 
T264dec_mb_decode_interp_uv(T264_t* t)
{
    DECLARE_ALIGNED_MATRIX(pred_u, 8, 8, uint8_t, CACHE_SIZE);
    DECLARE_ALIGNED_MATRIX(pred_v, 8, 8, uint8_t, CACHE_SIZE);
    T264_vector_t vec;
    uint8_t* src, *dst;
    uint8_t* src_u, *dst_u;
    int32_t i;
    int32_t list_index = 0;
    switch (t->mb.mb_part)
    {
    case MB_16x16:
        vec = t->mb.vec[0][0];
        src = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3);
        dst = pred_u;
        t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 8, 8);
        src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3);
        dst = pred_v;
        t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 8, 8);
        break;
    case MB_16x8:
        vec = t->mb.vec[0][0];
        src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3);
        dst_u = pred_u;
        t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 8, 4);
        src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3);
        dst = pred_v;
        t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 8, 4);
        vec = t->mb.vec[0][luma_index[8]];
        src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) +
            4 * t->edged_stride_uv;
        dst_u += 4 * 8;
        t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 8, 4);
        src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + 
            4 * t->edged_stride_uv;
        dst += 4 * 8;
        t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 8, 4);
        break;
    case MB_8x16:
        vec = t->mb.vec[0][0];
        src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3);
        dst_u = pred_u;
        t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 4, 8);
        src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3);
        dst = pred_v;
        t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 4, 8);
        vec = t->mb.vec[0][luma_index[4]];
        src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + 4;
        dst_u += 4;
        t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 4, 8);
        src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + 4;
        dst += 4;
        t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 4, 8);
        break;
    case MB_8x8:
    case MB_8x8ref0:
        for(i = 0 ; i < 4 ; i ++)
        {
            switch(t->mb.submb_part[luma_index[4 * i]])
            {
            case MB_8x8:
                vec = t->mb.vec[0][luma_index[4 * i]];
                src = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4);
                dst = pred_u + i / 2 * 32 + i % 2 * 4;
                t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 4, 4);
                src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4);
                dst = pred_v + i / 2 * 32 + i % 2 * 4;
                t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 4, 4);
                break;
            case MB_8x4:
                vec = t->mb.vec[0][luma_index[4 * i]];
                src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4);
                dst_u = pred_u + i / 2 * 32 + i % 2 * 4;
                t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 4, 2);
                src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4);
                dst = pred_v + i / 2 * 32 + i % 2 * 4;
                t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 4, 2);
                vec = t->mb.vec[0][luma_index[4 * i + 2]];
                src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4) + 
                    2 * t->edged_stride_uv;
                dst_u += 2 * 8;
                t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 4, 2);
                src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4) +
                    2 * t->edged_stride_uv;
                dst += 2 * 8;
                t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 4, 2);
                break;
            case MB_4x8:
                vec = t->mb.vec[0][luma_index[4 * i]];
                src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4);
                dst_u = pred_u + i / 2 * 32 + i % 2 * 4;
                t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 2, 4);
                src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4);
                dst = pred_v + i / 2 * 32 + i % 2 * 4;
                t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 2, 4);
                vec = t->mb.vec[0][luma_index[4 * i + 1]];
                src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4) + 2;
                dst_u += 2;
                t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 2, 4);
                src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4) + 2;
                dst += 2;
                t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 2, 4);
                break;
            case MB_4x4:
                vec = t->mb.vec[0][luma_index[4 * i]];
                src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4);
                dst_u = pred_u + i / 2 * 32 + i % 2 * 4;
                t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 2, 2);
                src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4);
                dst = pred_v + i / 2 * 32 + i % 2 * 4;
                t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 2, 2);
                vec = t->mb.vec[0][luma_index[4 * i + 1]];
                src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4) + 2;
                dst_u += 2;
                t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 2, 2);
                src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4) + 2;
                dst += 2;
                t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 2, 2);
                vec = t->mb.vec[0][luma_index[4 * i + 2]];
                src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4) + 
                    2 * t->edged_stride_uv;
                dst_u += 2 * 8 - 2;
                t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 2, 2);
                src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4) + 
                    2 * t->edged_stride_uv;
                dst += 2 * 8 - 2;
                t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 2, 2);
                vec = t->mb.vec[0][luma_index[4 * i + 3]];
                src_u = t->ref[list_index][vec.refno]->U + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4) +
                    2 * t->edged_stride_uv + 2;
                dst_u += 2;
                t->eighth_pixel_mc_u(src_u, t->edged_stride_uv, dst_u, vec.x, vec.y, 2, 2);
                src = t->ref[list_index][vec.refno]->V + ((t->mb.mb_y << 3) + (vec.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec.x >> 3) + (i % 2 * 4) + 
                    2 * t->edged_stride_uv + 2;
                dst += 2;
                t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec.x, vec.y, 2, 2);
                break;
            default:
                break;
            }
        }
        break;
    default:
        break;
    }
    T264dec_mb_decode_uv(t, pred_u, pred_v);
}
    /*
     * Lookup table used by the luma motion-compensation code below to turn
     * the fractional part of a motion vector into source planes.
     *
     * It is indexed as index[y][x] with y = vec.y & 3 and x = vec.x & 3.
     * The six entries of each element are used as follows:
     *   [0], [1] : indices into the reference frame's Y[] plane array.
     *              When they are equal, the block is copied straight from
     *              that plane and the remaining entries are not applied.
     *              Otherwise the two planes are combined with t->pia.
     *   [2], [3] : column and row offset added to the first plane's position.
     *   [4], [5] : column and row offset added to the second plane's position.
     *
     * NOTE(review): presumably Y[0] is the integer-sample plane, Y[1..3] are
     * pre-interpolated half-sample planes and t->pia averages its two
     * inputs -- confirm against the interpolation code.
     */
    static const int8_t index[4][4][6] = 
    {
        {{0, 0, 0, 0, 0, 0}, {0, 1, 0, 0, 0, 0}, {1, 1, 0, 0, 0, 0}, {1, 0, 0, 0, 1, 0}},
        {{0, 2, 0, 0, 0, 0}, {1, 2, 0, 0, 0, 0}, {1, 3, 0, 0, 0, 0}, {1, 2, 0, 0, 1, 0}},
        {{2, 2, 0, 0, 0, 0}, {2, 3, 0, 0, 0, 0}, {3, 3, 0, 0, 0, 0}, {3, 2, 0, 0, 1, 0}},
        {{2, 0, 0, 0, 0, 1}, {2, 1, 0, 0, 0, 1}, {3, 1, 0, 0, 0, 1}, {1, 2, 0, 1, 1, 0}}
    };
/*
 * Chroma motion compensation for a B macroblock whose motion is given per
 * 4x4 luma block, i.e. per 2x2 chroma block.
 *
 * For each of the 16 blocks the list-0 and/or list-1 prediction is built.
 * When both lists are used, the list-1 result goes to a scratch buffer and
 * is then combined with the list-0 result through t->pia[MB_2x2].  A list
 * is used when its reference index is greater than -1.
 *
 * t            : decoder context.
 * vecPredicted : motion vectors, [list][block], blocks in raster order.
 * pred_u,
 * pred_v       : 8x8 output prediction buffers (row pitch 8).
 */
void 
T264_mb4x4_interb_uv_mc(T264_t* t,T264_vector_t vecPredicted[2][16],uint8_t* pred_u,uint8_t* pred_v)
{
    DECLARE_ALIGNED_MATRIX(pred_u_l1, 8, 8, uint8_t, CACHE_SIZE);
    DECLARE_ALIGNED_MATRIX(pred_v_l1, 8, 8, uint8_t, CACHE_SIZE);
    T264_vector_t mv;
    uint8_t* out_u;
    uint8_t* out_v;
    uint8_t* final_u;
    uint8_t* final_v;
    int32_t blk, list;
    int32_t row, col;
    int32_t src_off, dst_off;

    for (blk = 0; blk < 16; blk++)
    {
        /* top-left chroma sample of this 2x2 block inside the 8x8 block */
        row = (blk >> 2) << 1;
        col = (blk & 3) << 1;
        dst_off = row * 8 + col;

        final_u = pred_u + dst_off;
        final_v = pred_v + dst_off;
        out_u = final_u;
        out_v = final_v;

        for (list = 0; list < 2; list++)
        {
            mv = vecPredicted[list][blk];
            if (mv.refno > -1)
            {
                /* Second of two predictions: keep it apart so it can be combined afterwards. */
                if (list == 1 && vecPredicted[0][blk].refno > -1)
                {
                    out_u = pred_u_l1 + dst_off;
                    out_v = pred_v_l1 + dst_off;
                }
                src_off = ((t->mb.mb_y << 3) + (row + (mv.y >> 3))) * t->edged_stride_uv +
                    (t->mb.mb_x << 3) + col + (mv.x >> 3);
                t->eighth_pixel_mc_u(t->ref[list][mv.refno]->U + src_off, t->edged_stride_uv,
                    out_u, mv.x, mv.y, 2, 2);
                t->eighth_pixel_mc_u(t->ref[list][mv.refno]->V + src_off, t->edged_stride_uv,
                    out_v, mv.x, mv.y, 2, 2);
            }
        }

        if (out_u != final_u)
        {
            /* bi-predicted block: merge the list-1 scratch result into the output */
            t->pia[MB_2x2](out_u, final_u, 8, 8, final_u, 8);
            t->pia[MB_2x2](out_v, final_v, 8, 8, final_v, 8);
        }
    }
}
/*
 * Luma motion compensation for a B macroblock whose motion is given per
 * 4x4 block.
 *
 * For each of the 16 blocks (raster order) the list-0 and/or list-1
 * prediction is built from the planes selected by the `index` table.
 * When both lists are used, the list-1 result is written to a scratch
 * buffer and then combined with the list-0 result through t->pia[MB_4x4].
 * A list is used when its reference index is greater than -1.
 *
 * t   : decoder context.
 * vec : motion vectors, [list][block].
 * ref : 16x16 output prediction buffer (row pitch 16).
 */
void 
T264_mb4x4_interb_mc(T264_t* t,T264_vector_t vec[2][16],uint8_t* ref)
{
    T264_vector_t mv;
    uint8_t* blk_out;
    uint8_t* out;
    const int8_t* sel;
    int32_t blk, list;
    int32_t blk_off;
    int32_t row, col;
    int32_t off_a, off_b;
    DECLARE_ALIGNED_MATRIX_H(pred_16x16bi, 16, 16, uint8_t, CACHE_SIZE);

    for (blk = 0; blk < 16; blk++)
    {
        blk_off = (blk / 4) * 16 * 4 + (blk % 4) * 4;
        blk_out = ref + blk_off;
        out = blk_out;

        for (list = 0; list < 2; list++)
        {
            mv = vec[list][blk];
            if (mv.refno > -1)
            {
                /* Second of two predictions: keep it apart so it can be combined afterwards. */
                if (list == 1 && vec[0][blk].refno > -1)
                    out = pred_16x16bi + blk_off;

                /* whole-sample position of the block in the reference picture */
                row = (t->mb.mb_y << 4) + (mv.y >> 2) + (blk / 4) * 4;
                col = (t->mb.mb_x << 4) + (mv.x >> 2) + (blk % 4) * 4;
                sel = index[mv.y & 3][mv.x & 3];

                if (sel[0] == sel[1])
                {
                    /* a single plane already holds this position */
                    off_a = row * t->edged_stride + col;
                    t->memcpy_stride_u(t->ref[list][mv.refno]->Y[sel[0]] + off_a, 4, 4,
                        t->edged_stride, out, 16);
                }
                else
                {
                    /* position is formed from two planes */
                    off_a = (row + sel[3]) * t->edged_stride + col + sel[2];
                    off_b = (row + sel[5]) * t->edged_stride + col + sel[4];
                    t->pia[MB_4x4](t->ref[list][mv.refno]->Y[sel[0]] + off_a,
                        t->ref[list][mv.refno]->Y[sel[1]] + off_b,
                        t->edged_stride, t->edged_stride, out, 16);
                }
            }
        }

        if (out != blk_out)
            t->pia[MB_4x4](out, blk_out, 16, 16, blk_out, 16);
    }
}
void
T264dec_mb_decode_interb_mc(T264_t* t, uint8_t* ref)
{
    T264_vector_t vec0,vec1;
    uint8_t* tmp,*pred_tmp;
    int32_t x, y,i;
    int32_t list_index;
    DECLARE_ALIGNED_MATRIX_H(pred_16x16bi, 16, 16, uint8_t, CACHE_SIZE);
 
    if(t->mb.is_copy)
        T264_mb4x4_interb_mc(t,t->mb.vec,ref);
    else
    switch(t->mb.mb_part)
    {
    case MB_16x16:
        vec0 = t->mb.vec[0][0];
        vec1 = t->mb.vec[1][0];
        x = (vec0.x & 3);
        y = (vec0.y & 3);
        pred_tmp = ref;    
        if(vec0.refno > -1)
        {
            list_index = 0;
            if (index[y][x][0] == index[y][x][1])
            {   
                tmp = t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec0.y >> 2)) * t->edged_stride + 
                    ((t->mb.mb_x << 4) + (vec0.x >> 2));
                t->memcpy_stride_u(tmp, 16, 16, t->edged_stride, ref, 16);
            }
            else
            {  
                t->pia[MB_16x16](t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][2], 
                    t->ref[list_index][vec0.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][4],
                    t->edged_stride, t->edged_stride, ref, 16);
            }                              
        }
        if(vec1.refno > -1)
        {   //if bi-pred
                x = (vec1.x & 3);
                y = (vec1.y & 3);
                list_index = 1;
                if(vec0.refno > -1) //if biPred
                    pred_tmp = pred_16x16bi;
                else
                    pred_tmp = ref;
                if (index[y][x][0] == index[y][x][1])
                {   
                    tmp = t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec1.y >> 2)) * t->edged_stride + 
                        ((t->mb.mb_x << 4) + (vec1.x >> 2));
                    t->memcpy_stride_u(tmp, 16, 16, t->edged_stride, pred_tmp, 16);
                }
                else
                {   
                    t->pia[MB_16x16](t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][2], 
                        t->ref[list_index][vec1.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][4],
                        t->edged_stride, t->edged_stride, pred_tmp, 16);
                }    
        } 
        if(pred_tmp != ref)
        {   //if biPred
            t->pia[MB_16x16](pred_tmp,ref,16,16,ref,16);            
        }
        break;
    case MB_16x8:
        vec0 = t->mb.vec[0][0];
        vec1 = t->mb.vec[1][0];
        pred_tmp = ref;   
        if(vec0.refno > -1)
        {
            list_index = 0;
            x = (vec0.x & 3);
            y = (vec0.y & 3);
            if (index[y][x][0] == index[y][x][1])
            {
                tmp = t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec0.y >> 2)) * t->edged_stride + 
                    ((t->mb.mb_x << 4) + (vec0.x >> 2));
                t->memcpy_stride_u(tmp, 16, 8, t->edged_stride, ref, 16);
            }
            else
            {
                t->pia[MB_16x8](t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][2], 
                    t->ref[list_index][vec0.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][4],
                    t->edged_stride, t->edged_stride, ref, 16);
            }
        }
        if(vec1.refno > -1)
        {
            x = (vec1.x & 3);
            y = (vec1.y & 3);
            list_index = 1;
            if(vec0.refno > -1) //if biPred
                pred_tmp = pred_16x16bi;
            else
                pred_tmp = ref;
            if (index[y][x][0] == index[y][x][1])
            {
                tmp = t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec1.y >> 2)) * t->edged_stride + 
                    ((t->mb.mb_x << 4) + (vec1.x >> 2));
                t->memcpy_stride_u(tmp, 16, 8, t->edged_stride, pred_tmp, 16);
            }
            else
            {
                t->pia[MB_16x8](t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][2], 
                    t->ref[list_index][vec1.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][4],
                    t->edged_stride, t->edged_stride, pred_tmp, 16);
            }
        }
        if(pred_tmp != ref)
        {   //if biPred
            t->pia[MB_16x8](pred_tmp,ref,16,16,ref,16);            
        }
        //For second MB16x8
        vec0 = t->mb.vec[0][8];
        vec1 = t->mb.vec[1][8];
        pred_tmp = ref + 16 * 8;    
        if(vec0.refno > -1)
        {
            x = (vec0.x & 3);
            y = (vec0.y & 3);
            list_index = 0;
            if (index[y][x][0] == index[y][x][1])
            {
                tmp = t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec0.y >> 2) + 8) * t->edged_stride + 
                    ((t->mb.mb_x << 4) + (vec0.x >> 2));
                t->memcpy_stride_u(tmp, 16, 8, t->edged_stride, pred_tmp, 16);
            }
            else
            {
                t->pia[MB_16x8](t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][3] + 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][2], 
                    t->ref[list_index][vec0.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][5] + 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][4],
                    t->edged_stride, t->edged_stride, pred_tmp, 16);
            }
        }
        if(vec1.refno > -1)
        {
            x = (vec1.x & 3);
            y = (vec1.y & 3);
            list_index = 1;
            if(vec0.refno > -1) //if biPred
                pred_tmp = pred_16x16bi + 16 * 8;
            if (index[y][x][0] == index[y][x][1])
            {
                tmp = t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec1.y >> 2) + 8) * t->edged_stride + 
                    ((t->mb.mb_x << 4) + (vec1.x >> 2));
                t->memcpy_stride_u(tmp, 16, 8, t->edged_stride,pred_tmp, 16);
            }
            else
            {
                t->pia[MB_16x8](t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][3] + 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][2], 
                    t->ref[list_index][vec1.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][5] + 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][4],
                    t->edged_stride, t->edged_stride, pred_tmp, 16);
            }
        }
        if(pred_tmp != ref + 16 * 8)
        {   //if biPred
            t->pia[MB_16x8](pred_tmp,ref + 16 * 8,16,16,ref + 16 * 8,16);            
        }
        break;
    case MB_8x16:
        pred_tmp = ref;
        vec0 = t->mb.vec[0][0];
        vec1 = t->mb.vec[1][0];
        if(vec0.refno > -1)
        {
            x = (vec0.x & 3);
            y = (vec0.y & 3);
            list_index = 0;
            if (index[y][x][0] == index[y][x][1])
            {
                tmp = t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec0.y >> 2)) * t->edged_stride + 
                    ((t->mb.mb_x << 4) + (vec0.x >> 2));
                t->memcpy_stride_u(tmp, 8, 16, t->edged_stride, ref, 16);
            }
            else
            {
                t->pia[MB_8x16](t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][2], 
                    t->ref[list_index][vec0.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][4],
                    t->edged_stride, t->edged_stride, ref, 16);
            }
        }
        if(vec1.refno > -1)
        {
            list_index = 1;
            x = (vec1.x & 3);
            y = (vec1.y & 3);
            if(vec0.refno > -1) //if biPred
                pred_tmp = pred_16x16bi;
            if (index[y][x][0] == index[y][x][1])
            {
                tmp = t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec1.y >> 2)) * t->edged_stride + 
                    ((t->mb.mb_x << 4) + (vec1.x >> 2));
                t->memcpy_stride_u(tmp, 8, 16, t->edged_stride, pred_tmp, 16);
            }
            else
            {
                t->pia[MB_8x16](t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][2], 
                    t->ref[list_index][vec1.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][4],
                    t->edged_stride, t->edged_stride,pred_tmp, 16);
            }
        }
        if(pred_tmp != ref)
        {   //if biPred
            t->pia[MB_8x16](pred_tmp,ref,16,16,ref,16);            
        }
        //for second MB8x16
        vec0 = t->mb.vec[0][2];
        vec1 = t->mb.vec[1][2];
        pred_tmp = ref + 8;
        if(vec0.refno > -1)
        {
            x = (vec0.x & 3);
            y = (vec0.y & 3);
            list_index = 0;
            if (index[y][x][0] == index[y][x][1])
            {
                tmp = t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec0.y >> 2)) * t->edged_stride + 
                    ((t->mb.mb_x << 4) + (vec0.x >> 2)) + 8;
                t->memcpy_stride_u(tmp, 8, 16, t->edged_stride, pred_tmp, 16);
            }
            else
            {
                t->pia[MB_8x16](t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][2] + 8, 
                    t->ref[list_index][vec0.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][4] + 8,
                    t->edged_stride, t->edged_stride, pred_tmp, 16);
            }
        }
        if(vec1.refno > -1)
        {
            x = (vec1.x & 3);
            y = (vec1.y & 3);
            list_index = 1;
            if(vec0.refno > -1) //if biPred
                pred_tmp = pred_16x16bi + 8;
            if (index[y][x][0] == index[y][x][1])
            {
                tmp = t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec1.y >> 2)) * t->edged_stride + 
                    ((t->mb.mb_x << 4) + (vec1.x >> 2)) + 8;
                t->memcpy_stride_u(tmp, 8, 16, t->edged_stride, pred_tmp, 16);
            }
            else
            {
                t->pia[MB_8x16](t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][3]) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][2] + 8, 
                    t->ref[list_index][vec1.refno]->Y[index[y][x][1]] + ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][5]) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][4] + 8,
                    t->edged_stride, t->edged_stride,pred_tmp, 16);
            }
        }
        if(pred_tmp != ref + 8)
        {   //if biPred
            t->pia[MB_8x16](pred_tmp,ref + 8,16,16,ref + 8,16);            
        }
        break;
    case MB_8x8:
        for(i = 0 ; i < 4 ; i ++)
        {
            int32_t offset1, offset2;
            switch(t->mb.submb_part[luma_index[4 * i]]) 
            {
            case MB_8x8:
                vec0 = t->mb.vec[0][luma_index[4 * i]];
                vec1 = t->mb.vec[1][luma_index[4 * i]];
                x = (vec0.x & 3);
                y = (vec0.y & 3);
                pred_tmp = ref + i / 2 * 16 * 8 + i % 2 * 8;
                if(vec0.refno > -1)
                {
                    list_index = 0;
                    if (index[y][x][0] == index[y][x][1])
                    {
                        offset1 = ((t->mb.mb_y << 4) + (vec0.y >> 2) + i / 2 * 8) * t->edged_stride + ((t->mb.mb_x << 4) + (vec0.x >> 2)) + i % 2 * 8;
                        tmp = t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + offset1;
                        t->memcpy_stride_u(tmp, 8, 8, t->edged_stride, pred_tmp, 16);
                    }
                    else
                    {
                        offset1 = ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][3] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][2] + i % 2 * 8;
                        offset2 = ((t->mb.mb_y << 4) + (vec0.y >> 2) + index[y][x][5] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec0.x >> 2) + index[y][x][4] + i % 2 * 8;
                        t->pia[MB_8x8](t->ref[list_index][vec0.refno]->Y[index[y][x][0]] + offset1, 
                            t->ref[list_index][vec0.refno]->Y[index[y][x][1]] + offset2,
                            t->edged_stride, t->edged_stride, pred_tmp,16);
                    }
                }
                x = (vec1.x & 3);
                y = (vec1.y & 3);
                if(vec1.refno > -1)
                {
                    list_index = 1;
                    if(vec0.refno > -1)
                        pred_tmp = pred_16x16bi + i / 2 * 16 * 8 + i % 2 * 8;
                    if (index[y][x][0] == index[y][x][1])
                    {
                        offset1 = ((t->mb.mb_y << 4) + (vec1.y >> 2) + i / 2 * 8) * t->edged_stride + ((t->mb.mb_x << 4) + (vec1.x >> 2)) + i % 2 * 8;
                        tmp = t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + offset1;
                        t->memcpy_stride_u(tmp, 8, 8, t->edged_stride, pred_tmp, 16);
                    }
                    else
                    {
                        offset1 = ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][3] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][2] + i % 2 * 8;
                        offset2 = ((t->mb.mb_y << 4) + (vec1.y >> 2) + index[y][x][5] + i / 2 * 8) * t->edged_stride + (t->mb.mb_x << 4) + (vec1.x >> 2) + index[y][x][4] + i % 2 * 8;
                        t->pia[MB_8x8](t->ref[list_index][vec1.refno]->Y[index[y][x][0]] + offset1, 
                            t->ref[list_index][vec1.refno]->Y[index[y][x][1]] + offset2,
                            t->edged_stride, t->edged_stride, pred_tmp, 16);
                    }
                }
                if(pred_tmp != ref + i / 2 * 16 * 8 + i % 2 * 8)
                    t->pia[MB_8x8](pred_tmp,ref + i / 2 * 16 * 8 + i % 2 * 8,16,16,ref + i / 2 * 16 * 8 + i % 2 * 8,16);
                break;
            default:
                assert(0);
                break;
            }
        }
        break;
    default:    //only support MB16x16 B-frame
        assert(0);
        break;
    }
}
/*
 * Decode the luma of a B macroblock: build the motion-compensated
 * prediction in t->mb.pred_p16x16, then run the residual transform stage
 * on that same buffer.
 */
void 
T264dec_mb_decode_interb_y(T264_t* t)
{
    uint8_t* pred = t->mb.pred_p16x16;

    T264dec_mb_decode_interb_mc(t, pred);
    T264dec_mb_decode_interp_transform(t, pred);
}
void 
T264dec_mb_decode_interb_uv(T264_t* t)
{
    DECLARE_ALIGNED_MATRIX(pred_u, 8, 8, uint8_t, CACHE_SIZE);
    DECLARE_ALIGNED_MATRIX(pred_v, 8, 8, uint8_t, CACHE_SIZE);
    DECLARE_ALIGNED_MATRIX(pred_bi, 8, 8, uint8_t, CACHE_SIZE);
    T264_vector_t vec0,vec1;
    uint8_t* src, *dst;
    int32_t list_index,i;
    if(t->mb.is_copy)
    {
        T264_mb4x4_interb_uv_mc(t,t->mb.vec,pred_u,pred_v);
    }else
    switch (t->mb.mb_part)
    {
    case MB_16x16:
        vec0 = t->mb.vec[0][0];
        vec1 = t->mb.vec[1][0];
        dst  = pred_u;
        if(vec0.refno > -1)
        {
            list_index = 0;
            src = t->ref[list_index][vec0.refno]->U + ((t->mb.mb_y << 3) + (vec0.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3);
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, pred_u, vec0.x, vec0.y, 8, 8);            
        }
        if(vec1.refno > -1)
        {
            list_index = 1;
            if(vec0.refno > -1)
                dst = pred_bi;            
            else
                dst = pred_u;
            src = t->ref[list_index][vec1.refno]->U + ((t->mb.mb_y << 3) + (vec1.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3);
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 8, 8);            
        }
        if(dst != pred_u)
        {
            t->pia[MB_8x8](dst,pred_u,8,8,pred_u,8);            
        }
        dst = pred_v;
        if(vec0.refno > -1)
        {
            list_index = 0;
            src = t->ref[list_index][vec0.refno]->V + ((t->mb.mb_y << 3) + (vec0.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3);
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, pred_v, vec0.x, vec0.y, 8, 8);            
        }
        if(vec1.refno > -1)
        {
            list_index = 1;
            if(vec0.refno > -1)
                dst = pred_bi;            
            else
                dst = pred_v;
            src = t->ref[list_index][vec1.refno]->V + ((t->mb.mb_y << 3) + (vec1.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3);
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 8, 8);            
        }
        if(dst != pred_v)
        {
            t->pia[MB_8x8](dst,pred_v,8,8,pred_v,8);            
        }
        break;
    case MB_16x8:
        vec0 = t->mb.vec[0][0];
        vec1 = t->mb.vec[1][0];
        
        dst  = pred_u;
        if(vec0.refno > -1)
        {
            list_index = 0;
            src = t->ref[list_index][vec0.refno]->U + ((t->mb.mb_y << 3) + (vec0.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3);
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec0.x, vec0.y, 8, 4);
        }
        if(vec1.refno > -1)
        {
            if(vec0.refno > -1)
                dst = pred_bi;
            else
                dst = pred_u;
            list_index = 1;
            src = t->ref[list_index][vec1.refno]->U + ((t->mb.mb_y << 3) + (vec1.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3);
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 8, 4);
        }
        if(dst != pred_u)
        {
            t->pia[MB_8x4](dst,pred_u,8,8,pred_u,8);            
        }
        dst  = pred_v;
        if(vec0.refno > -1)
        {
            list_index = 0;
            src = t->ref[list_index][vec0.refno]->V + ((t->mb.mb_y << 3) + (vec0.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3);
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec0.x, vec0.y, 8, 4);
        }
        if(vec1.refno > -1)
        {
            if(vec0.refno > -1)
                dst = pred_bi;
            else
                dst = pred_v;
            list_index = 1;
            src = t->ref[list_index][vec1.refno]->V + ((t->mb.mb_y << 3) + (vec1.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3);
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 8, 4);
        }
        if(dst != pred_v)
        {
            t->pia[MB_8x4](dst,pred_v,8,8,pred_v,8);            
        }
        //now for next MB16x8
        vec0 = t->mb.vec[0][luma_index[8]];
        vec1 = t->mb.vec[1][luma_index[8]];        
        dst  = pred_u + 4 * 8;
        if(vec0.refno > -1)
        {
            list_index = 0;
            src = t->ref[list_index][vec0.refno]->U + ((t->mb.mb_y << 3) + (vec0.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3) +
            4 * t->edged_stride_uv;
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec0.x, vec0.y, 8, 4);
        }
        if(vec1.refno > -1)
        {
            if(vec0.refno > -1)
                dst = pred_bi + 4 * 8;
            else
                dst = pred_u + 4 * 8;
            list_index = 1;
            src = t->ref[list_index][vec1.refno]->U + ((t->mb.mb_y << 3) + (vec1.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3) +
            4 * t->edged_stride_uv;
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 8, 4);
        }
        if(dst != pred_u + 4 * 8)
        {
            t->pia[MB_8x4](dst,pred_u + 4 * 8,8,8,pred_u + 4 * 8,8);
        }
        //for v
        dst  = pred_v + 4 * 8;
        if(vec0.refno > -1)
        {
            list_index = 0;
            src = t->ref[list_index][vec0.refno]->V + ((t->mb.mb_y << 3) + (vec0.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3) + 
                4 * t->edged_stride_uv;        
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec0.x, vec0.y, 8, 4);
        }
        if(vec1.refno > -1)
        {
            if(vec0.refno > -1)
                dst = pred_bi + 4 * 8;
            else
                dst = pred_v + 4 * 8;
            list_index = 1;
            src = t->ref[list_index][vec1.refno]->V + ((t->mb.mb_y << 3) + (vec1.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3) + 
                4 * t->edged_stride_uv;        
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 8, 4);
        }
        if(dst != pred_v + 4 * 8)
        {
            t->pia[MB_8x4](dst,pred_v + 4 * 8,8,8,pred_v + 4 * 8,8);            
        }
        break;
    case MB_8x16:
        vec0 = t->mb.vec[0][0];
        vec1 = t->mb.vec[1][0];
        
        dst  = pred_u;
        if(vec0.refno > -1)
        {
            list_index = 0;
            src = t->ref[list_index][vec0.refno]->U + ((t->mb.mb_y << 3) + (vec0.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3);
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec0.x, vec0.y, 4, 8);
        }
        if(vec1.refno > -1)
        {
            if(vec0.refno > -1)
                dst = pred_bi;
            else
                dst = pred_u;
            list_index = 1;
            src = t->ref[list_index][vec1.refno]->U + ((t->mb.mb_y << 3) + (vec1.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3);
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 4, 8);
        }
        if(dst != pred_u)
        {
            t->pia[MB_4x8](dst,pred_u,8,8,pred_u,8);            
        }
        dst  = pred_v;
        if(vec0.refno > -1)
        {
            list_index = 0;
            src = t->ref[list_index][vec0.refno]->V + ((t->mb.mb_y << 3) + (vec0.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3);
            //dst = pred_v;
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec0.x, vec0.y, 4, 8);
        }
        if(vec1.refno > -1)
        {
            if(vec0.refno > -1)
                dst = pred_bi;
            else
                dst = pred_v;
            list_index = 1;
            src = t->ref[list_index][vec1.refno]->V + ((t->mb.mb_y << 3) + (vec1.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3);
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 4, 8);
        }
        if(dst != pred_v)
        {
            t->pia[MB_4x8](dst,pred_v,8,8,pred_v,8);            
        }
        //now for next MB8x16
        vec0 = t->mb.vec[0][luma_index[4]];
        vec1 = t->mb.vec[1][luma_index[4]];        
        dst  = pred_u + 4;
        if(vec0.refno > -1)
        {
            list_index = 0;
            src = t->ref[list_index][vec0.refno]->U + ((t->mb.mb_y << 3) + (vec0.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3) + 4;
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec0.x, vec0.y, 4, 8);
        }
        if(vec1.refno > -1)
        {
            if(vec0.refno > -1)
                dst = pred_bi + 4;
            else
                dst = pred_u + 4;
            list_index = 1;
            src = t->ref[list_index][vec1.refno]->U + ((t->mb.mb_y << 3) + (vec1.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3) + 4;
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 4, 8);
        }
        if(dst != pred_u + 4)
        {
            t->pia[MB_4x8](dst,pred_u + 4,8,8,pred_u + 4,8);            
        }
        //for v
        dst  = pred_v + 4;
        if(vec0.refno > -1)
        {
            list_index = 0;
            src = t->ref[list_index][vec0.refno]->V + ((t->mb.mb_y << 3) + (vec0.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3) + 4;        
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec0.x, vec0.y, 4, 8);
        }
        if(vec1.refno > -1)
        {
            if(vec0.refno > -1)
                dst = pred_bi + 4;
            else
                dst = pred_v + 4;
            list_index = 1;
            src = t->ref[list_index][vec1.refno]->V + ((t->mb.mb_y << 3) + (vec1.y >> 3)) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3) + 4;        
            t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 4, 8);
        }
        if(dst != pred_v + 4)
        {
            t->pia[MB_4x8](dst,pred_v + 4,8,8,pred_v + 4,8);            
        }
      
        break;
    case MB_8x8:
        for(i = 0 ; i < 4 ; i ++)
        {
            switch(t->mb.submb_part[luma_index[4 * i]])
            {
            case MB_8x8:
                vec0 = t->mb.vec[0][luma_index[4 * i]];
                vec1 = t->mb.vec[1][luma_index[4 * i]];
                dst = pred_u + i / 2 * 32 + i % 2 * 4;
                if(vec0.refno > -1)
                {
                    list_index = 0;
                    src = t->ref[list_index][vec0.refno]->U + ((t->mb.mb_y << 3) + (vec0.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3) + (i % 2 * 4);
                    t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec0.x, vec0.y, 4, 4);
                }
                if(vec1.refno > -1)
                {
                    if(vec0.refno > -1)
                        dst = pred_bi + i / 2 * 32 + i % 2 * 4;
                    list_index = 1;
                    src = t->ref[list_index][vec1.refno]->U + ((t->mb.mb_y << 3) + (vec1.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3) + (i % 2 * 4);
                    t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 4, 4);
                }
                if(dst != pred_u + i / 2 * 32 + i % 2 * 4)
                    t->pia[MB_4x4](dst,pred_u + i / 2 * 32 + i % 2 * 4,8,8,pred_u + i / 2 * 32 + i % 2 * 4,8);  
                dst = pred_v + i / 2 * 32 + i % 2 * 4;
                if(vec0.refno > -1)
                {
                    list_index = 0;
                    src = t->ref[list_index][vec0.refno]->V + ((t->mb.mb_y << 3) + (vec0.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec0.x >> 3) + (i % 2 * 4);
                    t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec0.x, vec0.y, 4, 4);
                }
                if(vec1.refno > -1)
                {
                    if(vec0.refno > -1)
                        dst = pred_bi + i / 2 * 32 + i % 2 * 4;
                    list_index = 1;
                    src = t->ref[list_index][vec1.refno]->V + ((t->mb.mb_y << 3) + (vec1.y >> 3) + i / 2 * 4) * t->edged_stride_uv + (t->mb.mb_x << 3) + (vec1.x >> 3) + (i % 2 * 4);
                    t->eighth_pixel_mc_u(src, t->edged_stride_uv, dst, vec1.x, vec1.y, 4, 4);
                }
                if(dst != pred_v + i / 2 * 32 + i % 2 * 4)
                    t->pia[MB_4x4](dst,pred_v + i / 2 * 32 + i % 2 * 4,8,8,pred_v + i / 2 * 32 + i % 2 * 4,8);  
                break;
            default:
                assert(0);
                break;
            }
        }
    default:
        break;
    }
    T264dec_mb_decode_uv(t, pred_u, pred_v);   
}