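/*
 * Hosting-site metadata (translated from the download page):
 *   File:                 mc.c
 *   Uploaded by:          hjq518
 *   Upload date:          2021-12-09
 *   Archive size:         5084k
 *   File size:            23k
 *   Source category:      Audio
 *   Development platform: Visual C++
 */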

/*****************************************************************************
 * mc.c: h264 encoder library (Motion Compensation)
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Eric Petit <eric.petit@lapsus.org>
 *          Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *****************************************************************************/
  23. #include <stdlib.h>
  24. #include <stdio.h>
  25. #include <string.h>
  26. #include <stdint.h>
  27. #include <stdarg.h>
  28. #ifdef SYS_LINUX
  29. #include <altivec.h>
  30. #endif
  31. #include "x264.h"
  32. #include "common/common.h"
  33. #include "common/mc.h"
  34. #include "mc.h"
  35. #include "ppccommon.h"
  36. typedef void (*pf_mc_t)( uint8_t *src, int i_src,
  37.                          uint8_t *dst, int i_dst, int i_height );
  38. static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
  39. {
  40.     return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
  41.            pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] +
  42.            pix[ 3*i_pix_next];
  43. }
  44. static inline int x264_tapfilter1( uint8_t *pix )
  45. {
  46.     return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] +
  47.            pix[ 3];
  48. }
  49. /* pixel_avg */
  50. static inline void pixel_avg_w4( uint8_t *dst,  int i_dst,
  51.                                  uint8_t *src1, int i_src1,
  52.                                  uint8_t *src2, int i_src2,
  53.                                  int i_height )
  54. {
  55.     int x, y;
  56.     for( y = 0; y < i_height; y++ )
  57.     {
  58.         for( x = 0; x < 4; x++ )
  59.         {
  60.             dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
  61.         }
  62.         dst  += i_dst;
  63.         src1 += i_src1;
  64.         src2 += i_src2;
  65.     }
  66. }
  67. static inline void pixel_avg_w8( uint8_t *dst,  int i_dst,
  68.                                  uint8_t *src1, int i_src1,
  69.                                  uint8_t *src2, int i_src2,
  70.                                  int i_height )
  71. {
  72.     int y;
  73.     vec_u8_t src1v, src2v;
  74.     LOAD_ZERO;
  75.     PREP_LOAD;
  76.     PREP_STORE8;
  77.     for( y = 0; y < i_height; y++ )
  78.     {
  79.         VEC_LOAD( src1, src1v, 8, vec_u8_t );
  80.         VEC_LOAD( src2, src2v, 8, vec_u8_t );
  81.         src1v = vec_avg( src1v, src2v );
  82.         VEC_STORE8( src1v, dst );
  83.         dst  += i_dst;
  84.         src1 += i_src1;
  85.         src2 += i_src2;
  86.     }
  87. }
  88. static inline void pixel_avg_w16( uint8_t *dst,  int i_dst,
  89.                                   uint8_t *src1, int i_src1,
  90.                                   uint8_t *src2, int i_src2,
  91.                                   int i_height )
  92. {
  93.     int y;
  94.     vec_u8_t src1v, src2v;
  95.     PREP_LOAD;
  96.     PREP_STORE16;
  97.     for( y = 0; y < i_height; y++ )
  98.     {
  99.         VEC_LOAD( src1, src1v, 16, vec_u8_t );
  100.         VEC_LOAD( src2, src2v, 16, vec_u8_t );
  101.         src1v = vec_avg( src1v, src2v );
  102.         VEC_STORE16( src1v, dst );
  103.         dst  += i_dst;
  104.         src1 += i_src1;
  105.         src2 += i_src2;
  106.     }
  107. }
  108. /* mc_copy: plain c */
  109. #define MC_COPY( name, a )                                
  110. static void name( uint8_t *src, int i_src,                
  111.                   uint8_t *dst, int i_dst, int i_height ) 
  112. {                                                         
  113.     int y;                                                
  114.     for( y = 0; y < i_height; y++ )                       
  115.     {                                                     
  116.         memcpy( dst, src, a );                            
  117.         src += i_src;                                     
  118.         dst += i_dst;                                     
  119.     }                                                     
  120. }
  121. MC_COPY( mc_copy_w4,  4  )
  122. MC_COPY( mc_copy_w8,  8  )
  123. MC_COPY( mc_copy_w16, 16 )
  124. void mc_luma_altivec( uint8_t *dst,    int i_dst_stride,
  125.                       uint8_t *src[4], int i_src_stride,
  126.                       int mvx, int mvy,
  127.                       int i_width, int i_height )
  128. {
  129.     uint8_t *src1, *src2;
  130.     
  131.     /* todo : fixme... */
  132.     int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1:0;
  133.     
  134.     int hpel1x = mvx>>1;
  135.     int hpel1y = (mvy+1-correction)>>1;
  136.     int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
  137.     
  138.     
  139.     src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
  140.     
  141.     if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
  142.     {
  143.         int hpel2x = (mvx+1)>>1;
  144.         int hpel2y = (mvy+correction)>>1;
  145.         int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
  146.         
  147.         src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
  148.         
  149.         switch(i_width) {
  150.         case 4:
  151.             pixel_avg_w4( dst, i_dst_stride, src1, i_src_stride,
  152.                           src2, i_src_stride, i_height );
  153.             break;
  154.         case 8:
  155.             pixel_avg_w8( dst, i_dst_stride, src1, i_src_stride,
  156.                           src2, i_src_stride, i_height );
  157.             break;
  158.         case 16:
  159.         default:
  160.             pixel_avg_w16( dst, i_dst_stride, src1, i_src_stride,
  161.                            src2, i_src_stride, i_height );
  162.         }
  163.         
  164.     }
  165.     else
  166.     {
  167.         switch(i_width) {
  168.         case 4:
  169.             mc_copy_w4( src1, i_src_stride, dst, i_dst_stride, i_height );
  170.             break;
  171.         case 8:
  172.             mc_copy_w8( src1, i_src_stride, dst, i_dst_stride, i_height );
  173.             break;
  174.         case 16:
  175.             mc_copy_w16( src1, i_src_stride, dst, i_dst_stride, i_height );
  176.             break;
  177.         }
  178.         
  179.     }
  180. }
  181. uint8_t *get_ref_altivec( uint8_t *dst,    int * i_dst_stride,
  182.                           uint8_t *src[4], int i_src_stride,
  183.                           int mvx, int mvy,
  184.                           int i_width, int i_height )
  185. {
  186.     uint8_t *src1, *src2;
  187.     
  188.     /* todo : fixme... */
  189.     int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1:0;
  190.     
  191.     int hpel1x = mvx>>1;
  192.     int hpel1y = (mvy+1-correction)>>1;
  193.     int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
  194.     
  195.     
  196.     src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
  197.     
  198.     if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
  199.     {
  200.         int hpel2x = (mvx+1)>>1;
  201.         int hpel2y = (mvy+correction)>>1;
  202.         int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
  203.         
  204.         src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
  205.         
  206.         switch(i_width) {
  207.         case 4:
  208.             pixel_avg_w4( dst, *i_dst_stride, src1, i_src_stride,
  209.                           src2, i_src_stride, i_height );
  210.             break;
  211.         case 8:
  212.             pixel_avg_w8( dst, *i_dst_stride, src1, i_src_stride,
  213.                           src2, i_src_stride, i_height );
  214.             break;
  215.         case 12:
  216.         case 16:
  217.         default:
  218.             pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
  219.                           src2, i_src_stride, i_height );
  220.             break;
  221.         case 20:
  222.             //FIXME suboptimal
  223.             pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
  224.                           src2, i_src_stride, i_height );
  225.             pixel_avg_w4( dst+16, *i_dst_stride, src1+16, i_src_stride,
  226.                           src2+16, i_src_stride, i_height );
  227.             break;
  228.         }
  229.         return dst;
  230.     }
  231.     else
  232.     {
  233.         *i_dst_stride = i_src_stride;
  234.         return src1;
  235.     }
  236. }
  237. #define DO_PROCESS(a) 
  238.         src##a##v_16 = vec_u8_to_u16( src##a##v_8 ); 
  239.         src##a##v_16 = vec_mladd( coeff##a##v, src##a##v_16, zero_u16v ); 
  240.         dstv_16      = vec_add( dstv_16, src##a##v_16 )
  241. static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
  242.                                    uint8_t *src, int i_src_stride,
  243.                                    int mvx, int mvy,
  244.                                    int i_height )
  245. {
  246.     uint8_t *srcp;
  247.     int y;
  248.     int d8x = mvx & 0x07;
  249.     int d8y = mvy & 0x07;
  250.     DECLARE_ALIGNED_16( uint16_t coeff[4] );
  251.     coeff[0] = (8-d8x)*(8-d8y);
  252.     coeff[1] = d8x    *(8-d8y);
  253.     coeff[2] = (8-d8x)*d8y;
  254.     coeff[3] = d8x    *d8y;
  255.     src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
  256.     srcp  = &src[i_src_stride];
  257.     
  258.     LOAD_ZERO;
  259.     PREP_LOAD;
  260.     PREP_STORE4;
  261.     vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
  262.     vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
  263.     vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16;
  264.     vec_u8_t    dstv_8;
  265.     vec_u16_t   dstv_16;
  266.     vec_u8_t    permv;
  267.     vec_u16_t   shiftv;
  268.     vec_u16_t   k32v;
  269.     
  270.     coeff0v = vec_ld( 0, coeff );
  271.     coeff3v = vec_splat( coeff0v, 3 );
  272.     coeff2v = vec_splat( coeff0v, 2 );
  273.     coeff1v = vec_splat( coeff0v, 1 );
  274.     coeff0v = vec_splat( coeff0v, 0 );
  275.     k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
  276.     permv   = vec_lvsl( 0, (uint8_t *) 1 );
  277.     shiftv  = vec_splat_u16( 6 );
  278.     VEC_LOAD( src, src2v_8, 5, vec_u8_t );
  279.     src3v_8 = vec_perm( src2v_8, src2v_8, permv );
  280.     for( y = 0; y < i_height; y++ )
  281.     {
  282.         src0v_8 = src2v_8;
  283.         src1v_8 = src3v_8;
  284.         VEC_LOAD( srcp, src2v_8, 5, vec_u8_t );
  285.         src3v_8 = vec_perm( src2v_8, src2v_8, permv );
  286.         dstv_16 = k32v;
  287.         DO_PROCESS( 0 );
  288.         DO_PROCESS( 1 );
  289.         DO_PROCESS( 2 );
  290.         DO_PROCESS( 3 );
  291.         dstv_16 = vec_sr( dstv_16, shiftv );
  292.         dstv_8  = vec_u16_to_u8( dstv_16 );
  293.         VEC_STORE4( dstv_8, dst );
  294.         dst  += i_dst_stride;
  295.         srcp += i_src_stride;
  296.     }
  297. }
  298. static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
  299.                                    uint8_t *src, int i_src_stride,
  300.                                    int mvx, int mvy,
  301.                                    int i_height )
  302. {
  303.     uint8_t *srcp;
  304.     int y;
  305.     int d8x = mvx & 0x07;
  306.     int d8y = mvy & 0x07;
  307.     DECLARE_ALIGNED_16( uint16_t coeff[4] );
  308.     coeff[0] = (8-d8x)*(8-d8y);
  309.     coeff[1] = d8x    *(8-d8y);
  310.     coeff[2] = (8-d8x)*d8y;
  311.     coeff[3] = d8x    *d8y;
  312.     src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
  313.     srcp  = &src[i_src_stride];
  314.     
  315.     LOAD_ZERO;
  316.     PREP_LOAD;
  317.     PREP_STORE8;
  318.     vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
  319.     vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
  320.     vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16;
  321.     vec_u8_t    dstv_8;
  322.     vec_u16_t   dstv_16;
  323.     vec_u8_t    permv;
  324.     vec_u16_t   shiftv;
  325.     vec_u16_t   k32v;
  326.     
  327.     coeff0v = vec_ld( 0, coeff );
  328.     coeff3v = vec_splat( coeff0v, 3 );
  329.     coeff2v = vec_splat( coeff0v, 2 );
  330.     coeff1v = vec_splat( coeff0v, 1 );
  331.     coeff0v = vec_splat( coeff0v, 0 );
  332.     k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
  333.     permv   = vec_lvsl( 0, (uint8_t *) 1 );
  334.     shiftv  = vec_splat_u16( 6 );
  335.     VEC_LOAD( src, src2v_8, 9, vec_u8_t );
  336.     src3v_8 = vec_perm( src2v_8, src2v_8, permv );
  337.     for( y = 0; y < i_height; y++ )
  338.     {
  339.         src0v_8 = src2v_8;
  340.         src1v_8 = src3v_8;
  341.         VEC_LOAD( srcp, src2v_8, 9, vec_u8_t );
  342.         src3v_8 = vec_perm( src2v_8, src2v_8, permv );
  343.         dstv_16 = k32v;
  344.         DO_PROCESS( 0 );
  345.         DO_PROCESS( 1 );
  346.         DO_PROCESS( 2 );
  347.         DO_PROCESS( 3 );
  348.         dstv_16 = vec_sr( dstv_16, shiftv );
  349.         dstv_8  = vec_u16_to_u8( dstv_16 );
  350.         VEC_STORE8( dstv_8, dst );
  351.         dst  += i_dst_stride;
  352.         srcp += i_src_stride;
  353.     }
  354. }
  355. static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
  356.                                uint8_t *src, int i_src_stride,
  357.                                int mvx, int mvy,
  358.                                int i_width, int i_height )
  359. {
  360.     if( i_width == 8 )
  361.     {
  362.         mc_chroma_altivec_8xh( dst, i_dst_stride, src, i_src_stride,
  363.                                mvx, mvy, i_height );
  364.     }
  365.     else
  366.     {
  367.         mc_chroma_altivec_4xh( dst, i_dst_stride, src, i_src_stride,
  368.                                mvx, mvy, i_height );
  369.     }
  370. }
  371. #define HPEL_FILTER_1( t1v, t2v, t3v, t4v, t5v, t6v ) 
  372. {                                                     
  373.     t1v = vec_add( t1v, t6v );                        
  374.     t2v = vec_add( t2v, t5v );                        
  375.     t3v = vec_add( t3v, t4v );                        
  376.                                                       
  377.     t1v = vec_sub( t1v, t2v );   /* (a-b) */          
  378.     t2v = vec_sub( t2v, t3v );   /* (b-c) */          
  379.     t2v = vec_sl(  t2v, twov );  /* (b-c)*4 */        
  380.     t1v = vec_sub( t1v, t2v );   /* a-5*b+4*c */      
  381.     t3v = vec_sl(  t3v, fourv ); /* 16*c */           
  382.     t1v = vec_add( t1v, t3v );   /* a-5*b+20*c */     
  383. }
  384. #define HPEL_FILTER_2( t1v, t2v, t3v, t4v, t5v, t6v ) 
  385. {                                                     
  386.     t1v = vec_add( t1v, t6v );                        
  387.     t2v = vec_add( t2v, t5v );                        
  388.     t3v = vec_add( t3v, t4v );                        
  389.                                                       
  390.     t1v = vec_sub( t1v, t2v );  /* (a-b) */           
  391.     t1v = vec_sra( t1v, twov ); /* (a-b)/4 */         
  392.     t1v = vec_sub( t1v, t2v );  /* (a-b)/4-b */       
  393.     t1v = vec_add( t1v, t3v );  /* (a-b)/4-b+c */     
  394.     t1v = vec_sra( t1v, twov ); /* ((a-b)/4-b+c)/4 */ 
  395.     t1v = vec_add( t1v, t3v );  /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ 
  396. }
  397. #define HPEL_FILTER_HORIZONTAL()                            
  398. {                                                           
  399.     VEC_LOAD( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t ); 
  400.     VEC_LOAD( &src[x+14+i_stride*y], src6v, 16, vec_u8_t ); 
  401.                                                             
  402.     src2v = vec_sld( src1v, src6v,  1 );                    
  403.     src3v = vec_sld( src1v, src6v,  2 );                    
  404.     src4v = vec_sld( src1v, src6v,  3 );                    
  405.     src5v = vec_sld( src1v, src6v,  4 );                    
  406.     src6v = vec_sld( src1v, src6v,  5 );                    
  407.                                                             
  408.     temp1v = vec_u8_to_s16_h( src1v );                      
  409.     temp2v = vec_u8_to_s16_h( src2v );                      
  410.     temp3v = vec_u8_to_s16_h( src3v );                      
  411.     temp4v = vec_u8_to_s16_h( src4v );                      
  412.     temp5v = vec_u8_to_s16_h( src5v );                      
  413.     temp6v = vec_u8_to_s16_h( src6v );                      
  414.                                                             
  415.     HPEL_FILTER_1( temp1v, temp2v, temp3v,                  
  416.                    temp4v, temp5v, temp6v );                
  417.                                                             
  418.     dest1v = vec_add( temp1v, sixteenv );                   
  419.     dest1v = vec_sra( dest1v, fivev );                      
  420.                                                             
  421.     temp1v = vec_u8_to_s16_l( src1v );                      
  422.     temp2v = vec_u8_to_s16_l( src2v );                      
  423.     temp3v = vec_u8_to_s16_l( src3v );                      
  424.     temp4v = vec_u8_to_s16_l( src4v );                      
  425.     temp5v = vec_u8_to_s16_l( src5v );                      
  426.     temp6v = vec_u8_to_s16_l( src6v );                      
  427.                                                             
  428.     HPEL_FILTER_1( temp1v, temp2v, temp3v,                  
  429.                    temp4v, temp5v, temp6v );                
  430.                                                             
  431.     dest2v = vec_add( temp1v, sixteenv );                   
  432.     dest2v = vec_sra( dest2v, fivev );                      
  433.                                                             
  434.     destv = vec_packsu( dest1v, dest2v );                   
  435.                                                             
  436.     VEC_STORE16( destv, &dsth[x+i_stride*y] );              
  437. }
  438. #define HPEL_FILTER_VERTICAL()                               
  439. {                                                            
  440.     VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t ); 
  441.     VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t ); 
  442.     VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t ); 
  443.     VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t ); 
  444.     VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t ); 
  445.     VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t ); 
  446.                                                              
  447.     temp1v = vec_u8_to_s16_h( src1v );                       
  448.     temp2v = vec_u8_to_s16_h( src2v );                       
  449.     temp3v = vec_u8_to_s16_h( src3v );                       
  450.     temp4v = vec_u8_to_s16_h( src4v );                       
  451.     temp5v = vec_u8_to_s16_h( src5v );                       
  452.     temp6v = vec_u8_to_s16_h( src6v );                       
  453.                                                              
  454.     HPEL_FILTER_1( temp1v, temp2v, temp3v,                   
  455.                    temp4v, temp5v, temp6v );                 
  456.                                                              
  457.     dest1v = vec_add( temp1v, sixteenv );                    
  458.     dest1v = vec_sra( dest1v, fivev );                       
  459.                                                              
  460.     temp4v = vec_u8_to_s16_l( src1v );                       
  461.     temp5v = vec_u8_to_s16_l( src2v );                       
  462.     temp6v = vec_u8_to_s16_l( src3v );                       
  463.     temp7v = vec_u8_to_s16_l( src4v );                       
  464.     temp8v = vec_u8_to_s16_l( src5v );                       
  465.     temp9v = vec_u8_to_s16_l( src6v );                       
  466.                                                              
  467.     HPEL_FILTER_1( temp4v, temp5v, temp6v,                   
  468.                    temp7v, temp8v, temp9v );                 
  469.                                                              
  470.     dest2v = vec_add( temp4v, sixteenv );                    
  471.     dest2v = vec_sra( dest2v, fivev );                       
  472.                                                              
  473.     destv = vec_packsu( dest1v, dest2v );                    
  474.                                                              
  475.     VEC_STORE16( destv, &dstv[x+i_stride*y] );               
  476. }
  477. #define HPEL_FILTER_CENTRAL()                     
  478. {                                                 
  479.     temp1v = vec_sld( tempav, tempbv, 12 );       
  480.     temp2v = vec_sld( tempav, tempbv, 14 );       
  481.     temp3v = tempbv;                              
  482.     temp4v = vec_sld( tempbv, tempcv,  2 );       
  483.     temp5v = vec_sld( tempbv, tempcv,  4 );       
  484.     temp6v = vec_sld( tempbv, tempcv,  6 );       
  485.                                                   
  486.     HPEL_FILTER_2( temp1v, temp2v, temp3v,        
  487.                    temp4v, temp5v, temp6v );      
  488.                                                   
  489.     dest1v = vec_add( temp1v, thirtytwov );       
  490.     dest1v = vec_sra( dest1v, sixv );             
  491.                                                   
  492.     temp1v = vec_sld( tempbv, tempcv, 12 );       
  493.     temp2v = vec_sld( tempbv, tempcv, 14 );       
  494.     temp3v = tempcv;                              
  495.     temp4v = vec_sld( tempcv, tempdv,  2 );       
  496.     temp5v = vec_sld( tempcv, tempdv,  4 );       
  497.     temp6v = vec_sld( tempcv, tempdv,  6 );       
  498.                                                   
  499.     HPEL_FILTER_2( temp1v, temp2v, temp3v,        
  500.                    temp4v, temp5v, temp6v );      
  501.                                                   
  502.     dest2v = vec_add( temp1v, thirtytwov );       
  503.     dest2v = vec_sra( dest2v, sixv );             
  504.                                                   
  505.     destv = vec_packsu( dest1v, dest2v );         
  506.                                                   
  507.     VEC_STORE16( destv, &dstc[x-16+i_stride*y] ); 
  508. }
  509. void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
  510.                                int i_stride, int i_width, int i_height )
  511. {
  512.     int x, y;
  513.     vec_u8_t destv;
  514.     vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
  515.     vec_s16_t dest1v, dest2v;
  516.     vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
  517.     vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
  518.     PREP_LOAD;
  519.     PREP_STORE16;
  520.     LOAD_ZERO;
  521.     vec_u16_t twov, fourv, fivev, sixv;
  522.     vec_s16_t sixteenv, thirtytwov;
  523.     vect_ushort_u temp_u;
  524.     temp_u.s[0]=2;
  525.     twov = vec_splat( temp_u.v, 0 );
  526.     temp_u.s[0]=4;
  527.     fourv = vec_splat( temp_u.v, 0 );
  528.     temp_u.s[0]=5;
  529.     fivev = vec_splat( temp_u.v, 0 );
  530.     temp_u.s[0]=6;
  531.     sixv = vec_splat( temp_u.v, 0 );
  532.     temp_u.s[0]=16;
  533.     sixteenv = (vec_s16_t)vec_splat( temp_u.v, 0 );
  534.     temp_u.s[0]=32;
  535.     thirtytwov = (vec_s16_t)vec_splat( temp_u.v, 0 );
  536.     for( y = 0; y < i_height; y++ )
  537.     {
  538.         x = 0;
  539.         /* horizontal_filter */
  540.         HPEL_FILTER_HORIZONTAL();
  541.         /* vertical_filter */
  542.         HPEL_FILTER_VERTICAL();
  543.         /* central_filter */
  544.         tempav = tempcv;
  545.         tempbv = tempdv;
  546.         tempcv = vec_splat( temp1v, 0 ); /* first only */
  547.         tempdv = temp1v;
  548.         tempev = temp4v;
  549.         for( x = 16; x < i_width; x+=16 )
  550.         {
  551.             /* horizontal_filter */
  552.             HPEL_FILTER_HORIZONTAL();
  553.             /* vertical_filter */
  554.             HPEL_FILTER_VERTICAL();
  555.             /* central_filter */
  556.             tempav = tempcv;
  557.             tempbv = tempdv;
  558.             tempcv = tempev;
  559.             tempdv = temp1v;
  560.             tempev = temp4v;
  561.             HPEL_FILTER_CENTRAL();
  562.         }
  563.         /* Partial vertical filter */
  564.         VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t );
  565.         VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t );
  566.         VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t );
  567.         VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t );
  568.         VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t );
  569.         VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t );
  570.         temp1v = vec_u8_to_s16_h( src1v );
  571.         temp2v = vec_u8_to_s16_h( src2v );
  572.         temp3v = vec_u8_to_s16_h( src3v );
  573.         temp4v = vec_u8_to_s16_h( src4v );
  574.         temp5v = vec_u8_to_s16_h( src5v );
  575.         temp6v = vec_u8_to_s16_h( src6v );
  576.         HPEL_FILTER_1( temp1v, temp2v, temp3v,
  577.                       temp4v, temp5v, temp6v );
  578.         /* central_filter */
  579.         tempav = tempcv;
  580.         tempbv = tempdv;
  581.         tempcv = tempev;
  582.         tempdv = temp1v;
  583.         /* tempev is not used */
  584.         HPEL_FILTER_CENTRAL();
  585.     }
  586. }
  587. void x264_mc_altivec_init( x264_mc_functions_t *pf )
  588. {
  589.     pf->mc_luma   = mc_luma_altivec;
  590.     pf->get_ref   = get_ref_altivec;
  591.     pf->mc_chroma = mc_chroma_altivec;
  592.     pf->hpel_filter = x264_hpel_filter_altivec;
  593. }