i420_yuy2.c
上传用户:kjfoods
上传日期:2020-07-06
资源大小:29949k
文件大小:33k
源码类别:

midi

开发平台:

Unix_Linux

  1. /*****************************************************************************
  2.  * i420_yuy2.c : YUV to YUV conversion module for vlc
  3.  *****************************************************************************
  4.  * Copyright (C) 2000, 2001 the VideoLAN team
  5.  * $Id: 15f5ac2fee6d469c27339e27e161b761f1ba043c $
  6.  *
  7.  * Authors: Samuel Hocevar <sam@zoy.org>
  8.  *          Damien Fouilleul <damien@videolan.org>
  9.  *
  10.  * This program is free software; you can redistribute it and/or modify
  11.  * it under the terms of the GNU General Public License as published by
  12.  * the Free Software Foundation; either version 2 of the License, or
  13.  * (at your option) any later version.
  14.  *
  15.  * This program is distributed in the hope that it will be useful,
  16.  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17.  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18.  * GNU General Public License for more details.
  19.  *
  20.  * You should have received a copy of the GNU General Public License
  21.  * along with this program; if not, write to the Free Software
  22.  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23.  *****************************************************************************/
  24. /*****************************************************************************
  25.  * Preamble
  26.  *****************************************************************************/
  27. #ifdef HAVE_CONFIG_H
  28. # include "config.h"
  29. #endif
  30. #include <vlc_common.h>
  31. #include <vlc_plugin.h>
  32. #include <vlc_filter.h>
  33. #include <vlc_vout.h>
  34. #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
  35. #   include <altivec.h>
  36. #endif
  37. #include "i420_yuy2.h"
  /* Comma-separated FOURCC lists.  They are pasted into the module
   * description strings of the module descriptor below. */
  /* Source chromas named in the description. */
  38. #define SRC_FOURCC  "I420,IYUV,YV12"
  /* Destination chromas named in the description; the list depends on which
   * variant of the module (plain C, MMX, SSE2 or AltiVec) is being built.
   * Only the plain C variant lists Y211, and the AltiVec variant lists
   * neither IUYV nor cyuv. */
  39. #if defined (MODULE_NAME_IS_i420_yuy2)
  40. #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
  41. #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  42. #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  43. #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  44. #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  45. #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  46. #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
  47. #endif
  48. /*****************************************************************************
  49.  * Local and extern prototypes.
  50.  *****************************************************************************/
  51. static int  Activate ( vlc_object_t * );
  52. static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
  53. static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
  54. static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
  55. static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
  56. static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
  57. static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
  58. #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  59. static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
  60. static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
  61. static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
  62. static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
  63. #endif
  64. #if defined (MODULE_NAME_IS_i420_yuy2)
  65. static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
  66. static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
  67. #endif
  /* Bit masks compiled only into the MMX variant.  Nothing in this file
   * references them by name; presumably the MMX macros of i420_yuy2.h do --
   * confirm there before renaming or removing them. */
  68. #ifdef MODULE_NAME_IS_i420_yuy2_mmx
  69. /* MMX-specific 64-bit constants */
  /* 0x00ff in each of the four 16-bit words. */
  70. static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
  /* 0x80 in each of the four low bytes, zero in the four high bytes. */
  71. static const uint64_t i_80w   = 0x0000000080808080ULL;
  72. #endif
  73. /*****************************************************************************
  74.  * Module descriptor.
  75.  *****************************************************************************/
  76. vlc_module_begin ()
  77. #if defined (MODULE_NAME_IS_i420_yuy2)
  78.     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  79.     set_capability( "video filter2", 80 )
  80. #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  81.     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  82.     set_capability( "video filter2", 160 )
  83.     add_requirement( MMX )
  84. #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  85.     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  86.     set_capability( "video filter2", 250 )
  87.     add_requirement( SSE2 )
  88. #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  89.     set_description(
  90.             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  91.     set_capability( "video filter2", 250 )
  92.     add_requirement( ALTIVEC )
  93. #endif
  94.     set_callbacks( Activate, NULL )
  95. vlc_module_end ()
  96. /*****************************************************************************
  97.  * Activate: allocate a chroma function
  98.  *****************************************************************************
  99.  * This function allocates and initializes a chroma function
  100.  *****************************************************************************/
  101. static int Activate( vlc_object_t *p_this )
  102. {
  103.     filter_t *p_filter = (filter_t *)p_this;
  104.     if( p_filter->fmt_in.video.i_width & 1
  105.      || p_filter->fmt_in.video.i_height & 1 )
  106.     {
  107.         return -1;
  108.     }
  109.     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
  110.      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
  111.         return -1;
  112.     switch( p_filter->fmt_in.video.i_chroma )
  113.     {
  114.         case VLC_FOURCC('Y','V','1','2'):
  115.         case VLC_FOURCC('I','4','2','0'):
  116.         case VLC_FOURCC('I','Y','U','V'):
  117.             switch( p_filter->fmt_out.video.i_chroma )
  118.             {
  119.                 case VLC_FOURCC('Y','U','Y','2'):
  120.                 case VLC_FOURCC('Y','U','N','V'):
  121.                     p_filter->pf_video_filter = I420_YUY2_Filter;
  122.                     break;
  123.                 case VLC_FOURCC('Y','V','Y','U'):
  124.                     p_filter->pf_video_filter = I420_YVYU_Filter;
  125.                     break;
  126.                 case VLC_FOURCC('U','Y','V','Y'):
  127.                 case VLC_FOURCC('U','Y','N','V'):
  128.                 case VLC_FOURCC('Y','4','2','2'):
  129.                     p_filter->pf_video_filter = I420_UYVY_Filter;
  130.                     break;
  131. #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  132.                 case VLC_FOURCC('I','U','Y','V'):
  133.                     p_filter->pf_video_filter = I420_IUYV_Filter;
  134.                     break;
  135.                 case VLC_FOURCC('c','y','u','v'):
  136.                     p_filter->pf_video_filter = I420_cyuv_Filter;
  137.                     break;
  138. #endif
  139. #if defined (MODULE_NAME_IS_i420_yuy2)
  140.                 case VLC_FOURCC('Y','2','1','1'):
  141.                     p_filter->pf_video_filter = I420_Y211_Filter;
  142.                     break;
  143. #endif
  144.                 default:
  145.                     return -1;
  146.             }
  147.             break;
  148.         default:
  149.             return -1;
  150.     }
  151.     return 0;
  152. }
  #if 0
  /* Benchmarking helper, compiled out: returns the CPU time-stamp counter.
   *
   * RDTSC leaves the low 32 bits in EAX and the high 32 bits in EDX.  The two
   * halves are read through separate "=a" / "=d" outputs because the "=A"
   * constraint only denotes the EDX:EAX pair on 32-bit x86; on x86-64 it
   * selects a single register and the upper half of the counter is lost. */
  static inline unsigned long long read_cycles(void)
  {
      unsigned int lo, hi;
      __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
      return ( (unsigned long long)hi << 32 ) | lo;
  }
  #endif
  161. /* Following functions are local */
  /* One VIDEO_FILTER_WRAPPER() instantiation per converter.  The macro is
   * defined outside this file; presumably it emits the <name>_Filter()
   * function that is declared above and installed by Activate() -- confirm
   * in vlc_filter.h. */
  162. VIDEO_FILTER_WRAPPER( I420_YUY2 )
  163. VIDEO_FILTER_WRAPPER( I420_YVYU )
  164. VIDEO_FILTER_WRAPPER( I420_UYVY )
  /* Same build guards as the corresponding prototypes and Activate() cases. */
  165. #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  166. VIDEO_FILTER_WRAPPER( I420_IUYV )
  167. VIDEO_FILTER_WRAPPER( I420_cyuv )
  168. #endif
  169. #if defined (MODULE_NAME_IS_i420_yuy2)
  170. VIDEO_FILTER_WRAPPER( I420_Y211 )
  171. #endif
  172. /*****************************************************************************
  173.  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
  174.  *****************************************************************************/
  175. static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
  176.                                            picture_t *p_dest )
  177. {
  178.     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
  179.     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
  180.     uint8_t *p_u = p_source->U_PIXELS;
  181.     uint8_t *p_v = p_source->V_PIXELS;
  182.     int i_x, i_y;
  183. #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
  184. #define VEC_NEXT_LINES( ) 
  185.     p_line1  = p_line2; 
  186.     p_line2 += p_dest->p->i_pitch; 
  187.     p_y1     = p_y2; 
  188.     p_y2    += p_source->p[Y_PLANE].i_pitch;
  189. #define VEC_LOAD_UV( ) 
  190.     u_vec = vec_ld( 0, p_u ); p_u += 16; 
  191.     v_vec = vec_ld( 0, p_v ); p_v += 16;
  192. #define VEC_MERGE( a ) 
  193.     uv_vec = a( u_vec, v_vec ); 
  194.     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; 
  195.     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; 
  196.     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; 
  197.     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; 
  198.     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; 
  199.     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
  200.     vector unsigned char u_vec;
  201.     vector unsigned char v_vec;
  202.     vector unsigned char uv_vec;
  203.     vector unsigned char y_vec;
  204.     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
  205.            ( p_filter->fmt_in.video.i_height % 2 ) ) )
  206.     {
  207.         /* Width is a multiple of 32, we take 2 lines at a time */
  208.         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  209.         {
  210.             VEC_NEXT_LINES( );
  211.             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  212.             {
  213.                 VEC_LOAD_UV( );
  214.                 VEC_MERGE( vec_mergeh );
  215.                 VEC_MERGE( vec_mergel );
  216.             }
  217.         }
  218.     }
  219.     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
  220.                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
  221.     {
  222.         /* Width is only a multiple of 16, we take 4 lines at a time */
  223.         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
  224.         {
  225.             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
  226.             VEC_NEXT_LINES( );
  227.             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  228.             {
  229.                 VEC_LOAD_UV( );
  230.                 VEC_MERGE( vec_mergeh );
  231.                 VEC_MERGE( vec_mergel );
  232.             }
  233.             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
  234.             VEC_LOAD_UV( );
  235.             VEC_MERGE( vec_mergeh );
  236.             /* Line 3 and 4, pixels 0 to 16 */
  237.             VEC_NEXT_LINES( );
  238.             VEC_MERGE( vec_mergel );
  239.             /* Line 3 and 4, pixels 16 to ( width ) */
  240.             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  241.             {
  242.                 VEC_LOAD_UV( );
  243.                 VEC_MERGE( vec_mergeh );
  244.                 VEC_MERGE( vec_mergel );
  245.             }
  246.         }
  247.     }
  248.     else
  249.     {
  250.         /* Crap, use the C version */
  251. #undef VEC_NEXT_LINES
  252. #undef VEC_LOAD_UV
  253. #undef VEC_MERGE
  254. #endif
  255.     const int i_source_margin = p_source->p[0].i_pitch
  256.                                  - p_source->p[0].i_visible_pitch;
  257.     const int i_source_margin_c = p_source->p[1].i_pitch
  258.                                  - p_source->p[1].i_visible_pitch;
  259.     const int i_dest_margin = p_dest->p->i_pitch
  260.                                - p_dest->p->i_visible_pitch;
  261. #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
  262.     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  263.     {
  264.         p_line1 = p_line2;
  265.         p_line2 += p_dest->p->i_pitch;
  266.         p_y1 = p_y2;
  267.         p_y2 += p_source->p[Y_PLANE].i_pitch;
  268. #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
  269.         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
  270.         {
  271.             C_YUV420_YUYV( );
  272.             C_YUV420_YUYV( );
  273.             C_YUV420_YUYV( );
  274.             C_YUV420_YUYV( );
  275.         }
  276. #else
  277.         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
  278.         {
  279.             MMX_CALL( MMX_YUV420_YUYV );
  280.         }
  281. #endif
  282.         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
  283.         {
  284.             C_YUV420_YUYV( );
  285.         }
  286.         p_y1 += i_source_margin;
  287.         p_y2 += i_source_margin;
  288.         p_u += i_source_margin_c;
  289.         p_v += i_source_margin_c;
  290.         p_line1 += i_dest_margin;
  291.         p_line2 += i_dest_margin;
  292.     }
  293. #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
  294.     /* re-enable FPU registers */
  295.     MMX_END;
  296. #endif
  297. #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
  298.     }
  299. #endif
  300. #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  301.     /*
  302.     ** SSE2 128 bits fetch/store instructions are faster
  303.     ** if memory access is 16 bytes aligned
  304.     */
  305.     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
  306.         ((intptr_t)p_line2|(intptr_t)p_y2))) )
  307.     {
  308.         /* use faster SSE2 aligned fetch and store */
  309.         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  310.         {
  311.             p_line1 = p_line2;
  312.             p_line2 += p_dest->p->i_pitch;
  313.             p_y1 = p_y2;
  314.             p_y2 += p_source->p[Y_PLANE].i_pitch;
  315.             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  316.             {
  317.                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
  318.             }
  319.             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  320.             {
  321.                 C_YUV420_YUYV( );
  322.             }
  323.             p_y1 += i_source_margin;
  324.             p_y2 += i_source_margin;
  325.             p_u += i_source_margin_c;
  326.             p_v += i_source_margin_c;
  327.             p_line1 += i_dest_margin;
  328.             p_line2 += i_dest_margin;
  329.         }
  330.     }
  331.     else
  332.     {
  333.         /* use slower SSE2 unaligned fetch and store */
  334.         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  335.         {
  336.             p_line1 = p_line2;
  337.             p_line2 += p_dest->p->i_pitch;
  338.             p_y1 = p_y2;
  339.             p_y2 += p_source->p[Y_PLANE].i_pitch;
  340.             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  341.             {
  342.                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
  343.             }
  344.             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  345.             {
  346.                 C_YUV420_YUYV( );
  347.             }
  348.             p_y1 += i_source_margin;
  349.             p_y2 += i_source_margin;
  350.             p_u += i_source_margin_c;
  351.             p_v += i_source_margin_c;
  352.             p_line1 += i_dest_margin;
  353.             p_line2 += i_dest_margin;
  354.         }
  355.     }
  356.     /* make sure all SSE2 stores are visible thereafter */
  357.     SSE2_END;
  358. #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  359. }
  360. /*****************************************************************************
  361.  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
  362.  *****************************************************************************/
  363. static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
  364.                                            picture_t *p_dest )
  365. {
  366.     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
  367.     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
  368.     uint8_t *p_u = p_source->U_PIXELS;
  369.     uint8_t *p_v = p_source->V_PIXELS;
  370.     int i_x, i_y;
  371. #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
  372. #define VEC_NEXT_LINES( ) 
  373.     p_line1  = p_line2; 
  374.     p_line2 += p_dest->p->i_pitch; 
  375.     p_y1     = p_y2; 
  376.     p_y2    += p_source->p[Y_PLANE].i_pitch;
  377. #define VEC_LOAD_UV( ) 
  378.     u_vec = vec_ld( 0, p_u ); p_u += 16; 
  379.     v_vec = vec_ld( 0, p_v ); p_v += 16;
  380. #define VEC_MERGE( a ) 
  381.     vu_vec = a( v_vec, u_vec ); 
  382.     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; 
  383.     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; 
  384.     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; 
  385.     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; 
  386.     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; 
  387.     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
  388.     vector unsigned char u_vec;
  389.     vector unsigned char v_vec;
  390.     vector unsigned char vu_vec;
  391.     vector unsigned char y_vec;
  392.     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
  393.            ( p_filter->fmt_in.video.i_height % 2 ) ) )
  394.     {
  395.         /* Width is a multiple of 32, we take 2 lines at a time */
  396.         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  397.         {
  398.             VEC_NEXT_LINES( );
  399.             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  400.             {
  401.                 VEC_LOAD_UV( );
  402.                 VEC_MERGE( vec_mergeh );
  403.                 VEC_MERGE( vec_mergel );
  404.             }
  405.         }
  406.     }
  407.     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
  408.                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
  409.     {
  410.         /* Width is only a multiple of 16, we take 4 lines at a time */
  411.         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
  412.         {
  413.             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
  414.             VEC_NEXT_LINES( );
  415.             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  416.             {
  417.                 VEC_LOAD_UV( );
  418.                 VEC_MERGE( vec_mergeh );
  419.                 VEC_MERGE( vec_mergel );
  420.             }
  421.             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
  422.             VEC_LOAD_UV( );
  423.             VEC_MERGE( vec_mergeh );
  424.             /* Line 3 and 4, pixels 0 to 16 */
  425.             VEC_NEXT_LINES( );
  426.             VEC_MERGE( vec_mergel );
  427.             /* Line 3 and 4, pixels 16 to ( width ) */
  428.             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  429.             {
  430.                 VEC_LOAD_UV( );
  431.                 VEC_MERGE( vec_mergeh );
  432.                 VEC_MERGE( vec_mergel );
  433.             }
  434.         }
  435.     }
  436.     else
  437.     {
  438.         /* Crap, use the C version */
  439. #undef VEC_NEXT_LINES
  440. #undef VEC_LOAD_UV
  441. #undef VEC_MERGE
  442. #endif
  443.     const int i_source_margin = p_source->p[0].i_pitch
  444.                                  - p_source->p[0].i_visible_pitch;
  445.     const int i_source_margin_c = p_source->p[1].i_pitch
  446.                                  - p_source->p[1].i_visible_pitch;
  447.     const int i_dest_margin = p_dest->p->i_pitch
  448.                                - p_dest->p->i_visible_pitch;
  449. #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
  450.     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  451.     {
  452.         p_line1 = p_line2;
  453.         p_line2 += p_dest->p->i_pitch;
  454.         p_y1 = p_y2;
  455.         p_y2 += p_source->p[Y_PLANE].i_pitch;
  456.         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
  457.         {
  458. #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
  459.             C_YUV420_YVYU( );
  460.             C_YUV420_YVYU( );
  461.             C_YUV420_YVYU( );
  462.             C_YUV420_YVYU( );
  463. #else
  464.             MMX_CALL( MMX_YUV420_YVYU );
  465. #endif
  466.         }
  467.         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
  468.         {
  469.             C_YUV420_YVYU( );
  470.         }
  471.         p_y1 += i_source_margin;
  472.         p_y2 += i_source_margin;
  473.         p_u += i_source_margin_c;
  474.         p_v += i_source_margin_c;
  475.         p_line1 += i_dest_margin;
  476.         p_line2 += i_dest_margin;
  477.     }
  478. #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
  479.     /* re-enable FPU registers */
  480.     MMX_END;
  481. #endif
  482. #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
  483.     }
  484. #endif
  485. #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  486.     /*
  487.     ** SSE2 128 bits fetch/store instructions are faster
  488.     ** if memory access is 16 bytes aligned
  489.     */
  490.     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
  491.         ((intptr_t)p_line2|(intptr_t)p_y2))) )
  492.     {
  493.         /* use faster SSE2 aligned fetch and store */
  494.         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  495.         {
  496.             p_line1 = p_line2;
  497.             p_line2 += p_dest->p->i_pitch;
  498.             p_y1 = p_y2;
  499.             p_y2 += p_source->p[Y_PLANE].i_pitch;
  500.             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  501.             {
  502.                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
  503.             }
  504.             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  505.             {
  506.                 C_YUV420_YVYU( );
  507.             }
  508.             p_y1 += i_source_margin;
  509.             p_y2 += i_source_margin;
  510.             p_u += i_source_margin_c;
  511.             p_v += i_source_margin_c;
  512.             p_line1 += i_dest_margin;
  513.             p_line2 += i_dest_margin;
  514.         }
  515.     }
  516.     else
  517.     {
  518.         /* use slower SSE2 unaligned fetch and store */
  519.         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  520.         {
  521.             p_line1 = p_line2;
  522.             p_line2 += p_dest->p->i_pitch;
  523.             p_y1 = p_y2;
  524.             p_y2 += p_source->p[Y_PLANE].i_pitch;
  525.             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  526.             {
  527.                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
  528.             }
  529.             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  530.             {
  531.                 C_YUV420_YVYU( );
  532.             }
  533.             p_y1 += i_source_margin;
  534.             p_y2 += i_source_margin;
  535.             p_u += i_source_margin_c;
  536.             p_v += i_source_margin_c;
  537.             p_line1 += i_dest_margin;
  538.             p_line2 += i_dest_margin;
  539.         }
  540.     }
  541.     /* make sure all SSE2 stores are visible thereafter */
  542.     SSE2_END;
  543. #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  544. }
  545. /*****************************************************************************
  546.  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
  547.  *****************************************************************************/
  548. static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
  549.                                            picture_t *p_dest )
  550. {
  551.     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
  552.     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
  553.     uint8_t *p_u = p_source->U_PIXELS;
  554.     uint8_t *p_v = p_source->V_PIXELS;
  555.     int i_x, i_y;
  556. #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
  557. #define VEC_NEXT_LINES( ) 
  558.     p_line1  = p_line2; 
  559.     p_line2 += p_dest->p->i_pitch; 
  560.     p_y1     = p_y2; 
  561.     p_y2    += p_source->p[Y_PLANE].i_pitch;
  562. #define VEC_LOAD_UV( ) 
  563.     u_vec = vec_ld( 0, p_u ); p_u += 16; 
  564.     v_vec = vec_ld( 0, p_v ); p_v += 16;
  565. #define VEC_MERGE( a ) 
  566.     uv_vec = a( u_vec, v_vec ); 
  567.     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; 
  568.     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; 
  569.     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; 
  570.     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; 
  571.     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; 
  572.     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
  573.     vector unsigned char u_vec;
  574.     vector unsigned char v_vec;
  575.     vector unsigned char uv_vec;
  576.     vector unsigned char y_vec;
  577.     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
  578.            ( p_filter->fmt_in.video.i_height % 2 ) ) )
  579.     {
  580.         /* Width is a multiple of 32, we take 2 lines at a time */
  581.         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  582.         {
  583.             VEC_NEXT_LINES( );
  584.             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  585.             {
  586.                 VEC_LOAD_UV( );
  587.                 VEC_MERGE( vec_mergeh );
  588.                 VEC_MERGE( vec_mergel );
  589.             }
  590.         }
  591.     }
  592.     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
  593.                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
  594.     {
  595.         /* Width is only a multiple of 16, we take 4 lines at a time */
  596.         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
  597.         {
  598.             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
  599.             VEC_NEXT_LINES( );
  600.             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  601.             {
  602.                 VEC_LOAD_UV( );
  603.                 VEC_MERGE( vec_mergeh );
  604.                 VEC_MERGE( vec_mergel );
  605.             }
  606.             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
  607.             VEC_LOAD_UV( );
  608.             VEC_MERGE( vec_mergeh );
  609.             /* Line 3 and 4, pixels 0 to 16 */
  610.             VEC_NEXT_LINES( );
  611.             VEC_MERGE( vec_mergel );
  612.             /* Line 3 and 4, pixels 16 to ( width ) */
  613.             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  614.             {
  615.                 VEC_LOAD_UV( );
  616.                 VEC_MERGE( vec_mergeh );
  617.                 VEC_MERGE( vec_mergel );
  618.             }
  619.         }
  620.     }
  621.     else
  622.     {
  623.         /* Crap, use the C version */
  624. #undef VEC_NEXT_LINES
  625. #undef VEC_LOAD_UV
  626. #undef VEC_MERGE
  627. #endif
  628.     const int i_source_margin = p_source->p[0].i_pitch
  629.                                  - p_source->p[0].i_visible_pitch;
  630.     const int i_source_margin_c = p_source->p[1].i_pitch
  631.                                  - p_source->p[1].i_visible_pitch;
  632.     const int i_dest_margin = p_dest->p->i_pitch
  633.                                - p_dest->p->i_visible_pitch;
  634. #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
  635.     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  636.     {
  637.         p_line1 = p_line2;
  638.         p_line2 += p_dest->p->i_pitch;
  639.         p_y1 = p_y2;
  640.         p_y2 += p_source->p[Y_PLANE].i_pitch;
  641.         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
  642.         {
  643. #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
  644.             C_YUV420_UYVY( );
  645.             C_YUV420_UYVY( );
  646.             C_YUV420_UYVY( );
  647.             C_YUV420_UYVY( );
  648. #else
  649.             MMX_CALL( MMX_YUV420_UYVY );
  650. #endif
  651.         }
  652.         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
  653.         {
  654.             C_YUV420_UYVY( );
  655.         }
  656.         p_y1 += i_source_margin;
  657.         p_y2 += i_source_margin;
  658.         p_u += i_source_margin_c;
  659.         p_v += i_source_margin_c;
  660.         p_line1 += i_dest_margin;
  661.         p_line2 += i_dest_margin;
  662.     }
  663. #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
  664.     /* re-enable FPU registers */
  665.     MMX_END;
  666. #endif
  667. #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
  668.     }
  669. #endif
  670. #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  671.     /*
  672.     ** SSE2 128 bits fetch/store instructions are faster
  673.     ** if memory access is 16 bytes aligned
  674.     */
  675.     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
  676.         ((intptr_t)p_line2|(intptr_t)p_y2))) )
  677.     {
  678.         /* use faster SSE2 aligned fetch and store */
  679.         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  680.         {
  681.             p_line1 = p_line2;
  682.             p_line2 += p_dest->p->i_pitch;
  683.             p_y1 = p_y2;
  684.             p_y2 += p_source->p[Y_PLANE].i_pitch;
  685.             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  686.             {
  687.                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
  688.             }
  689.             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  690.             {
  691.                 C_YUV420_UYVY( );
  692.             }
  693.             p_y1 += i_source_margin;
  694.             p_y2 += i_source_margin;
  695.             p_u += i_source_margin_c;
  696.             p_v += i_source_margin_c;
  697.             p_line1 += i_dest_margin;
  698.             p_line2 += i_dest_margin;
  699.         }
  700.     }
  701.     else
  702.     {
  703.         /* use slower SSE2 unaligned fetch and store */
  704.         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  705.         {
  706.             p_line1 = p_line2;
  707.             p_line2 += p_dest->p->i_pitch;
  708.             p_y1 = p_y2;
  709.             p_y2 += p_source->p[Y_PLANE].i_pitch;
  710.             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  711.             {
  712.                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
  713.             }
  714.             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  715.             {
  716.                 C_YUV420_UYVY( );
  717.             }
  718.             p_y1 += i_source_margin;
  719.             p_y2 += i_source_margin;
  720.             p_u += i_source_margin_c;
  721.             p_v += i_source_margin_c;
  722.             p_line1 += i_dest_margin;
  723.             p_line2 += i_dest_margin;
  724.         }
  725.     }
  726.     /* make sure all SSE2 stores are visible thereafter */
  727.     SSE2_END;
  728. #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  729. }
  730. #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  731. /*****************************************************************************
  732.  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
  733.  *****************************************************************************/
  734. static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
  735.                                            picture_t *p_dest )
  736. {
  737.     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
  738.     /* FIXME: TODO ! */
  739.     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
  740. }
  741. /*****************************************************************************
  742.  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
  743.  *****************************************************************************/
  744. static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
  745.                                            picture_t *p_dest )
  746. {
  747.     uint8_t *p_line1 = p_dest->p->p_pixels +
  748.                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
  749.                        + p_dest->p->i_pitch;
  750.     uint8_t *p_line2 = p_dest->p->p_pixels +
  751.                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
  752.     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
  753.     uint8_t *p_u = p_source->U_PIXELS;
  754.     uint8_t *p_v = p_source->V_PIXELS;
  755.     int i_x, i_y;
  756.     const int i_source_margin = p_source->p[0].i_pitch
  757.                                  - p_source->p[0].i_visible_pitch;
  758.     const int i_source_margin_c = p_source->p[1].i_pitch
  759.                                  - p_source->p[1].i_visible_pitch;
  760.     const int i_dest_margin = p_dest->p->i_pitch
  761.                                - p_dest->p->i_visible_pitch;
  762. #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
  763.     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  764.     {
  765.         p_line1 -= 3 * p_dest->p->i_pitch;
  766.         p_line2 -= 3 * p_dest->p->i_pitch;
  767.         p_y1 = p_y2;
  768.         p_y2 += p_source->p[Y_PLANE].i_pitch;
  769.         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
  770.         {
  771. #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
  772.             C_YUV420_UYVY( );
  773.             C_YUV420_UYVY( );
  774.             C_YUV420_UYVY( );
  775.             C_YUV420_UYVY( );
  776. #else
  777.             MMX_CALL( MMX_YUV420_UYVY );
  778. #endif
  779.         }
  780.         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
  781.         {
  782.             C_YUV420_UYVY( );
  783.         }
  784.         p_y1 += i_source_margin;
  785.         p_y2 += i_source_margin;
  786.         p_u += i_source_margin_c;
  787.         p_v += i_source_margin_c;
  788.         p_line1 += i_dest_margin;
  789.         p_line2 += i_dest_margin;
  790.     }
  791. #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
  792.     /* re-enable FPU registers */
  793.     MMX_END;
  794. #endif
  795. #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  796.     /*
  797.     ** SSE2 128 bits fetch/store instructions are faster
  798.     ** if memory access is 16 bytes aligned
  799.     */
  800.     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
  801.         ((intptr_t)p_line2|(intptr_t)p_y2))) )
  802.     {
  803.         /* use faster SSE2 aligned fetch and store */
  804.         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  805.         {
  806.             p_line1 = p_line2;
  807.             p_line2 += p_dest->p->i_pitch;
  808.             p_y1 = p_y2;
  809.             p_y2 += p_source->p[Y_PLANE].i_pitch;
  810.             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  811.             {
  812.                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
  813.             }
  814.             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  815.             {
  816.                 C_YUV420_UYVY( );
  817.             }
  818.             p_y1 += i_source_margin;
  819.             p_y2 += i_source_margin;
  820.             p_u += i_source_margin_c;
  821.             p_v += i_source_margin_c;
  822.             p_line1 += i_dest_margin;
  823.             p_line2 += i_dest_margin;
  824.         }
  825.     }
  826.     else
  827.     {
  828.         /* use slower SSE2 unaligned fetch and store */
  829.         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  830.         {
  831.             p_line1 = p_line2;
  832.             p_line2 += p_dest->p->i_pitch;
  833.             p_y1 = p_y2;
  834.             p_y2 += p_source->p[Y_PLANE].i_pitch;
  835.             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  836.             {
  837.                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
  838.             }
  839.             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  840.             {
  841.                 C_YUV420_UYVY( );
  842.             }
  843.             p_y1 += i_source_margin;
  844.             p_y2 += i_source_margin;
  845.             p_u += i_source_margin_c;
  846.             p_v += i_source_margin_c;
  847.             p_line1 += i_dest_margin;
  848.             p_line2 += i_dest_margin;
  849.         }
  850.     }
  851.     /* make sure all SSE2 stores are visible thereafter */
  852.     SSE2_END;
  853. #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  854. }
  855. #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  856. /*****************************************************************************
  857.  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
  858.  *****************************************************************************/
  859. #if defined (MODULE_NAME_IS_i420_yuy2)
  860. static void I420_Y211( filter_t *p_filter, picture_t *p_source,
  861.                                            picture_t *p_dest )
  862. {
  863.     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
  864.     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
  865.     uint8_t *p_u = p_source->U_PIXELS;
  866.     uint8_t *p_v = p_source->V_PIXELS;
  867.     int i_x, i_y;
  868.     const int i_source_margin = p_source->p[0].i_pitch
  869.                                  - p_source->p[0].i_visible_pitch;
  870.     const int i_source_margin_c = p_source->p[1].i_pitch
  871.                                  - p_source->p[1].i_visible_pitch;
  872.     const int i_dest_margin = p_dest->p->i_pitch
  873.                                - p_dest->p->i_visible_pitch;
  874.     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  875.     {
  876.         p_line1 = p_line2;
  877.         p_line2 += p_dest->p->i_pitch;
  878.         p_y1 = p_y2;
  879.         p_y2 += p_source->p[Y_PLANE].i_pitch;
  880.         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
  881.         {
  882.             C_YUV420_Y211( );
  883.             C_YUV420_Y211( );
  884.         }
  885.         p_y1 += i_source_margin;
  886.         p_y2 += i_source_margin;
  887.         p_u += i_source_margin_c;
  888.         p_v += i_source_margin_c;
  889.         p_line1 += i_dest_margin;
  890.         p_line2 += i_dest_margin;
  891.     }
  892. }
  893. #endif