i420_yuy2.h
Uploaded by: kjfoods
Upload date: 2020-07-06
Archive size: 29949k
File size: 25k
Source category: midi
Development platform: Unix_Linux

/*****************************************************************************
 * i420_yuy2.h : YUV to YUV conversion module for vlc
 *****************************************************************************
 * Copyright (C) 2000, 2001 the VideoLAN team
 * $Id: e6994428ce16dae462487ebd6e6524e64c627f48 $
 *
 * Authors: Samuel Hocevar <sam@zoy.org>
 *          Damien Fouilleul <damien@videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/
#ifdef MODULE_NAME_IS_i420_yuy2_mmx
#if defined(CAN_COMPILE_MMX)

/* MMX assembly */

#define MMX_CALL(MMX_INSTRUCTIONS)          \
    do {                                    \
    __asm__ __volatile__(                   \
        ".p2align 3 \n\t                    \
movd       (%0), %%mm1  # Load 4 Cb           00 00 00 00 u3 u2 u1 u0     \n\
movd       (%1), %%mm2  # Load 4 Cr           00 00 00 00 v3 v2 v1 v0     \n\
movq       (%2), %%mm0  # Load 8 Y            y7 y6 y5 y4 y3 y2 y1 y0     \n\
movq       (%3), %%mm3  # Load 8 Y            Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
"                                           \
        :                                   \
        : "r" (p_u), "r" (p_v),             \
          "r" (p_y1), "r" (p_y2) );         \
    __asm__ __volatile__(                   \
        ".p2align 3 \n\t"                   \
        MMX_INSTRUCTIONS                    \
        :                                   \
        : "r" (p_line1), "r" (p_line2) );   \
        p_line1 += 16; p_line2 += 16;       \
        p_y1 += 8; p_y2 += 8;               \
        p_u += 4; p_v += 4;                 \
    } while(0)

#define MMX_END __asm__ __volatile__ ( "emms" )
#define MMX_YUV420_YUYV "                                                 \n\
punpcklbw %%mm2, %%mm1  #                     v3 u3 v2 u2 v1 u1 v0 u0     \n\
movq      %%mm0, %%mm2  #                     y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%mm1, %%mm2  #                     v1 y3 u1 y2 v0 y1 u0 y0     \n\
movq      %%mm2, (%0)   # Store low YUYV                                  \n\
punpckhbw %%mm1, %%mm0  #                     v3 y7 u3 y6 v2 y5 u2 y4     \n\
movq      %%mm0, 8(%0)  # Store high YUYV                                 \n\
movq      %%mm3, %%mm4  #                     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%mm1, %%mm4  #                     v1 Y3 u1 Y2 v0 Y1 u0 Y0     \n\
movq      %%mm4, (%1)   # Store low YUYV                                  \n\
punpckhbw %%mm1, %%mm3  #                     v3 Y7 u3 Y6 v2 Y5 u2 Y4     \n\
movq      %%mm3, 8(%1)  # Store high YUYV                                 \n\
"

#define MMX_YUV420_YVYU "                                                 \n\
punpcklbw %%mm1, %%mm2  #                     u3 v3 u2 v2 u1 v1 u0 v0     \n\
movq      %%mm0, %%mm1  #                     y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%mm2, %%mm1  #                     u1 y3 v1 y2 u0 y1 v0 y0     \n\
movq      %%mm1, (%0)   # Store low YUYV                                  \n\
punpckhbw %%mm2, %%mm0  #                     u3 y7 v3 y6 u2 y5 v2 y4     \n\
movq      %%mm0, 8(%0)  # Store high YUYV                                 \n\
movq      %%mm3, %%mm4  #                     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%mm2, %%mm4  #                     u1 Y3 v1 Y2 u0 Y1 v0 Y0     \n\
movq      %%mm4, (%1)   # Store low YUYV                                  \n\
punpckhbw %%mm2, %%mm3  #                     u3 Y7 v3 Y6 u2 Y5 v2 Y4     \n\
movq      %%mm3, 8(%1)  # Store high YUYV                                 \n\
"

#define MMX_YUV420_UYVY "                                                 \n\
punpcklbw %%mm2, %%mm1  #                     v3 u3 v2 u2 v1 u1 v0 u0     \n\
movq      %%mm1, %%mm2  #                     v3 u3 v2 u2 v1 u1 v0 u0     \n\
punpcklbw %%mm0, %%mm2  #                     y3 v1 y2 u1 y1 v0 y0 u0     \n\
movq      %%mm2, (%0)   # Store low UYVY                                  \n\
movq      %%mm1, %%mm2  #                     u3 v3 u2 v2 u1 v1 u0 v0     \n\
punpckhbw %%mm0, %%mm2  #                     y3 v1 y2 u1 y1 v0 y0 u0     \n\
movq      %%mm2, 8(%0)  # Store high UYVY                                 \n\
movq      %%mm1, %%mm4  #                     u3 v3 u2 v2 u1 v1 u0 v0     \n\
punpcklbw %%mm3, %%mm4  #                     Y3 v1 Y2 u1 Y1 v0 Y0 u0     \n\
movq      %%mm4, (%1)   # Store low UYVY                                  \n\
punpckhbw %%mm3, %%mm1  #                     Y7 v3 Y6 u3 Y5 v2 Y4 u2     \n\
movq      %%mm1, 8(%1)  # Store high UYVY                                 \n\
"

/* FIXME: this code does not work ! Chroma seems to be wrong. */
#define MMX_YUV420_Y211 "                                                 \n\
movd       (%4), %%mm2  # Load 4 Cb           00 00 00 00 u3 u2 u1 u0     \n\
movd       (%5), %%mm3  # Load 4 Cr           00 00 00 00 v3 v2 v1 v0     \n\
pand    i_00ffw, %%mm0  # get Y even          00 Y6 00 Y4 00 Y2 00 Y0     \n\
packuswb  %%mm0, %%mm0  # pack Y              y6 y4 y2 y0 y6 y4 y2 y0     \n\
pand    i_00ffw, %%mm2  # get U even          00 u6 00 u4 00 u2 00 u0     \n\
packuswb  %%mm2, %%mm2  # pack U              00 00 u2 u0 00 00 u2 u0     \n\
pand    i_00ffw, %%mm3  # get V even          00 v6 00 v4 00 v2 00 v0     \n\
packuswb  %%mm3, %%mm3  # pack V              00 00 v2 v0 00 00 v2 v0     \n\
punpcklbw %%mm3, %%mm2  #                     00 00 00 00 v2 u2 v0 u0     \n\
psubsw    i_80w, %%mm2  # U,V -= 128                                      \n\
punpcklbw %%mm2, %%mm0  #                     v2 y6 u2 y4 v0 y2 u0 y0     \n\
movq      %%mm0, (%0)   # Store YUYV                                      \n\
pand    i_00ffw, %%mm1  # get Y even          00 Y6 00 Y4 00 Y2 00 Y0     \n\
packuswb  %%mm1, %%mm1  # pack Y              Y6 Y4 Y2 Y0 Y6 Y4 Y2 Y0     \n\
punpcklbw %%mm2, %%mm1  #                     v2 Y6 u2 Y4 v0 Y2 u0 Y0     \n\
movq      %%mm1, (%1)   # Store YUYV                                      \n\
"
#elif defined(HAVE_MMX_INTRINSICS)

/* MMX intrinsics */

#include <mmintrin.h>

#define MMX_CALL(MMX_INSTRUCTIONS)          \
    do {                                    \
        __m64 mm0, mm1, mm2, mm3, mm4;      \
        MMX_INSTRUCTIONS                    \
        p_line1 += 16; p_line2 += 16;       \
        p_y1 += 8; p_y2 += 8;               \
        p_u += 4; p_v += 4;                 \
    } while(0)

#define MMX_END _mm_empty()

#define MMX_YUV420_YUYV                     \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm0 = (__m64)*(uint64_t*)p_y1;          \
    mm3 = (__m64)*(uint64_t*)p_y2;          \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm0;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);       \
    *(uint64_t*)p_line1 = (uint64_t)mm2;    \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       \
    *(uint64_t*)(p_line1+8) = (uint64_t)mm0;\
    mm4 = mm3;                              \
    mm4 = _mm_unpacklo_pi8(mm4, mm1);       \
    *(uint64_t*)p_line2 = (uint64_t)mm4;    \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);       \
    *(uint64_t*)(p_line2+8) = (uint64_t)mm3;

#define MMX_YUV420_YVYU                     \
    mm2 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm1 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm0 = (__m64)*(uint64_t*)p_y1;          \
    mm3 = (__m64)*(uint64_t*)p_y2;          \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm0;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);       \
    *(uint64_t*)p_line1 = (uint64_t)mm2;    \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       \
    *(uint64_t*)(p_line1+8) = (uint64_t)mm0;\
    mm4 = mm3;                              \
    mm4 = _mm_unpacklo_pi8(mm4, mm1);       \
    *(uint64_t*)p_line2 = (uint64_t)mm4;    \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);       \
    *(uint64_t*)(p_line2+8) = (uint64_t)mm3;

#define MMX_YUV420_UYVY                     \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm0 = (__m64)*(uint64_t*)p_y1;          \
    mm3 = (__m64)*(uint64_t*)p_y2;          \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm1;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm0);       \
    *(uint64_t*)p_line1 = (uint64_t)mm2;    \
    mm2 = mm1;                              \
    mm2 = _mm_unpackhi_pi8(mm2, mm0);       \
    *(uint64_t*)(p_line1+8) = (uint64_t)mm2;\
    mm4 = mm1;                              \
    mm4 = _mm_unpacklo_pi8(mm4, mm3);       \
    *(uint64_t*)p_line2 = (uint64_t)mm4;    \
    mm1 = _mm_unpackhi_pi8(mm1, mm3);       \
    *(uint64_t*)(p_line2+8) = (uint64_t)mm1;

#endif
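
/*
 * Usage sketch (editor's illustration, not part of the original header):
 * the conversion functions in i420_yuy2.c are expected to drive MMX_CALL
 * with one of the instruction macros above, two output lines at a time,
 * then flush the MMX state with MMX_END.  The function name and the
 * i_width parameter are hypothetical; the pointer names match the ones
 * hard-coded into MMX_CALL.
 */
#if defined(CAN_COMPILE_MMX) || defined(HAVE_MMX_INTRINSICS)
#include <stdint.h>
static inline void i420_yuy2_mmx_sketch( uint8_t *p_line1, uint8_t *p_line2,
                                         uint8_t *p_y1, uint8_t *p_y2,
                                         uint8_t *p_u, uint8_t *p_v,
                                         int i_width )
{
    int i_x;
    /* Each MMX_CALL consumes 8 luma samples per line plus 4 Cb/Cr samples,
     * and writes 16 bytes of packed YUYV on each of the two lines. */
    for( i_x = i_width / 8; i_x-- ; )
    {
        MMX_CALL( MMX_YUV420_YUYV );
    }
    MMX_END;
}
#endif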
#elif defined( MODULE_NAME_IS_i420_yuy2_sse2 )
#if defined(CAN_COMPILE_SSE2)

/* SSE2 assembly */

#define SSE2_CALL(SSE2_INSTRUCTIONS)    \
    do {                                \
    __asm__ __volatile__(               \
        ".p2align 3 \n\t                \
movq        (%0), %%xmm1  # Load 8 Cb         u7 u6 u5 u4 u3 u2 u1 u0     \n\
movq        (%1), %%xmm2  # Load 8 Cr         v7 v6 v5 v4 v3 v2 v1 v0     \n\
"                                       \
        :                               \
        : "r" (p_u),  "r" (p_v) );      \
    __asm__ __volatile__(               \
        ".p2align 3 \n\t"               \
        SSE2_INSTRUCTIONS               \
        :                               \
        : "r" (p_line1), "r" (p_line2), \
          "r" (p_y1),  "r" (p_y2) );    \
        p_line1 += 32; p_line2 += 32;   \
        p_y1 += 16; p_y2 += 16;         \
        p_u += 8; p_v += 8;             \
    } while(0)

#define SSE2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
#define SSE2_YUV420_YUYV_ALIGNED "                                        \n\
movdqa      (%2), %%xmm0  # Load 16 Y         y15 y14 y13 .. y2 y1 y0     \n\
movdqa      (%3), %%xmm3  # Load 16 Y         Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%xmm2, %%xmm1  #                   v7 u7 v6 u6 .. u1 v0 u0     \n\
movdqa    %%xmm0, %%xmm2  #                   y15 y14 y13 .. y2 y1 y0     \n\
punpcklbw %%xmm1, %%xmm2  #                   v3 y7 u3 .. v0 y1 u0 y0     \n\
movntdq   %%xmm2, (%0)    # Store low YUYV                                \n\
punpckhbw %%xmm1, %%xmm0  #                   v3 y7 u3 y6 v2 y5 u2 y4     \n\
movntdq   %%xmm0, 16(%0)  # Store high YUYV                               \n\
movdqa    %%xmm3, %%xmm4  #                   Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%xmm1, %%xmm4  #                   v1 Y3 u1 Y2 v0 Y1 u0 Y0     \n\
movntdq   %%xmm4, (%1)    # Store low YUYV                                \n\
punpckhbw %%xmm1, %%xmm3  #                   v3 Y7 u3 Y6 v2 Y5 u2 Y4     \n\
movntdq   %%xmm3, 16(%1)  # Store high YUYV                               \n\
"

#define SSE2_YUV420_YUYV_UNALIGNED "                                      \n\
movdqu      (%2), %%xmm0  # Load 16 Y         y7 y6 y5 y4 y3 y2 y1 y0     \n\
movdqu      (%3), %%xmm3  # Load 16 Y         Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
prefetchnta (%0)          # Tell CPU not to cache output YUYV data        \n\
prefetchnta (%1)          # Tell CPU not to cache output YUYV data        \n\
punpcklbw %%xmm2, %%xmm1  #                   v3 u3 v2 u2 v1 u1 v0 u0     \n\
movdqa    %%xmm0, %%xmm2  #                   y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%xmm1, %%xmm2  #                   v1 y3 u1 y2 v0 y1 u0 y0     \n\
movdqu    %%xmm2, (%0)    # Store low YUYV                                \n\
punpckhbw %%xmm1, %%xmm0  #                   v3 y7 u3 y6 v2 y5 u2 y4     \n\
movdqu    %%xmm0, 16(%0)  # Store high YUYV                               \n\
movdqa    %%xmm3, %%xmm4  #                   Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%xmm1, %%xmm4  #                   v1 Y3 u1 Y2 v0 Y1 u0 Y0     \n\
movdqu    %%xmm4, (%1)    # Store low YUYV                                \n\
punpckhbw %%xmm1, %%xmm3  #                   v3 Y7 u3 Y6 v2 Y5 u2 Y4     \n\
movdqu    %%xmm3, 16(%1)  # Store high YUYV                               \n\
"

#define SSE2_YUV420_YVYU_ALIGNED "                                        \n\
movdqa      (%2), %%xmm0  # Load 16 Y           y7 y6 y5 y4 y3 y2 y1 y0   \n\
movdqa      (%3), %%xmm3  # Load 16 Y           Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
punpcklbw %%xmm1, %%xmm2  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
movdqa    %%xmm0, %%xmm1  #                     y7 y6 y5 y4 y3 y2 y1 y0   \n\
punpcklbw %%xmm2, %%xmm1  #                     u1 y3 v1 y2 u0 y1 v0 y0   \n\
movntdq   %%xmm1, (%0)    # Store low YUYV                                \n\
punpckhbw %%xmm2, %%xmm0  #                     u3 y7 v3 y6 u2 y5 v2 y4   \n\
movntdq   %%xmm0, 16(%0)  # Store high YUYV                               \n\
movdqa    %%xmm3, %%xmm4  #                     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
punpcklbw %%xmm2, %%xmm4  #                     u1 Y3 v1 Y2 u0 Y1 v0 Y0   \n\
movntdq   %%xmm4, (%1)    # Store low YUYV                                \n\
punpckhbw %%xmm2, %%xmm3  #                     u3 Y7 v3 Y6 u2 Y5 v2 Y4   \n\
movntdq   %%xmm3, 16(%1)  # Store high YUYV                               \n\
"

#define SSE2_YUV420_YVYU_UNALIGNED "                                      \n\
movdqu      (%2), %%xmm0  # Load 16 Y           y7 y6 y5 y4 y3 y2 y1 y0   \n\
movdqu      (%3), %%xmm3  # Load 16 Y           Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
prefetchnta (%0)          # Tell CPU not to cache output YVYU data        \n\
prefetchnta (%1)          # Tell CPU not to cache output YVYU data        \n\
punpcklbw %%xmm1, %%xmm2  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
movdqu    %%xmm0, %%xmm1  #                     y7 y6 y5 y4 y3 y2 y1 y0   \n\
punpcklbw %%xmm2, %%xmm1  #                     u1 y3 v1 y2 u0 y1 v0 y0   \n\
movdqu    %%xmm1, (%0)    # Store low YUYV                                \n\
punpckhbw %%xmm2, %%xmm0  #                     u3 y7 v3 y6 u2 y5 v2 y4   \n\
movdqu    %%xmm0, 16(%0)  # Store high YUYV                               \n\
movdqu    %%xmm3, %%xmm4  #                     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
punpcklbw %%xmm2, %%xmm4  #                     u1 Y3 v1 Y2 u0 Y1 v0 Y0   \n\
movdqu    %%xmm4, (%1)    # Store low YUYV                                \n\
punpckhbw %%xmm2, %%xmm3  #                     u3 Y7 v3 Y6 u2 Y5 v2 Y4   \n\
movdqu    %%xmm3, 16(%1)  # Store high YUYV                               \n\
"

#define SSE2_YUV420_UYVY_ALIGNED "                                        \n\
movdqa      (%2), %%xmm0  # Load 16 Y           y7 y6 y5 y4 y3 y2 y1 y0   \n\
movdqa      (%3), %%xmm3  # Load 16 Y           Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
punpcklbw %%xmm2, %%xmm1  #                     v3 u3 v2 u2 v1 u1 v0 u0   \n\
movdqa    %%xmm1, %%xmm2  #                     v3 u3 v2 u2 v1 u1 v0 u0   \n\
punpcklbw %%xmm0, %%xmm2  #                     y3 v1 y2 u1 y1 v0 y0 u0   \n\
movntdq   %%xmm2, (%0)    # Store low UYVY                                \n\
movdqa    %%xmm1, %%xmm2  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
punpckhbw %%xmm0, %%xmm2  #                     y3 v1 y2 u1 y1 v0 y0 u0   \n\
movntdq   %%xmm2, 16(%0)  # Store high UYVY                               \n\
movdqa    %%xmm1, %%xmm4  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
punpcklbw %%xmm3, %%xmm4  #                     Y3 v1 Y2 u1 Y1 v0 Y0 u0   \n\
movntdq   %%xmm4, (%1)    # Store low UYVY                                \n\
punpckhbw %%xmm3, %%xmm1  #                     Y7 v3 Y6 u3 Y5 v2 Y4 u2   \n\
movntdq   %%xmm1, 16(%1)  # Store high UYVY                               \n\
"

#define SSE2_YUV420_UYVY_UNALIGNED "                                      \n\
movdqu      (%2), %%xmm0  # Load 16 Y           y7 y6 y5 y4 y3 y2 y1 y0   \n\
movdqu      (%3), %%xmm3  # Load 16 Y           Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
prefetchnta (%0)          # Tell CPU not to cache output UYVY data        \n\
prefetchnta (%1)          # Tell CPU not to cache output UYVY data        \n\
punpcklbw %%xmm2, %%xmm1  #                     v3 u3 v2 u2 v1 u1 v0 u0   \n\
movdqu    %%xmm1, %%xmm2  #                     v3 u3 v2 u2 v1 u1 v0 u0   \n\
punpcklbw %%xmm0, %%xmm2  #                     y3 v1 y2 u1 y1 v0 y0 u0   \n\
movdqu    %%xmm2, (%0)    # Store low UYVY                                \n\
movdqu    %%xmm1, %%xmm2  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
punpckhbw %%xmm0, %%xmm2  #                     y3 v1 y2 u1 y1 v0 y0 u0   \n\
movdqu    %%xmm2, 16(%0)  # Store high UYVY                               \n\
movdqu    %%xmm1, %%xmm4  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
punpcklbw %%xmm3, %%xmm4  #                     Y3 v1 Y2 u1 Y1 v0 Y0 u0   \n\
movdqu    %%xmm4, (%1)    # Store low UYVY                                \n\
punpckhbw %%xmm3, %%xmm1  #                     Y7 v3 Y6 u3 Y5 v2 Y4 u2   \n\
movdqu    %%xmm1, 16(%1)  # Store high UYVY                               \n\
"
#elif defined(HAVE_SSE2_INTRINSICS)

/* SSE2 intrinsics */

#include <emmintrin.h>

#define SSE2_CALL(SSE2_INSTRUCTIONS)            \
    do {                                        \
        __m128i xmm0, xmm1, xmm2, xmm3, xmm4;   \
        SSE2_INSTRUCTIONS                       \
        p_line1 += 32; p_line2 += 32;           \
        p_y1 += 16; p_y2 += 16;                 \
        p_u += 8; p_v += 8;                     \
    } while(0)

#define SSE2_END  _mm_sfence()

#define SSE2_YUV420_YUYV_ALIGNED                    \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm0 = _mm_load_si128((__m128i *)p_y1);         \
    xmm3 = _mm_load_si128((__m128i *)p_y2);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);    \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm0); \
    xmm4 = xmm3;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);    \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_YUYV_UNALIGNED                  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);        \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);        \
    _mm_prefetch(p_line1, _MM_HINT_NTA);            \
    _mm_prefetch(p_line2, _MM_HINT_NTA);            \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);    \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm0); \
    xmm4 = xmm3;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);    \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_YVYU_ALIGNED                    \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm0 = _mm_load_si128((__m128i *)p_y1);         \
    xmm3 = _mm_load_si128((__m128i *)p_y2);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);    \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm0); \
    xmm4 = xmm3;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);    \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_YVYU_UNALIGNED                  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);        \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);        \
    _mm_prefetch(p_line1, _MM_HINT_NTA);            \
    _mm_prefetch(p_line2, _MM_HINT_NTA);            \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);    \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm0); \
    xmm4 = xmm3;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);    \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_UYVY_ALIGNED                    \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm0 = _mm_load_si128((__m128i *)p_y1);         \
    xmm3 = _mm_load_si128((__m128i *)p_y2);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm1;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);           \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);    \
    xmm2 = xmm1;                                    \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);           \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm2); \
    xmm4 = xmm1;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm3);           \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);    \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);           \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm1);

#define SSE2_YUV420_UYVY_UNALIGNED                  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);        \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);        \
    _mm_prefetch(p_line1, _MM_HINT_NTA);            \
    _mm_prefetch(p_line2, _MM_HINT_NTA);            \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm1;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);           \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);    \
    xmm2 = xmm1;                                    \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);           \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm2); \
    xmm4 = xmm1;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm3);           \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);    \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);           \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm1);

#endif
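
/*
 * Usage sketch (editor's illustration, not part of the original header):
 * i420_yuy2.c is expected to pick the _ALIGNED variants when the source
 * and destination lines are 16-byte aligned, and the _UNALIGNED variants
 * otherwise, then finish with SSE2_END.  The function name and the
 * i_width parameter are hypothetical; the pointer names match the ones
 * hard-coded into SSE2_CALL.
 */
#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
#include <stdint.h>
static inline void i420_yuy2_sse2_sketch( uint8_t *p_line1, uint8_t *p_line2,
                                          uint8_t *p_y1, uint8_t *p_y2,
                                          uint8_t *p_u, uint8_t *p_v,
                                          int i_width )
{
    int i_x;
    /* Each SSE2_CALL consumes 16 luma samples per line plus 8 Cb/Cr samples,
     * and writes 32 bytes of packed YUYV on each of the two lines. */
    if( !( ( (uintptr_t)p_line1 | (uintptr_t)p_line2
           | (uintptr_t)p_y1    | (uintptr_t)p_y2 ) & 15 ) )
    {
        for( i_x = i_width / 16; i_x-- ; )
            SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
    }
    else
    {
        for( i_x = i_width / 16; i_x-- ; )
            SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
    }
    SSE2_END;
}
#endif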
#endif

/* Used in both accelerated and C modules */

#define C_YUV420_YVYU( )                                                    \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ =            *(p_line2)++ = *(p_v)++;                      \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ =            *(p_line2)++ = *(p_u)++;

#define C_YUV420_Y211( )                                                    \
    *(p_line1)++ = *(p_y1); p_y1 += 2;                                      \
    *(p_line2)++ = *(p_y2); p_y2 += 2;                                      \
    *(p_line1)++ = *(p_line2)++ = *(p_u) - 0x80; p_u += 2;                  \
    *(p_line1)++ = *(p_y1); p_y1 += 2;                                      \
    *(p_line2)++ = *(p_y2); p_y2 += 2;                                      \
    *(p_line1)++ = *(p_line2)++ = *(p_v) - 0x80; p_v += 2;

#define C_YUV420_YUYV( )                                                    \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ =            *(p_line2)++ = *(p_u)++;                      \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ =            *(p_line2)++ = *(p_v)++;

#define C_YUV420_UYVY( )                                                    \
    *(p_line1)++ =            *(p_line2)++ = *(p_u)++;                      \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ =            *(p_line2)++ = *(p_v)++;                      \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;