i420_rgb_mmx.h
Uploaded by: kjfoods
Upload date: 2020-07-06
Package size: 29949k
File size: 73k
Source category: midi
Platform: Unix_Linux
/*****************************************************************************
 * transforms_yuvmmx.h: MMX YUV transformation assembly
 *****************************************************************************
 * Copyright (C) 1999-2007 the VideoLAN team
 * $Id: cb7a9d8d1b7287c46f3d00d259fe85133b277a1e $
 *
 * Authors: Olie Lho <ollie@sis.com.tw>
 *          Gaël Hendryckx <jimmy@via.ecp.fr>
 *          Samuel Hocevar <sam@zoy.org>
 *          Damien Fouilleul <damienf@videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/
#ifdef MODULE_NAME_IS_i420_rgb_mmx
/* hope these constant values are cache line aligned */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define USED_U64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((used))
#else
#define USED_U64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((unused))
#endif
USED_U64(mmx_80w)     = 0x0080008000800080ULL;
USED_U64(mmx_10w)     = 0x1010101010101010ULL;
USED_U64(mmx_00ffw)   = 0x00ff00ff00ff00ffULL;
USED_U64(mmx_Y_coeff) = 0x253f253f253f253fULL;
USED_U64(mmx_U_green) = 0xf37df37df37df37dULL;
USED_U64(mmx_U_blue)  = 0x4093409340934093ULL;
USED_U64(mmx_V_red)   = 0x3312331233123312ULL;
USED_U64(mmx_V_green) = 0xe5fce5fce5fce5fcULL;
USED_U64(mmx_mask_f8) = 0xf8f8f8f8f8f8f8f8ULL;
USED_U64(mmx_mask_fc) = 0xfcfcfcfcfcfcfcfcULL;
#undef USED_U64
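The constants above are ITU-R BT.601 YUV-to-RGB coefficients in signed 16-bit fixed point: the assembly promotes precision with psllw $3 (a factor of 8) and then multiplies with pmulhw, which keeps the upper 16 bits of the 32-bit product, so each word is the usual floating-point coefficient scaled by 65536/8 = 8192 (mmx_80w/mmx_10w are the 128 chroma and 16 luma offsets, mmx_mask_f8/mmx_mask_fc the 5- and 6-bit channel masks). The snippet below is illustrative only and not part of the original header; it rebuilds the coefficient words from the common BT.601 approximations:

#include <stdio.h>
#include <stdint.h>

/* Illustrative check: each 16-bit word of the tables above is a BT.601
 * coefficient scaled by 8192 (= 65536 >> 3, matching psllw $3 + pmulhw). */
int main(void)
{
    const struct { const char *name; double coeff; } k[] = {
        { "mmx_Y_coeff",  1.164 },   /* (Y - 16) multiplier      */
        { "mmx_U_blue",   2.018 },   /* Cb contribution to blue  */
        { "mmx_V_red",    1.596 },   /* Cr contribution to red   */
        { "mmx_U_green", -0.391 },   /* Cb contribution to green */
        { "mmx_V_green", -0.813 },   /* Cr contribution to green */
    };
    for (unsigned i = 0; i < sizeof k / sizeof k[0]; i++) {
        int16_t w = (int16_t)(k[i].coeff * 8192 + (k[i].coeff >= 0 ? 0.5 : -0.5));
        printf("%-12s = 0x%04x\n", k[i].name, (unsigned)(uint16_t)w);
    }
    return 0;  /* prints 0x253f, 0x4093, 0x3312, 0xf37d, 0xe5fc */
}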
#if defined(CAN_COMPILE_MMX)
/* MMX assembly */

#define MMX_CALL(MMX_INSTRUCTIONS)      \
    do {                                \
    __asm__ __volatile__(               \
        ".p2align 3 \n\t"               \
        MMX_INSTRUCTIONS                \
        :                               \
        : "r" (p_y), "r" (p_u),         \
          "r" (p_v), "r" (p_buffer) );  \
    } while(0)
#define MMX_END __asm__ __volatile__ ( "emms" )
/* Use RIP-relative code in PIC mode on amd64 */
#if defined(__x86_64__) && defined(__PIC__)
#   define G "(%%rip)"
#else
#   define G
#endif
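The string fragments defined below are pasted together by the conversion loops into a single asm statement. As a rough, hedged illustration (a sketch, not a copy of the actual VLC call sites; the pointer names match the "r" operands of MMX_CALL, i.e. %0 = p_y, %1 = p_u, %2 = p_v, %3 = p_buffer, and the integer types are assumed to come from stdint.h):

/* Illustrative sketch only: in VLC this runs inside the row/column loops of
 * the i420_rgb conversion functions.  MMX_UNPACK_16 also preloads Y/Cb/Cr
 * data for the next 8 pixels, so the real code keeps the plane pointers
 * marching forward between calls. */
static inline void Convert8_RGB565(uint8_t *p_y, uint8_t *p_u, uint8_t *p_v,
                                   uint16_t *p_buffer)
{
    MMX_CALL (
        MMX_INIT_16
        MMX_YUV_MUL
        MMX_YUV_ADD
        MMX_UNPACK_16
    );
    MMX_END;
}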
#define MMX_INIT_16 "\n\
movd       (%1), %%mm0      # Load 4 Cb       00 00 00 00 u3 u2 u1 u0 \n\
movd       (%2), %%mm1      # Load 4 Cr       00 00 00 00 v3 v2 v1 v0 \n\
pxor      %%mm4, %%mm4      # zero mm4 \n\
movq       (%0), %%mm6      # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define MMX_INIT_16_GRAY "\n\
movq      (%0), %%mm6       # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
#movl      $0, (%3)         # cache preload for image \n\
"
#define MMX_INIT_32 "\n\
movd      (%1), %%mm0       # Load 4 Cb       00 00 00 00 u3 u2 u1 u0 \n\
movl        $0, (%3)        # cache preload for image \n\
movd      (%2), %%mm1       # Load 4 Cr       00 00 00 00 v3 v2 v1 v0 \n\
pxor     %%mm4, %%mm4       # zero mm4 \n\
movq      (%0), %%mm6       # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
/*
 * Do the multiply part of the conversion for even and odd pixels,
 * register usage:
 * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
 * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels,
 * mm6 -> Y even, mm7 -> Y odd
 */
#define MMX_YUV_MUL "\n\
# convert the chroma part \n\
punpcklbw %%mm4, %%mm0          # scatter 4 Cb    00 u3 00 u2 00 u1 00 u0 \n\
punpcklbw %%mm4, %%mm1          # scatter 4 Cr    00 v3 00 v2 00 v1 00 v0 \n\
psubsw    mmx_80w"G", %%mm0     # Cb -= 128 \n\
psubsw    mmx_80w"G", %%mm1     # Cr -= 128 \n\
psllw     $3, %%mm0             # Promote precision \n\
psllw     $3, %%mm1             # Promote precision \n\
movq      %%mm0, %%mm2          # Copy 4 Cb       00 u3 00 u2 00 u1 00 u0 \n\
movq      %%mm1, %%mm3          # Copy 4 Cr       00 v3 00 v2 00 v1 00 v0 \n\
pmulhw    mmx_U_green"G", %%mm2 # Mul Cb with green coeff -> Cb green \n\
pmulhw    mmx_V_green"G", %%mm3 # Mul Cr with green coeff -> Cr green \n\
pmulhw    mmx_U_blue"G", %%mm0  # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\
pmulhw    mmx_V_red"G", %%mm1   # Mul Cr -> Cred  00 r3 00 r2 00 r1 00 r0 \n\
paddsw    %%mm3, %%mm2          # Cb green + Cr green -> Cgreen \n\
\n\
# convert the luma part \n\
psubusb   mmx_10w"G", %%mm6     # Y -= 16 \n\
movq      %%mm6, %%mm7          # Copy 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
pand      mmx_00ffw"G", %%mm6   # get Y even      00 Y6 00 Y4 00 Y2 00 Y0 \n\
psrlw     $8, %%mm7             # get Y odd       00 Y7 00 Y5 00 Y3 00 Y1 \n\
psllw     $3, %%mm6             # Promote precision \n\
psllw     $3, %%mm7             # Promote precision \n\
pmulhw    mmx_Y_coeff"G", %%mm6 # Mul 4 Y even    00 y6 00 y4 00 y2 00 y0 \n\
pmulhw    mmx_Y_coeff"G", %%mm7 # Mul 4 Y odd     00 y7 00 y5 00 y3 00 y1 \n\
"
/*
 * Do the addition part of the conversion for even and odd pixels,
 * register usage:
 * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
 * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels,
 * mm6 -> Y even, mm7 -> Y odd
 */
#define MMX_YUV_ADD "\n\
# Do horizontal and vertical scaling \n\
movq      %%mm0, %%mm3          # Copy Cblue \n\
movq      %%mm1, %%mm4          # Copy Cred \n\
movq      %%mm2, %%mm5          # Copy Cgreen \n\
paddsw    %%mm6, %%mm0          # Y even + Cblue  00 B6 00 B4 00 B2 00 B0 \n\
paddsw    %%mm7, %%mm3          # Y odd  + Cblue  00 B7 00 B5 00 B3 00 B1 \n\
paddsw    %%mm6, %%mm1          # Y even + Cred   00 R6 00 R4 00 R2 00 R0 \n\
paddsw    %%mm7, %%mm4          # Y odd  + Cred   00 R7 00 R5 00 R3 00 R1 \n\
paddsw    %%mm6, %%mm2          # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\
paddsw    %%mm7, %%mm5          # Y odd  + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\
\n\
# Limit RGB even to 0..255 \n\
packuswb  %%mm0, %%mm0          # B6 B4 B2 B0 / B6 B4 B2 B0 \n\
packuswb  %%mm1, %%mm1          # R6 R4 R2 R0 / R6 R4 R2 R0 \n\
packuswb  %%mm2, %%mm2          # G6 G4 G2 G0 / G6 G4 G2 G0 \n\
\n\
# Limit RGB odd to 0..255 \n\
packuswb  %%mm3, %%mm3          # B7 B5 B3 B1 / B7 B5 B3 B1 \n\
packuswb  %%mm4, %%mm4          # R7 R5 R3 R1 / R7 R5 R3 R1 \n\
packuswb  %%mm5, %%mm5          # G7 G5 G3 G1 / G7 G5 G3 G1 \n\
\n\
# Interleave RGB even and odd \n\
punpcklbw %%mm3, %%mm0          #                 B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%mm4, %%mm1          #                 R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%mm5, %%mm2          #                 G7 G6 G5 G4 G3 G2 G1 G0 \n\
"
/*
 * Grayscale case, only use Y
 */
#define MMX_YUV_GRAY "\n\
# convert the luma part \n\
psubusb   mmx_10w"G", %%mm6 \n\
movq      %%mm6, %%mm7 \n\
pand      mmx_00ffw"G", %%mm6 \n\
psrlw     $8, %%mm7 \n\
psllw     $3, %%mm6 \n\
psllw     $3, %%mm7 \n\
pmulhw    mmx_Y_coeff"G", %%mm6 \n\
pmulhw    mmx_Y_coeff"G", %%mm7 \n\
packuswb  %%mm6, %%mm6 \n\
packuswb  %%mm7, %%mm7 \n\
punpcklbw %%mm7, %%mm6 \n\
"
#define MMX_UNPACK_16_GRAY "\n\
movq      %%mm6, %%mm5 \n\
pand      mmx_mask_f8"G", %%mm6 \n\
pand      mmx_mask_fc"G", %%mm5 \n\
movq      %%mm6, %%mm7 \n\
psrlw     $3, %%mm7 \n\
pxor      %%mm3, %%mm3 \n\
movq      %%mm7, %%mm2 \n\
movq      %%mm5, %%mm0 \n\
punpcklbw %%mm3, %%mm5 \n\
punpcklbw %%mm6, %%mm7 \n\
psllw     $3, %%mm5 \n\
por       %%mm5, %%mm7 \n\
movq      %%mm7, (%3) \n\
punpckhbw %%mm3, %%mm0 \n\
punpckhbw %%mm6, %%mm2 \n\
psllw     $3, %%mm0 \n\
movq      8(%0), %%mm6 \n\
por       %%mm0, %%mm2 \n\
movq      %%mm2, 8(%3) \n\
"
/*
 * convert RGB plane to RGB 15 bits,
 * mm0 -> B, mm1 -> R, mm2 -> G,
 * mm4 -> GB, mm5 -> AR pixel 4-7,
 * mm6 -> GB, mm7 -> AR pixel 0-3
 */
#define MMX_UNPACK_15 "\n\
# mask unneeded bits off \n\
pand      mmx_mask_f8"G", %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
psrlw     $3,%%mm0              # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
pand      mmx_mask_f8"G", %%mm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
pand      mmx_mask_f8"G", %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
psrlw     $1,%%mm1              # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
pxor      %%mm4, %%mm4          # zero mm4 \n\
movq      %%mm0, %%mm5          # Copy B7-B0 \n\
movq      %%mm2, %%mm7          # Copy G7-G0 \n\
\n\
# convert rgb24 plane to rgb15 pack for pixel 0-3 \n\
punpcklbw %%mm4, %%mm2          # ________ ________ g7g6g5g4 g3______ \n\
punpcklbw %%mm1, %%mm0          # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $2,%%mm2              # ________ ____g7g6 g5g4g3__ ________ \n\
por       %%mm2, %%mm0          # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
movq      8(%0), %%mm6          # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
movq      %%mm0, (%3)           # store pixel 0-3 \n\
\n\
# convert rgb24 plane to rgb15 pack for pixel 4-7 \n\
punpckhbw %%mm4, %%mm7          # ________ ________ g7g6g5g4 g3______ \n\
punpckhbw %%mm1, %%mm5          # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $2,%%mm7              # ________ ____g7g6 g5g4g3__ ________ \n\
movd      4(%1), %%mm0          # Load 4 Cb       __ __ __ __ u3 u2 u1 u0 \n\
por       %%mm7, %%mm5          # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
movd      4(%2), %%mm1          # Load 4 Cr       __ __ __ __ v3 v2 v1 v0 \n\
movq      %%mm5, 8(%3)          # store pixel 4-7 \n\
"
/*
 * convert RGB plane to RGB 16 bits,
 * mm0 -> B, mm1 -> R, mm2 -> G,
 * mm4 -> GB, mm5 -> AR pixel 4-7,
 * mm6 -> GB, mm7 -> AR pixel 0-3
 */
#define MMX_UNPACK_16 "\n\
# mask unneeded bits off \n\
pand      mmx_mask_f8"G", %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
pand      mmx_mask_fc"G", %%mm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
pand      mmx_mask_f8"G", %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
psrlw     $3,%%mm0              # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
pxor      %%mm4, %%mm4          # zero mm4 \n\
movq      %%mm0, %%mm5          # Copy B7-B0 \n\
movq      %%mm2, %%mm7          # Copy G7-G0 \n\
\n\
# convert rgb24 plane to rgb16 pack for pixel 0-3 \n\
punpcklbw %%mm4, %%mm2          # ________ ________ g7g6g5g4 g3g2____ \n\
punpcklbw %%mm1, %%mm0          # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $3,%%mm2              # ________ __g7g6g5 g4g3g2__ ________ \n\
por       %%mm2, %%mm0          # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
movq      8(%0), %%mm6          # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
movq      %%mm0, (%3)           # store pixel 0-3 \n\
\n\
# convert rgb24 plane to rgb16 pack for pixel 4-7 \n\
punpckhbw %%mm4, %%mm7          # ________ ________ g7g6g5g4 g3g2____ \n\
punpckhbw %%mm1, %%mm5          # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $3,%%mm7              # ________ __g7g6g5 g4g3g2__ ________ \n\
movd      4(%1), %%mm0          # Load 4 Cb       __ __ __ __ u3 u2 u1 u0 \n\
por       %%mm7, %%mm5          # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
movd      4(%2), %%mm1          # Load 4 Cr       __ __ __ __ v3 v2 v1 v0 \n\
movq      %%mm5, 8(%3)          # store pixel 4-7 \n\
"
/*
 * convert RGB plane to RGB packed format,
 * mm0 -> B, mm1 -> R, mm2 -> G
 */
#define MMX_UNPACK_32_ARGB "\n\
pxor      %%mm3, %%mm3  # zero mm3 \n\
movq      %%mm0, %%mm4  #                 B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%mm2, %%mm4  #                 G3 B3 G2 B2 G1 B1 G0 B0 \n\
movq      %%mm1, %%mm5  #                 R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%mm3, %%mm5  #                 00 R3 00 R2 00 R1 00 R0 \n\
movq      %%mm4, %%mm6  #                 G3 B3 G2 B2 G1 B1 G0 B0 \n\
punpcklwd %%mm5, %%mm4  #                 00 R1 B1 G1 00 R0 B0 G0 \n\
movq      %%mm4, (%3)   # Store ARGB1 ARGB0 \n\
punpckhwd %%mm5, %%mm6  #                 00 R3 B3 G3 00 R2 B2 G2 \n\
movq      %%mm6, 8(%3)  # Store ARGB3 ARGB2 \n\
punpckhbw %%mm2, %%mm0  #                 G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpckhbw %%mm3, %%mm1  #                 00 R7 00 R6 00 R5 00 R4 \n\
movq      %%mm0, %%mm5  #                 G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpcklwd %%mm1, %%mm5  #                 00 R5 B5 G5 00 R4 B4 G4 \n\
movq      %%mm5, 16(%3) # Store ARGB5 ARGB4 \n\
punpckhwd %%mm1, %%mm0  #                 00 R7 B7 G7 00 R6 B6 G6 \n\
movq      %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\
"
#define MMX_UNPACK_32_RGBA "\n\
pxor      %%mm3, %%mm3  # zero mm3 \n\
movq      %%mm2, %%mm4  #                 G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%mm1, %%mm4  #                 R3 G3 R2 G2 R1 G1 R0 G0 \n\
punpcklbw %%mm0, %%mm3  #                 B3 00 B2 00 B1 00 B0 00 \n\
movq      %%mm3, %%mm5  #                 R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%mm4, %%mm3  #                 R1 G1 B1 00 R0 G0 B0 00 \n\
movq      %%mm3, (%3)   # Store RGBA1 RGBA0 \n\
punpckhwd %%mm4, %%mm5  #                 R3 G3 B3 00 R2 G2 B2 00 \n\
movq      %%mm5, 8(%3)  # Store RGBA3 RGBA2 \n\
pxor      %%mm6, %%mm6  # zero mm6 \n\
punpckhbw %%mm1, %%mm2  #                 R7 G7 R6 G6 R5 G5 R4 G4 \n\
punpckhbw %%mm0, %%mm6  #                 B7 00 B6 00 B5 00 B4 00 \n\
movq      %%mm6, %%mm0  #                 B7 00 B6 00 B5 00 B4 00 \n\
punpcklwd %%mm2, %%mm6  #                 R5 G5 B5 00 R4 G4 B4 00 \n\
movq      %%mm6, 16(%3) # Store RGBA5 RGBA4 \n\
punpckhwd %%mm2, %%mm0  #                 R7 G7 B7 00 R6 G6 B6 00 \n\
movq      %%mm0, 24(%3) # Store RGBA7 RGBA6 \n\
"
#define MMX_UNPACK_32_BGRA "\n\
pxor      %%mm3, %%mm3  # zero mm3 \n\
movq      %%mm2, %%mm4  #                 G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%mm0, %%mm4  #                 B3 G3 B2 G2 B1 G1 B0 G0 \n\
punpcklbw %%mm1, %%mm3  #                 R3 00 R2 00 R1 00 R0 00 \n\
movq      %%mm3, %%mm5  #                 R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%mm4, %%mm3  #                 B1 G1 R1 00 B0 G0 R0 00 \n\
movq      %%mm3, (%3)   # Store BGRA1 BGRA0 \n\
punpckhwd %%mm4, %%mm5  #                 B3 G3 R3 00 B2 G2 R2 00 \n\
movq      %%mm5, 8(%3)  # Store BGRA3 BGRA2 \n\
pxor      %%mm6, %%mm6  # zero mm6 \n\
punpckhbw %%mm0, %%mm2  #                 B7 G7 B6 G6 B5 G5 B4 G4 \n\
punpckhbw %%mm1, %%mm6  #                 R7 00 R6 00 R5 00 R4 00 \n\
movq      %%mm6, %%mm0  #                 R7 00 R6 00 R5 00 R4 00 \n\
punpcklwd %%mm2, %%mm6  #                 B5 G5 R5 00 B4 G4 R4 00 \n\
movq      %%mm6, 16(%3) # Store BGRA5 BGRA4 \n\
punpckhwd %%mm2, %%mm0  #                 B7 G7 R7 00 B6 G6 R6 00 \n\
movq      %%mm0, 24(%3) # Store BGRA7 BGRA6 \n\
"
#define MMX_UNPACK_32_ABGR "\n\
pxor      %%mm3, %%mm3  # zero mm3 \n\
movq      %%mm1, %%mm4  #                 R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%mm2, %%mm4  #                 G3 R3 G2 R2 G1 R1 G0 R0 \n\
movq      %%mm0, %%mm5  #                 B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%mm3, %%mm5  #                 00 B3 00 B2 00 B1 00 B0 \n\
movq      %%mm4, %%mm6  #                 G3 R3 G2 R2 G1 R1 G0 R0 \n\
punpcklwd %%mm5, %%mm4  #                 00 B1 G1 R1 00 B0 G0 R0 \n\
movq      %%mm4, (%3)   # Store ABGR1 ABGR0 \n\
punpckhwd %%mm5, %%mm6  #                 00 B3 G3 R3 00 B2 G2 R2 \n\
movq      %%mm6, 8(%3)  # Store ABGR3 ABGR2 \n\
punpckhbw %%mm2, %%mm1  #                 G7 R7 G6 R6 G5 R5 G4 R4 \n\
punpckhbw %%mm3, %%mm0  #                 00 B7 00 B6 00 B5 00 B4 \n\
movq      %%mm1, %%mm2  #                 G7 R7 G6 R6 G5 R5 G4 R4 \n\
punpcklwd %%mm0, %%mm1  #                 00 B5 G5 R5 00 B4 G4 R4 \n\
movq      %%mm1, 16(%3) # Store ABGR5 ABGR4 \n\
punpckhwd %%mm0, %%mm2  #                 00 B7 G7 R7 00 B6 G6 R6 \n\
movq      %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
"
#elif defined(HAVE_MMX_INTRINSICS)
/* MMX intrinsics */
#include <mmintrin.h>
#define MMX_CALL(MMX_INSTRUCTIONS)  \
    do {                            \
        __m64 mm0, mm1, mm2, mm3,   \
              mm4, mm5, mm6, mm7;   \
        MMX_INSTRUCTIONS            \
    } while(0)
#define MMX_END _mm_empty()

#define MMX_INIT_16                     \
    mm0 = _mm_cvtsi32_si64(*(int*)p_u); \
    mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
    mm4 = _mm_setzero_si64();           \
    mm6 = (__m64)*(uint64_t *)p_y;
#define MMX_INIT_32                     \
    mm0 = _mm_cvtsi32_si64(*(int*)p_u); \
    *(uint16_t *)p_buffer = 0;          \
    mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
    mm4 = _mm_setzero_si64();           \
    mm6 = (__m64)*(uint64_t *)p_y;
#define MMX_YUV_MUL                                 \
    mm0 = _mm_unpacklo_pi8(mm0, mm4);               \
    mm1 = _mm_unpacklo_pi8(mm1, mm4);               \
    mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w);       \
    mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w);       \
    mm0 = _mm_slli_pi16(mm0, 3);                    \
    mm1 = _mm_slli_pi16(mm1, 3);                    \
    mm2 = mm0;                                      \
    mm3 = mm1;                                      \
    mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green);  \
    mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green);  \
    mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue);   \
    mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red);    \
    mm2 = _mm_adds_pi16(mm2, mm3);                  \
    \
    mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w);        \
    mm7 = mm6;                                      \
    mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw);      \
    mm7 = _mm_srli_pi16(mm7, 8);                    \
    mm6 = _mm_slli_pi16(mm6, 3);                    \
    mm7 = _mm_slli_pi16(mm7, 3);                    \
    mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff);  \
    mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff);
#define MMX_YUV_ADD                     \
    mm3 = mm0;                          \
    mm4 = mm1;                          \
    mm5 = mm2;                          \
    mm0 = _mm_adds_pi16(mm0, mm6);      \
    mm3 = _mm_adds_pi16(mm3, mm7);      \
    mm1 = _mm_adds_pi16(mm1, mm6);      \
    mm4 = _mm_adds_pi16(mm4, mm7);      \
    mm2 = _mm_adds_pi16(mm2, mm6);      \
    mm5 = _mm_adds_pi16(mm5, mm7);      \
    \
    mm0 = _mm_packs_pu16(mm0, mm0);     \
    mm1 = _mm_packs_pu16(mm1, mm1);     \
    mm2 = _mm_packs_pu16(mm2, mm2);     \
    \
    mm3 = _mm_packs_pu16(mm3, mm3);     \
    mm4 = _mm_packs_pu16(mm4, mm4);     \
    mm5 = _mm_packs_pu16(mm5, mm5);     \
    \
    mm0 = _mm_unpacklo_pi8(mm0, mm3);   \
    mm1 = _mm_unpacklo_pi8(mm1, mm4);   \
    mm2 = _mm_unpacklo_pi8(mm2, mm5);
#define MMX_UNPACK_15                               \
    mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8);    \
    mm0 = _mm_srli_pi16(mm0, 3);                    \
    mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8);    \
    mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8);    \
    mm1 = _mm_srli_pi16(mm1, 1);                    \
    mm4 = _mm_setzero_si64();                       \
    mm5 = mm0;                                      \
    mm7 = mm2;                                      \
    \
    mm2 = _mm_unpacklo_pi8(mm2, mm4);               \
    mm0 = _mm_unpacklo_pi8(mm0, mm1);               \
    mm2 = _mm_slli_pi16(mm2, 2);                    \
    mm0 = _mm_or_si64(mm0, mm2);                    \
    mm6 = (__m64)*(uint64_t *)(p_y + 8);            \
    *(uint64_t *)p_buffer = (uint64_t)mm0;          \
    \
    mm7 = _mm_unpackhi_pi8(mm7, mm4);               \
    mm5 = _mm_unpackhi_pi8(mm5, mm1);               \
    mm7 = _mm_slli_pi16(mm7, 2);                    \
    mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
    mm5 = _mm_or_si64(mm5, mm7);                    \
    mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
#define MMX_UNPACK_16                               \
    mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8);    \
    mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc);    \
    mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8);    \
    mm0 = _mm_srli_pi16(mm0, 3);                    \
    mm4 = _mm_setzero_si64();                       \
    mm5 = mm0;                                      \
    mm7 = mm2;                                      \
    \
    mm2 = _mm_unpacklo_pi8(mm2, mm4);               \
    mm0 = _mm_unpacklo_pi8(mm0, mm1);               \
    mm2 = _mm_slli_pi16(mm2, 3);                    \
    mm0 = _mm_or_si64(mm0, mm2);                    \
    mm6 = (__m64)*(uint64_t *)(p_y + 8);            \
    *(uint64_t *)p_buffer = (uint64_t)mm0;          \
    \
    mm7 = _mm_unpackhi_pi8(mm7, mm4);               \
    mm5 = _mm_unpackhi_pi8(mm5, mm1);               \
    mm7 = _mm_slli_pi16(mm7, 3);                    \
    mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
    mm5 = _mm_or_si64(mm5, mm7);                    \
    mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
#define MMX_UNPACK_32_ARGB                      \
    mm3 = _mm_setzero_si64();                   \
    mm4 = mm0;                                  \
    mm4 = _mm_unpacklo_pi8(mm4, mm2);           \
    mm5 = mm1;                                  \
    mm5 = _mm_unpacklo_pi8(mm5, mm3);           \
    mm6 = mm4;                                  \
    mm4 = _mm_unpacklo_pi16(mm4, mm5);          \
    *(uint64_t *)p_buffer = (uint64_t)mm4;      \
    mm6 = _mm_unpackhi_pi16(mm6, mm5);          \
    *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6; \
    mm0 = _mm_unpackhi_pi8(mm0, mm2);           \
    mm1 = _mm_unpackhi_pi8(mm1, mm3);           \
    mm5 = mm0;                                  \
    mm5 = _mm_unpacklo_pi16(mm5, mm1);          \
    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5; \
    mm0 = _mm_unpackhi_pi16(mm0, mm1);          \
    *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_RGBA                      \
    mm3 = _mm_setzero_si64();                   \
    mm4 = mm2;                                  \
    mm4 = _mm_unpacklo_pi8(mm4, mm1);           \
    mm3 = _mm_unpacklo_pi8(mm3, mm0);           \
    mm5 = mm3;                                  \
    mm3 = _mm_unpacklo_pi16(mm3, mm4);          \
    *(uint64_t *)p_buffer = (uint64_t)mm3;      \
    mm5 = _mm_unpackhi_pi16(mm5, mm4);          \
    *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5; \
    mm6 = _mm_setzero_si64();                   \
    mm2 = _mm_unpackhi_pi8(mm2, mm1);           \
    mm6 = _mm_unpackhi_pi8(mm6, mm0);           \
    mm0 = mm6;                                  \
    mm6 = _mm_unpacklo_pi16(mm6, mm2);          \
    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6; \
    mm0 = _mm_unpackhi_pi16(mm0, mm2);          \
    *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_BGRA                      \
    mm3 = _mm_setzero_si64();                   \
    mm4 = mm2;                                  \
    mm4 = _mm_unpacklo_pi8(mm4, mm0);           \
    mm3 = _mm_unpacklo_pi8(mm3, mm1);           \
    mm5 = mm3;                                  \
    mm3 = _mm_unpacklo_pi16(mm3, mm4);          \
    *(uint64_t *)p_buffer = (uint64_t)mm3;      \
    mm5 = _mm_unpackhi_pi16(mm5, mm4);          \
    *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5; \
    mm6 = _mm_setzero_si64();                   \
    mm2 = _mm_unpackhi_pi8(mm2, mm0);           \
    mm6 = _mm_unpackhi_pi8(mm6, mm1);           \
    mm0 = mm6;                                  \
    mm6 = _mm_unpacklo_pi16(mm6, mm2);          \
    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6; \
    mm0 = _mm_unpackhi_pi16(mm0, mm2);          \
    *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_ABGR                      \
    mm3 = _mm_setzero_si64();                   \
    mm4 = mm1;                                  \
    mm4 = _mm_unpacklo_pi8(mm4, mm2);           \
    mm5 = mm0;                                  \
    mm5 = _mm_unpacklo_pi8(mm5, mm3);           \
    mm6 = mm4;                                  \
    mm4 = _mm_unpacklo_pi16(mm4, mm5);          \
    *(uint64_t *)p_buffer = (uint64_t)mm4;      \
    mm6 = _mm_unpackhi_pi16(mm6, mm5);          \
    *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6; \
    mm1 = _mm_unpackhi_pi8(mm1, mm2);           \
    mm0 = _mm_unpackhi_pi8(mm0, mm3);           \
    mm2 = mm1;                                  \
    mm1 = _mm_unpacklo_pi16(mm1, mm0);          \
    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm1; \
    mm2 = _mm_unpackhi_pi16(mm2, mm0);          \
    *(uint64_t *)(p_buffer + 6) = (uint64_t)mm2;
#endif
#elif defined( MODULE_NAME_IS_i420_rgb_sse2 )
#if defined(CAN_COMPILE_SSE2)
/* SSE2 assembly */
#define SSE2_CALL(SSE2_INSTRUCTIONS)    \
    do {                                \
    __asm__ __volatile__(               \
        ".p2align 3 \n\t"               \
        SSE2_INSTRUCTIONS               \
        :                               \
        : "r" (p_y), "r" (p_u),         \
          "r" (p_v), "r" (p_buffer)     \
        : "eax" );                      \
    } while(0)
#define SSE2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
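Each SSE2 unpack macro below comes in an _ALIGNED and an _UNALIGNED flavour because movdqa and the non-temporal movntdq stores require 16-byte aligned addresses, while movdqu/movdqu stores do not. A hedged sketch of how a caller might choose between them (simplified and illustrative, not the actual VLC dispatch logic; it assumes stdint.h types and the same p_y/p_u/p_v/p_buffer pointers used as %0..%3 in SSE2_CALL):

/* Sketch: take the aligned, non-temporal path only when both the Y plane
 * and the output buffer are 16-byte aligned. */
static inline void Convert16_RGB565_SSE2(uint8_t *p_y, uint8_t *p_u,
                                         uint8_t *p_v, uint16_t *p_buffer)
{
    if ((((uintptr_t)p_y | (uintptr_t)p_buffer) & 15) == 0)
        SSE2_CALL (
            SSE2_INIT_16_ALIGNED
            SSE2_YUV_MUL
            SSE2_YUV_ADD
            SSE2_UNPACK_16_ALIGNED
        );
    else
        SSE2_CALL (
            SSE2_INIT_16_UNALIGNED
            SSE2_YUV_MUL
            SSE2_YUV_ADD
            SSE2_UNPACK_16_UNALIGNED
        );
    SSE2_END;
}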
#define SSE2_INIT_16_ALIGNED "\n\
movq        (%1), %%xmm0    # Load 8 Cb       00 00 00 00 u3 u2 u1 u0 \n\
movq        (%2), %%xmm1    # Load 8 Cr       00 00 00 00 v3 v2 v1 v0 \n\
pxor      %%xmm4, %%xmm4    # zero mm4 \n\
movdqa      (%0), %%xmm6    # Load 16 Y       Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define SSE2_INIT_16_UNALIGNED "\n\
movq        (%1), %%xmm0    # Load 8 Cb       00 00 00 00 u3 u2 u1 u0 \n\
movq        (%2), %%xmm1    # Load 8 Cr       00 00 00 00 v3 v2 v1 v0 \n\
pxor      %%xmm4, %%xmm4    # zero mm4 \n\
movdqu      (%0), %%xmm6    # Load 16 Y       Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
prefetchnta (%3)            # Tell CPU not to cache output RGB data \n\
"
#define SSE2_INIT_32_ALIGNED "\n\
movq        (%1), %%xmm0    # Load 8 Cb       00 00 00 00 u3 u2 u1 u0 \n\
movq        (%2), %%xmm1    # Load 8 Cr       00 00 00 00 v3 v2 v1 v0 \n\
pxor      %%xmm4, %%xmm4    # zero mm4 \n\
movdqa      (%0), %%xmm6    # Load 16 Y       Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define SSE2_INIT_32_UNALIGNED "\n\
movq        (%1), %%xmm0    # Load 8 Cb       00 00 00 00 u3 u2 u1 u0 \n\
movq        (%2), %%xmm1    # Load 8 Cr       00 00 00 00 v3 v2 v1 v0 \n\
pxor      %%xmm4, %%xmm4    # zero mm4 \n\
movdqu      (%0), %%xmm6    # Load 16 Y       Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
prefetchnta (%3)            # Tell CPU not to cache output RGB data \n\
"
#define SSE2_YUV_MUL "\n\
# convert the chroma part \n\
punpcklbw %%xmm4, %%xmm0        # scatter 8 Cb    00 u3 00 u2 00 u1 00 u0 \n\
punpcklbw %%xmm4, %%xmm1        # scatter 8 Cr    00 v3 00 v2 00 v1 00 v0 \n\
movl      $0x00800080, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to     0080 0080 ... 0080 0080 \n\
psubsw    %%xmm5, %%xmm0        # Cb -= 128 \n\
psubsw    %%xmm5, %%xmm1        # Cr -= 128 \n\
psllw     $3, %%xmm0            # Promote precision \n\
psllw     $3, %%xmm1            # Promote precision \n\
movdqa    %%xmm0, %%xmm2        # Copy 8 Cb       00 u3 00 u2 00 u1 00 u0 \n\
movdqa    %%xmm1, %%xmm3        # Copy 8 Cr       00 v3 00 v2 00 v1 00 v0 \n\
movl      $0xf37df37d, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to     f37d f37d ... f37d f37d \n\
pmulhw    %%xmm5, %%xmm2        # Mul Cb with green coeff -> Cb green \n\
movl      $0xe5fce5fc, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to     e5fc e5fc ... e5fc e5fc \n\
pmulhw    %%xmm5, %%xmm3        # Mul Cr with green coeff -> Cr green \n\
movl      $0x40934093, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to     4093 4093 ... 4093 4093 \n\
pmulhw    %%xmm5, %%xmm0        # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\
movl      $0x33123312, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to     3312 3312 ... 3312 3312 \n\
pmulhw    %%xmm5, %%xmm1        # Mul Cr -> Cred  00 r3 00 r2 00 r1 00 r0 \n\
paddsw    %%xmm3, %%xmm2        # Cb green + Cr green -> Cgreen \n\
\n\
# convert the luma part \n\
movl      $0x10101010, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to     1010 1010 ... 1010 1010 \n\
psubusb   %%xmm5, %%xmm6        # Y -= 16 \n\
movdqa    %%xmm6, %%xmm7        # Copy 16 Y       Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
movl      $0x00ff00ff, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     00ff 00ff ... 00ff 00ff \n\
pand      %%xmm5, %%xmm6        # get Y even      00 Y6 00 Y4 00 Y2 00 Y0 \n\
psrlw     $8, %%xmm7            # get Y odd       00 Y7 00 Y5 00 Y3 00 Y1 \n\
psllw     $3, %%xmm6            # Promote precision \n\
psllw     $3, %%xmm7            # Promote precision \n\
movl      $0x253f253f, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     253f 253f ... 253f 253f \n\
pmulhw    %%xmm5, %%xmm6        # Mul 8 Y even    00 y6 00 y4 00 y2 00 y0 \n\
pmulhw    %%xmm5, %%xmm7        # Mul 8 Y odd     00 y7 00 y5 00 y3 00 y1 \n\
"
#define SSE2_YUV_ADD "\n\
# Do horizontal and vertical scaling \n\
movdqa    %%xmm0, %%xmm3        # Copy Cblue \n\
movdqa    %%xmm1, %%xmm4        # Copy Cred \n\
movdqa    %%xmm2, %%xmm5        # Copy Cgreen \n\
paddsw    %%xmm6, %%xmm0        # Y even + Cblue  00 B6 00 B4 00 B2 00 B0 \n\
paddsw    %%xmm7, %%xmm3        # Y odd  + Cblue  00 B7 00 B5 00 B3 00 B1 \n\
paddsw    %%xmm6, %%xmm1        # Y even + Cred   00 R6 00 R4 00 R2 00 R0 \n\
paddsw    %%xmm7, %%xmm4        # Y odd  + Cred   00 R7 00 R5 00 R3 00 R1 \n\
paddsw    %%xmm6, %%xmm2        # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\
paddsw    %%xmm7, %%xmm5        # Y odd  + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\
\n\
# Limit RGB even to 0..255 \n\
packuswb  %%xmm0, %%xmm0        # B6 B4 B2 B0 / B6 B4 B2 B0 \n\
packuswb  %%xmm1, %%xmm1        # R6 R4 R2 R0 / R6 R4 R2 R0 \n\
packuswb  %%xmm2, %%xmm2        # G6 G4 G2 G0 / G6 G4 G2 G0 \n\
\n\
# Limit RGB odd to 0..255 \n\
packuswb  %%xmm3, %%xmm3        # B7 B5 B3 B1 / B7 B5 B3 B1 \n\
packuswb  %%xmm4, %%xmm4        # R7 R5 R3 R1 / R7 R5 R3 R1 \n\
packuswb  %%xmm5, %%xmm5        # G7 G5 G3 G1 / G7 G5 G3 G1 \n\
\n\
# Interleave RGB even and odd \n\
punpcklbw %%xmm3, %%xmm0        #                 B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%xmm4, %%xmm1        #                 R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%xmm5, %%xmm2        #                 G7 G6 G5 G4 G3 G2 G1 G0 \n\
"
#define SSE2_UNPACK_15_ALIGNED "\n\
# mask unneeded bits off \n\
movl      $0xf8f8f8f8, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     f8f8 f8f8 ... f8f8 f8f8 \n\
pand      %%xmm5, %%xmm0        # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
psrlw     $3,%%xmm0             # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
pand      %%xmm5, %%xmm2        # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
pand      %%xmm5, %%xmm1        # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
psrlw     $1,%%xmm1             # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
pxor      %%xmm4, %%xmm4        # zero mm4 \n\
movdqa    %%xmm0, %%xmm5        # Copy B15-B0 \n\
movdqa    %%xmm2, %%xmm7        # Copy G15-G0 \n\
\n\
# convert rgb24 plane to rgb15 pack for pixel 0-7 \n\
punpcklbw %%xmm4, %%xmm2        # ________ ________ g7g6g5g4 g3______ \n\
punpcklbw %%xmm1, %%xmm0        # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $2,%%xmm2             # ________ ____g7g6 g5g4g3__ ________ \n\
por       %%xmm2, %%xmm0        # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
movntdq   %%xmm0, (%3)          # store pixel 0-7 \n\
\n\
# convert rgb24 plane to rgb15 pack for pixel 8-15 \n\
punpckhbw %%xmm4, %%xmm7        # ________ ________ g7g6g5g4 g3______ \n\
punpckhbw %%xmm1, %%xmm5        # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $2,%%xmm7             # ________ ____g7g6 g5g4g3__ ________ \n\
por       %%xmm7, %%xmm5        # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
movntdq   %%xmm5, 16(%3)        # store pixel 8-15 \n\
"
#define SSE2_UNPACK_15_UNALIGNED "\n\
# mask unneeded bits off \n\
movl      $0xf8f8f8f8, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     f8f8 f8f8 ... f8f8 f8f8 \n\
pand      %%xmm5, %%xmm0        # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
psrlw     $3,%%xmm0             # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
pand      %%xmm5, %%xmm2        # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
pand      %%xmm5, %%xmm1        # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
psrlw     $1,%%xmm1             # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
pxor      %%xmm4, %%xmm4        # zero mm4 \n\
movdqa    %%xmm0, %%xmm5        # Copy B15-B0 \n\
movdqa    %%xmm2, %%xmm7        # Copy G15-G0 \n\
\n\
# convert rgb24 plane to rgb15 pack for pixel 0-7 \n\
punpcklbw %%xmm4, %%xmm2        # ________ ________ g7g6g5g4 g3______ \n\
punpcklbw %%xmm1, %%xmm0        # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $2,%%xmm2             # ________ ____g7g6 g5g4g3__ ________ \n\
por       %%xmm2, %%xmm0        # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
movdqu    %%xmm0, (%3)          # store pixel 0-7 \n\
\n\
# convert rgb24 plane to rgb15 pack for pixel 8-15 \n\
punpckhbw %%xmm4, %%xmm7        # ________ ________ g7g6g5g4 g3______ \n\
punpckhbw %%xmm1, %%xmm5        # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $2,%%xmm7             # ________ ____g7g6 g5g4g3__ ________ \n\
por       %%xmm7, %%xmm5        # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
movdqu    %%xmm5, 16(%3)        # store pixel 8-15 \n\
"
#define SSE2_UNPACK_16_ALIGNED "\n\
# mask unneeded bits off \n\
movl      $0xf8f8f8f8, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     f8f8 f8f8 ... f8f8 f8f8 \n\
pand      %%xmm5, %%xmm0        # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
pand      %%xmm5, %%xmm1        # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
movl      $0xfcfcfcfc, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     fcfc fcfc ... fcfc fcfc \n\
pand      %%xmm5, %%xmm2        # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
psrlw     $3,%%xmm0             # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
pxor      %%xmm4, %%xmm4        # zero mm4 \n\
movdqa    %%xmm0, %%xmm5        # Copy B15-B0 \n\
movdqa    %%xmm2, %%xmm7        # Copy G15-G0 \n\
\n\
# convert rgb24 plane to rgb16 pack for pixel 0-7 \n\
punpcklbw %%xmm4, %%xmm2        # ________ ________ g7g6g5g4 g3g2____ \n\
punpcklbw %%xmm1, %%xmm0        # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $3,%%xmm2             # ________ __g7g6g5 g4g3g2__ ________ \n\
por       %%xmm2, %%xmm0        # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
movntdq   %%xmm0, (%3)          # store pixel 0-7 \n\
\n\
# convert rgb24 plane to rgb16 pack for pixel 8-15 \n\
punpckhbw %%xmm4, %%xmm7        # ________ ________ g7g6g5g4 g3g2____ \n\
punpckhbw %%xmm1, %%xmm5        # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $3,%%xmm7             # ________ __g7g6g5 g4g3g2__ ________ \n\
por       %%xmm7, %%xmm5        # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
movntdq   %%xmm5, 16(%3)        # store pixel 8-15 \n\
"
#define SSE2_UNPACK_16_UNALIGNED "\n\
# mask unneeded bits off \n\
movl      $0xf8f8f8f8, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     f8f8 f8f8 ... f8f8 f8f8 \n\
pand      %%xmm5, %%xmm0        # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
pand      %%xmm5, %%xmm1        # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
movl      $0xfcfcfcfc, %%eax    # \n\
movd      %%eax, %%xmm5         # \n\
pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     fcfc fcfc ... fcfc fcfc \n\
pand      %%xmm5, %%xmm2        # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
psrlw     $3,%%xmm0             # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
pxor      %%xmm4, %%xmm4        # zero mm4 \n\
movdqa    %%xmm0, %%xmm5        # Copy B15-B0 \n\
movdqa    %%xmm2, %%xmm7        # Copy G15-G0 \n\
\n\
# convert rgb24 plane to rgb16 pack for pixel 0-7 \n\
punpcklbw %%xmm4, %%xmm2        # ________ ________ g7g6g5g4 g3g2____ \n\
punpcklbw %%xmm1, %%xmm0        # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $3,%%xmm2             # ________ __g7g6g5 g4g3g2__ ________ \n\
por       %%xmm2, %%xmm0        # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
movdqu    %%xmm0, (%3)          # store pixel 0-7 \n\
\n\
# convert rgb24 plane to rgb16 pack for pixel 8-15 \n\
punpckhbw %%xmm4, %%xmm7        # ________ ________ g7g6g5g4 g3g2____ \n\
punpckhbw %%xmm1, %%xmm5        # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw     $3,%%xmm7             # ________ __g7g6g5 g4g3g2__ ________ \n\
por       %%xmm7, %%xmm5        # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
movdqu    %%xmm5, 16(%3)        # store pixel 8-15 \n\
"
#define SSE2_UNPACK_32_ARGB_ALIGNED "\n\
pxor      %%xmm3, %%xmm3  # zero xmm3 \n\
movdqa    %%xmm0, %%xmm4  #               B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%xmm2, %%xmm4  #               G3 B3 G2 B2 G1 B1 G0 B0 \n\
movdqa    %%xmm1, %%xmm5  #               R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%xmm3, %%xmm5  #               00 R3 00 R2 00 R1 00 R0 \n\
movdqa    %%xmm4, %%xmm6  #               G3 B3 G2 B2 G1 B1 G0 B0 \n\
punpcklwd %%xmm5, %%xmm4  #               00 R1 B1 G1 00 R0 B0 G0 \n\
movntdq   %%xmm4, (%3)    # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\
punpckhwd %%xmm5, %%xmm6  #               00 R3 B3 G3 00 R2 B2 G2 \n\
movntdq   %%xmm6, 16(%3)  # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\
punpckhbw %%xmm2, %%xmm0  #               G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpckhbw %%xmm3, %%xmm1  #               00 R7 00 R6 00 R5 00 R4 \n\
movdqa    %%xmm0, %%xmm5  #               G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpcklwd %%xmm1, %%xmm5  #               00 R5 B5 G5 00 R4 B4 G4 \n\
movntdq   %%xmm5, 32(%3)  # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\
punpckhwd %%xmm1, %%xmm0  #               00 R7 B7 G7 00 R6 B6 G6 \n\
movntdq   %%xmm0, 48(%3)  # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
"
#define SSE2_UNPACK_32_ARGB_UNALIGNED "\n\
pxor      %%xmm3, %%xmm3  # zero xmm3 \n\
movdqa    %%xmm0, %%xmm4  #               B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%xmm2, %%xmm4  #               G3 B3 G2 B2 G1 B1 G0 B0 \n\
movdqa    %%xmm1, %%xmm5  #               R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%xmm3, %%xmm5  #               00 R3 00 R2 00 R1 00 R0 \n\
movdqa    %%xmm4, %%xmm6  #               G3 B3 G2 B2 G1 B1 G0 B0 \n\
punpcklwd %%xmm5, %%xmm4  #               00 R1 B1 G1 00 R0 B0 G0 \n\
movdqu    %%xmm4, (%3)    # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\
punpckhwd %%xmm5, %%xmm6  #               00 R3 B3 G3 00 R2 B2 G2 \n\
movdqu    %%xmm6, 16(%3)  # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\
punpckhbw %%xmm2, %%xmm0  #               G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpckhbw %%xmm3, %%xmm1  #               00 R7 00 R6 00 R5 00 R4 \n\
movdqa    %%xmm0, %%xmm5  #               G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpcklwd %%xmm1, %%xmm5  #               00 R5 B5 G5 00 R4 B4 G4 \n\
movdqu    %%xmm5, 32(%3)  # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\
punpckhwd %%xmm1, %%xmm0  #               00 R7 B7 G7 00 R6 B6 G6 \n\
movdqu    %%xmm0, 48(%3)  # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
"
  792. #define SSE2_UNPACK_32_RGBA_ALIGNED "                                       n
  793. pxor      %%xmm3, %%xmm3  # zero mm3                                        n
  794. movdqa    %%xmm2, %%xmm4  #                 G7 G6 G5 G4 G3 G2 G1 G0         n
  795. punpcklbw %%xmm1, %%xmm4  #                 R3 G3 R2 G2 R1 G1 R0 G0         n
  796. punpcklbw %%xmm0, %%xmm3  #                 B3 00 B2 00 B1 00 B0 00         n
  797. movdqa    %%xmm3, %%xmm5  #                 R3 00 R2 00 R1 00 R0 00         n
  798. punpcklwd %%xmm4, %%xmm3  #                 R1 G1 B1 00 R0 B0 G0 00         n
  799. movntdq   %%xmm3, (%3)    # Store RGBA3 RGBA2 RGBA1 RGBA0                   n
  800. punpckhwd %%xmm4, %%xmm5  #                 R3 G3 B3 00 R2 G2 B2 00         n
  801. movntdq   %%xmm5, 16(%3)  # Store RGBA7 RGBA6 RGBA5 RGBA4                   n
  802. pxor      %%xmm6, %%xmm6  # zero mm6                                        n
  803. punpckhbw %%xmm1, %%xmm2  #                 R7 G7 R6 G6 R5 G5 R4 G4         n
  804. punpckhbw %%xmm0, %%xmm6  #                 B7 00 B6 00 B5 00 B4 00         n
  805. movdqa    %%xmm6, %%xmm0  #                 B7 00 B6 00 B5 00 B4 00         n
  806. punpcklwd %%xmm2, %%xmm6  #                 R5 G5 B5 00 R4 G4 B4 00         n
  807. movntdq   %%xmm6, 32(%3)  # Store BGRA11 BGRA10 BGRA9 RGBA8                 n
  808. punpckhwd %%xmm2, %%xmm0  #                 R7 G7 B7 00 R6 G6 B6 00         n
  809. movntdq   %%xmm0, 48(%3)  # Store RGBA15 RGBA14 RGBA13 RGBA12               n
  810. "
  811. #define SSE2_UNPACK_32_RGBA_UNALIGNED "                                     n
  812. pxor      %%xmm3, %%xmm3  # zero mm3                                        n
  813. movdqa    %%xmm2, %%xmm4  #                 G7 G6 G5 G4 G3 G2 G1 G0         n
  814. punpcklbw %%xmm1, %%xmm4  #                 R3 G3 R2 G2 R1 G1 R0 G0         n
  815. punpcklbw %%xmm0, %%xmm3  #                 B3 00 B2 00 B1 00 B0 00         n
  816. movdqa    %%xmm3, %%xmm5  #                 R3 00 R2 00 R1 00 R0 00         n
  817. punpcklwd %%xmm4, %%xmm3  #                 R1 G1 B1 00 R0 B0 G0 00         n
  818. movdqu    %%xmm3, (%3)    # Store RGBA3 RGBA2 RGBA1 RGBA0                   n
  819. punpckhwd %%xmm4, %%xmm5  #                 R3 G3 B3 00 R2 G2 B2 00         n
  820. movdqu    %%xmm5, 16(%3)  # Store RGBA7 RGBA6 RGBA5 RGBA4                   n
  821. pxor      %%xmm6, %%xmm6  # zero xmm6                                       n
  822. punpckhbw %%xmm1, %%xmm2  #                 R7 G7 R6 G6 R5 G5 R4 G4         n
  823. punpckhbw %%xmm0, %%xmm6  #                 B7 00 B6 00 B5 00 B4 00         n
  824. movdqa    %%xmm6, %%xmm0  #                 B7 00 B6 00 B5 00 B4 00         n
  825. punpcklwd %%xmm2, %%xmm6  #                 R5 G5 B5 00 R4 G4 B4 00         n
  826. movdqu    %%xmm6, 32(%3)  # Store RGBA11 RGBA10 RGBA9 RGBA8                 n
  827. punpckhwd %%xmm2, %%xmm0  #                 R7 G7 B7 00 R6 G6 B6 00         n
  828. movdqu    %%xmm0, 48(%3)  # Store RGBA15 RGBA14 RGBA13 RGBA12               n
  829. "
  830. #define SSE2_UNPACK_32_BGRA_ALIGNED "                                       n
  831. pxor      %%xmm3, %%xmm3  # zero xmm3                                       n
  832. movdqa    %%xmm2, %%xmm4  #                 G7 G6 G5 G4 G3 G2 G1 G0         n
  833. punpcklbw %%xmm0, %%xmm4  #                 B3 G3 B2 G2 B1 G1 B0 G0         n
  834. punpcklbw %%xmm1, %%xmm3  #                 R3 00 R2 00 R1 00 R0 00         n
  835. movdqa    %%xmm3, %%xmm5  #                 R3 00 R2 00 R1 00 R0 00         n
  836. punpcklwd %%xmm4, %%xmm3  #                 B1 G1 R1 00 B0 G0 R0 00         n
  837. movntdq   %%xmm3, (%3)    # Store BGRA3 BGRA2 BGRA1 BGRA0                   n
  838. punpckhwd %%xmm4, %%xmm5  #                 B3 G3 R3 00 B2 G2 R2 00         n
  839. movntdq   %%xmm5, 16(%3)  # Store BGRA7 BGRA6 BGRA5 BGRA4                   n
  840. pxor      %%xmm6, %%xmm6  # zero xmm6                                       n
  841. punpckhbw %%xmm0, %%xmm2  #                 B7 G7 B6 G6 B5 G5 B4 G4         n
  842. punpckhbw %%xmm1, %%xmm6  #                 R7 00 R6 00 R5 00 R4 00         n
  843. movdqa    %%xmm6, %%xmm0  #                 R7 00 R6 00 R5 00 R4 00         n
  844. punpcklwd %%xmm2, %%xmm6  #                 B5 G5 R5 00 B4 G4 R4 00         n
  845. movntdq   %%xmm6, 32(%3)  # Store BGRA11 BGRA10 BGRA9 BGRA8                 n
  846. punpckhwd %%xmm2, %%xmm0  #                 B7 G7 R7 00 B6 G6 R6 00         n
  847. movntdq   %%xmm0, 48(%3)  # Store BGRA15 BGRA14 BGRA13 BGRA12               n
  848. "
  849. #define SSE2_UNPACK_32_BGRA_UNALIGNED "                                     n
  850. pxor      %%xmm3, %%xmm3  # zero xmm3                                       n
  851. movdqa    %%xmm2, %%xmm4  #                 G7 G6 G5 G4 G3 G2 G1 G0         n
  852. punpcklbw %%xmm0, %%xmm4  #                 B3 G3 B2 G2 B1 G1 B0 G0         n
  853. punpcklbw %%xmm1, %%xmm3  #                 R3 00 R2 00 R1 00 R0 00         n
  854. movdqa    %%xmm3, %%xmm5  #                 R3 00 R2 00 R1 00 R0 00         n
  855. punpcklwd %%xmm4, %%xmm3  #                 B1 G1 R1 00 B0 G0 R0 00         n
  856. movdqu    %%xmm3, (%3)    # Store BGRA3 BGRA2 BGRA1 BGRA0                   n
  857. punpckhwd %%xmm4, %%xmm5  #                 B3 G3 R3 00 B2 G2 R2 00         n
  858. movdqu    %%xmm5, 16(%3)  # Store BGRA7 BGRA6 BGRA5 BGRA4                   n
  859. pxor      %%xmm6, %%xmm6  # zero xmm6                                       n
  860. punpckhbw %%xmm0, %%xmm2  #                 B7 G7 B6 G6 B5 G5 B4 G4         n
  861. punpckhbw %%xmm1, %%xmm6  #                 R7 00 R6 00 R5 00 R4 00         n
  862. movdqa    %%xmm6, %%xmm0  #                 R7 00 R6 00 R5 00 R4 00         n
  863. punpcklwd %%xmm2, %%xmm6  #                 B5 G5 R5 00 B4 G4 R4 00         n
  864. movdqu    %%xmm6, 32(%3)  # Store BGRA11 BGRA10 BGRA9 BGRA8                 n
  865. punpckhwd %%xmm2, %%xmm0  #                 B7 G7 R7 00 B6 G6 R6 00         n
  866. movdqu    %%xmm0, 48(%3)  # Store BGRA15 BGRA14 BGRA13 BGRA12               n
  867. "
  868. #define SSE2_UNPACK_32_ABGR_ALIGNED "                                       n
  869. pxor      %%xmm3, %%xmm3  # zero xmm3                                       n
  870. movdqa    %%xmm1, %%xmm4  #                 R7 R6 R5 R4 R3 R2 R1 R0         n
  871. punpcklbw %%xmm2, %%xmm4  #                 G3 R3 G2 R2 G1 R1 G0 R0         n
  872. movdqa    %%xmm0, %%xmm5  #                 B7 B6 B5 B4 B3 B2 B1 B0         n
  873. punpcklbw %%xmm3, %%xmm5  #                 00 B3 00 B2 00 B1 00 B0         n
  874. movdqa    %%xmm4, %%xmm6  #                 G3 R3 G2 R2 G1 R1 G0 R0         n
  875. punpcklwd %%xmm5, %%xmm4  #                 00 B1 G1 R1 00 B0 G0 R0         n
  876. movntdq   %%xmm4, (%3)    # Store ABGR3 ABGR2 ABGR1 ABGR0                   n
  877. punpckhwd %%xmm5, %%xmm6  #                 00 B3 G3 R3 00 B2 G2 R2         n
  878. movntdq   %%xmm6, 16(%3)  # Store ABGR7 ABGR6 ABGR5 ABGR4                   n
  879. punpckhbw %%xmm2, %%xmm1  #                 G7 R7 G6 R6 G5 R5 G4 R4         n
  880. punpckhbw %%xmm3, %%xmm0  #                 00 B7 00 B6 00 B5 00 B4         n
  881. movdqa    %%xmm1, %%xmm2  #                 G7 R7 G6 R6 G5 R5 G4 R4         n
  882. punpcklwd %%xmm0, %%xmm1  #                 00 B5 G5 R5 00 B4 G4 R4         n
  883. movntdq   %%xmm1, 32(%3)  # Store ABGR11 ABGR10 ABGR9 ABGR8                 n
  884. punpckhwd %%xmm0, %%xmm2  #                 00 B7 G7 R7 00 B6 G6 R6         n
  885. movntdq   %%xmm2, 48(%3)  # Store ABGR15 ABGR14 ABGR13 ABGR12               n
  886. "
  887. #define SSE2_UNPACK_32_ABGR_UNALIGNED "                                     n
  888. pxor      %%xmm3, %%xmm3  # zero xmm3                                       n
  889. movdqa    %%xmm1, %%xmm4  #                 R7 R6 R5 R4 R3 R2 R1 R0         n
  890. punpcklbw %%xmm2, %%xmm4  #                 G3 R3 G2 R2 G1 R1 G0 R0         n
  891. movdqa    %%xmm0, %%xmm5  #                 B7 B6 B5 B4 B3 B2 B1 B0         n
  892. punpcklbw %%xmm3, %%xmm5  #                 00 B3 00 B2 00 B1 00 B0         n
  893. movdqa    %%xmm4, %%xmm6  #                 G3 R3 G2 R2 G1 R1 G0 R0         n
  894. punpcklwd %%xmm5, %%xmm4  #                 00 B1 G1 R1 00 B0 G0 R0         n
  895. movdqu    %%xmm4, (%3)    # Store ABGR3 ABGR2 ABGR1 ABGR0                   n
  896. punpckhwd %%xmm5, %%xmm6  #                 00 B3 G3 R3 00 B2 G2 R2         n
  897. movdqu    %%xmm6, 16(%3)  # Store ABGR7 ABGR6 ABGR5 ABGR4                   n
  898. punpckhbw %%xmm2, %%xmm1  #                 G7 R7 G6 R6 G5 R5 G4 R4         n
  899. punpckhbw %%xmm3, %%xmm0  #                 00 B7 00 B6 00 B5 00 B4         n
  900. movdqa    %%xmm1, %%xmm2  #                 G7 R7 G6 R6 G5 R5 G4 R4         n
  901. punpcklwd %%xmm0, %%xmm1  #                 00 B5 G5 R5 00 B4 G4 R4         n
  902. movdqu    %%xmm1, 32(%3)  # Store ABGR11 ABGR10 ABGR9 ABGR8                 n
  903. punpckhwd %%xmm0, %%xmm2  #                 00 B7 G7 R7 00 B6 G6 R6         n
  904. movdqu    %%xmm2, 48(%3)  # Store ABGR15 ABGR14 ABGR13 ABGR12               n
  905. "
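/* Note: the *_ALIGNED unpack fragments above use non-temporal movntdq
 * stores and therefore require a 16-byte aligned destination, while the
 * *_UNALIGNED fragments fall back to movdqu; apart from the store
 * instruction the register shuffling is identical in both flavours. */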
  906. #elif defined(HAVE_SSE2_INTRINSICS)
  907. /* SSE2 intrinsics */
  908. #include <emmintrin.h>
  909. #define SSE2_CALL(SSE2_INSTRUCTIONS)        
  910.     do {                                    
  911.         __m128i xmm0, xmm1, xmm2, xmm3,     
  912.                 xmm4, xmm5, xmm6, xmm7;     
  913.         SSE2_INSTRUCTIONS                   
  914.     } while(0)
  915. #define SSE2_END  _mm_sfence()
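/* The intrinsics below mirror the inline-assembly fragments above:
 * SSE2_INIT_* load 8 U, 8 V and 16 Y samples, SSE2_YUV_MUL and
 * SSE2_YUV_ADD perform the fixed-point colour-space conversion, and the
 * SSE2_UNPACK_* fragments write 16 output pixels to p_buffer, with
 * SSE2_END fencing the non-temporal stores of the *_ALIGNED variants.
 *
 * A rough usage sketch, not code from this header (the real conversion
 * loops live in the callers), assuming p_y, p_u, p_v and p_buffer point
 * at the current samples of a row in a 32-bpp ARGB conversion:
 *
 *     SSE2_CALL(
 *         SSE2_INIT_32_ALIGNED
 *         SSE2_YUV_MUL
 *         SSE2_YUV_ADD
 *         SSE2_UNPACK_32_ARGB_ALIGNED
 *     );
 *     p_y += 16; p_u += 8; p_v += 8; p_buffer += 16;
 *
 * followed by SSE2_END once the picture is finished.  The same
 * composition works in the assembly branch, where the fragments are
 * string literals pasted into the corresponding SSE2_CALL wrapper. */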
  916. #define SSE2_INIT_16_ALIGNED                
  917.     xmm0 = _mm_loadl_epi64((__m128i *)p_u); 
  918.     xmm1 = _mm_loadl_epi64((__m128i *)p_v); 
  919.     xmm4 = _mm_setzero_si128();             
  920.     xmm6 = _mm_load_si128((__m128i *)p_y);
  921. #define SSE2_INIT_16_UNALIGNED              
  922.     xmm0 = _mm_loadl_epi64((__m128i *)p_u); 
  923.     xmm1 = _mm_loadl_epi64((__m128i *)p_v); 
  924.     xmm4 = _mm_setzero_si128();             
  925.     xmm6 = _mm_loadu_si128((__m128i *)p_y); 
  926.     _mm_prefetch(p_buffer, _MM_HINT_NTA);
  927. #define SSE2_INIT_32_ALIGNED                
  928.     xmm0 = _mm_loadl_epi64((__m128i *)p_u); 
  929.     xmm1 = _mm_loadl_epi64((__m128i *)p_v); 
  930.     xmm4 = _mm_setzero_si128();             
  931.     xmm6 = _mm_load_si128((__m128i *)p_y);
  932. #define SSE2_INIT_32_UNALIGNED              
  933.     xmm0 = _mm_loadl_epi64((__m128i *)p_u); 
  934.     xmm1 = _mm_loadl_epi64((__m128i *)p_v); 
  935.     xmm4 = _mm_setzero_si128();             
  936.     xmm6 = _mm_loadu_si128((__m128i *)p_y); 
  937.     _mm_prefetch(p_buffer, _MM_HINT_NTA);
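/* SSE2_YUV_MUL: widen U/V to 16 bits, subtract the 128 chroma bias and
 * the 16 luma offset, then apply the fixed-point coefficients (values
 * are shifted left by 3 before _mm_mulhi_epi16).  On exit xmm0 holds the
 * blue chroma term, xmm1 the red chroma term, xmm2 the green chroma
 * term, and xmm6/xmm7 the scaled even/odd luma. */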
  938. #define SSE2_YUV_MUL                        
  939.     xmm0 = _mm_unpacklo_epi8(xmm0, xmm4);   
  940.     xmm1 = _mm_unpacklo_epi8(xmm1, xmm4);   
  941.     xmm5 = _mm_set1_epi32(0x00800080UL);    
  942.     xmm0 = _mm_subs_epi16(xmm0, xmm5);      
  943.     xmm1 = _mm_subs_epi16(xmm1, xmm5);      
  944.     xmm0 = _mm_slli_epi16(xmm0, 3);         
  945.     xmm1 = _mm_slli_epi16(xmm1, 3);         
  946.     xmm2 = xmm0;                            
  947.     xmm3 = xmm1;                            
  948.     xmm5 = _mm_set1_epi32(0xf37df37dUL);    
  949.     xmm2 = _mm_mulhi_epi16(xmm2, xmm5);     
  950.     xmm5 = _mm_set1_epi32(0xe5fce5fcUL);    
  951.     xmm3 = _mm_mulhi_epi16(xmm3, xmm5);     
  952.     xmm5 = _mm_set1_epi32(0x40934093UL);    
  953.     xmm0 = _mm_mulhi_epi16(xmm0, xmm5);     
  954.     xmm5 = _mm_set1_epi32(0x33123312UL);    
  955.     xmm1 = _mm_mulhi_epi16(xmm1, xmm5);     
  956.     xmm2 = _mm_adds_epi16(xmm2, xmm3);      
  957.     
  958.     xmm5 = _mm_set1_epi32(0x10101010UL);    
  959.     xmm6 = _mm_subs_epu8(xmm6, xmm5);       
  960.     xmm7 = xmm6;                            
  961.     xmm5 = _mm_set1_epi32(0x00ff00ffUL);    
  962.     xmm6 = _mm_and_si128(xmm6, xmm5);       
  963.     xmm7 = _mm_srli_epi16(xmm7, 8);         
  964.     xmm6 = _mm_slli_epi16(xmm6, 3);         
  965.     xmm7 = _mm_slli_epi16(xmm7, 3);         
  966.     xmm5 = _mm_set1_epi32(0x253f253fUL);    
  967.     xmm6 = _mm_mulhi_epi16(xmm6, xmm5);     
  968.     xmm7 = _mm_mulhi_epi16(xmm7, xmm5);
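/* SSE2_YUV_ADD: add the chroma terms to the even (xmm6) and odd (xmm7)
 * scaled luma, saturate to 8 bits and re-interleave, leaving xmm0 with
 * 16 blue bytes, xmm1 with 16 red bytes and xmm2 with 16 green bytes. */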
  969. #define SSE2_YUV_ADD                        
  970.     xmm3 = xmm0;                            
  971.     xmm4 = xmm1;                            
  972.     xmm5 = xmm2;                            
  973.     xmm0 = _mm_adds_epi16(xmm0, xmm6);      
  974.     xmm3 = _mm_adds_epi16(xmm3, xmm7);      
  975.     xmm1 = _mm_adds_epi16(xmm1, xmm6);      
  976.     xmm4 = _mm_adds_epi16(xmm4, xmm7);      
  977.     xmm2 = _mm_adds_epi16(xmm2, xmm6);      
  978.     xmm5 = _mm_adds_epi16(xmm5, xmm7);      
  979.     
  980.     xmm0 = _mm_packus_epi16(xmm0, xmm0);    
  981.     xmm1 = _mm_packus_epi16(xmm1, xmm1);    
  982.     xmm2 = _mm_packus_epi16(xmm2, xmm2);    
  983.     
  984.     xmm3 = _mm_packus_epi16(xmm3, xmm3);    
  985.     xmm4 = _mm_packus_epi16(xmm4, xmm4);    
  986.     xmm5 = _mm_packus_epi16(xmm5, xmm5);    
  987.     
  988.     xmm0 = _mm_unpacklo_epi8(xmm0, xmm3);   
  989.     xmm1 = _mm_unpacklo_epi8(xmm1, xmm4);   
  990.     xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
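/* SSE2_UNPACK_15_*: reduce blue/green/red to 5 bits each and merge them
 * into 16 RGB15 (0 rrrrr ggggg bbbbb) pixels, written as two 16-byte
 * stores at p_buffer (streaming stores in the _ALIGNED flavour). */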
  991. #define SSE2_UNPACK_15_ALIGNED                      
  992.     xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);            
  993.     xmm0 = _mm_and_si128(xmm0, xmm5);               
  994.     xmm0 = _mm_srli_epi16(xmm0, 3);                 
  995.     xmm2 = _mm_and_si128(xmm2, xmm5);               
  996.     xmm1 = _mm_and_si128(xmm1, xmm5);               
  997.     xmm1 = _mm_srli_epi16(xmm1, 1);                 
  998.     xmm4 = _mm_setzero_si128();                     
  999.     xmm5 = xmm0;                                    
  1000.     xmm7 = xmm2;                                    
  1001.     
  1002.     xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);           
  1003.     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);           
  1004.     xmm2 = _mm_slli_epi16(xmm2, 2);                 
  1005.     xmm0 = _mm_or_si128(xmm0, xmm2);                
  1006.     _mm_stream_si128((__m128i*)p_buffer, xmm0);     
  1007.     
  1008.     xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);           
  1009.     xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);           
  1010.     xmm7 = _mm_slli_epi16(xmm7, 2);                 
  1011.     xmm5 = _mm_or_si128(xmm5, xmm7);                
  1012.     _mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
  1013. #define SSE2_UNPACK_15_UNALIGNED                    
  1014.     xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);            
  1015.     xmm0 = _mm_and_si128(xmm0, xmm5);               
  1016.     xmm0 = _mm_srli_epi16(xmm0, 3);                 
  1017.     xmm2 = _mm_and_si128(xmm2, xmm5);               
  1018.     xmm1 = _mm_and_si128(xmm1, xmm5);               
  1019.     xmm1 = _mm_srli_epi16(xmm1, 1);                 
  1020.     xmm4 = _mm_setzero_si128();                     
  1021.     xmm5 = xmm0;                                    
  1022.     xmm7 = xmm2;                                    
  1023.     
  1024.     xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);           
  1025.     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);           
  1026.     xmm2 = _mm_slli_epi16(xmm2, 2);                 
  1027.     xmm0 = _mm_or_si128(xmm0, xmm2);                
  1028.     _mm_storeu_si128((__m128i*)p_buffer, xmm0);     
  1029.     
  1030.     xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);           
  1031.     xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);           
  1032.     xmm7 = _mm_slli_epi16(xmm7, 2);                 
  1033.     xmm5 = _mm_or_si128(xmm5, xmm7);                
  1034.     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
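/* SSE2_UNPACK_16_*: same layout as above but RGB16 (rrrrr gggggg bbbbb),
 * keeping 6 bits of green via the 0xfc mask and a shift left by 3. */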
  1035. #define SSE2_UNPACK_16_ALIGNED                      
  1036.     xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);            
  1037.     xmm0 = _mm_and_si128(xmm0, xmm5);               
  1038.     xmm1 = _mm_and_si128(xmm1, xmm5);               
  1039.     xmm5 = _mm_set1_epi32(0xfcfcfcfcUL);            
  1040.     xmm2 = _mm_and_si128(xmm2, xmm5);               
  1041.     xmm0 = _mm_srli_epi16(xmm0, 3);                 
  1042.     xmm4 = _mm_setzero_si128();                     
  1043.     xmm5 = xmm0;                                    
  1044.     xmm7 = xmm2;                                    
  1045.     
  1046.     xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);           
  1047.     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);           
  1048.     xmm2 = _mm_slli_epi16(xmm2, 3);                 
  1049.     xmm0 = _mm_or_si128(xmm0, xmm2);                
  1050.     _mm_stream_si128((__m128i*)p_buffer, xmm0);     
  1051.     
  1052.     xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);           
  1053.     xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);           
  1054.     xmm7 = _mm_slli_epi16(xmm7, 3);                 
  1055.     xmm5 = _mm_or_si128(xmm5, xmm7);                
  1056.     _mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
  1057. #define SSE2_UNPACK_16_UNALIGNED                    
  1058.     xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);            
  1059.     xmm0 = _mm_and_si128(xmm0, xmm5);               
  1060.     xmm1 = _mm_and_si128(xmm1, xmm5);               
  1061.     xmm5 = _mm_set1_epi32(0xfcfcfcfcUL);            
  1062.     xmm2 = _mm_and_si128(xmm2, xmm5);               
  1063.     xmm0 = _mm_srli_epi16(xmm0, 3);                 
  1064.     xmm4 = _mm_setzero_si128();                     
  1065.     xmm5 = xmm0;                                    
  1066.     xmm7 = xmm2;                                    
  1067.     
  1068.     xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);           
  1069.     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);           
  1070.     xmm2 = _mm_slli_epi16(xmm2, 3);                 
  1071.     xmm0 = _mm_or_si128(xmm0, xmm2);                
  1072.     _mm_storeu_si128((__m128i*)p_buffer, xmm0);     
  1073.     
  1074.     xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);           
  1075.     xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);           
  1076.     xmm7 = _mm_slli_epi16(xmm7, 3);                 
  1077.     xmm5 = _mm_or_si128(xmm5, xmm7);                
  1078.     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
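/* SSE2_UNPACK_32_*: expand the blue/green/red byte vectors into 16
 * 32-bit pixels in the requested byte order (the fourth byte is left at
 * zero), written as four 16-byte stores at p_buffer; these mirror the
 * inline-assembly fragments earlier in the file. */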
  1079. #define SSE2_UNPACK_32_ARGB_ALIGNED                 
  1080.     xmm3 = _mm_setzero_si128();                     
  1081.     xmm4 = xmm0;                                    
  1082.     xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);           
  1083.     xmm5 = xmm1;                                    
  1084.     xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);           
  1085.     xmm6 = xmm4;                                    
  1086.     xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);          
  1087.     _mm_stream_si128((__m128i*)(p_buffer), xmm4);   
  1088.     xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);          
  1089.     _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); 
  1090.     xmm0 = _mm_unpackhi_epi8(xmm0, xmm2);           
  1091.     xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);           
  1092.     xmm5 = xmm0;                                    
  1093.     xmm5 = _mm_unpacklo_epi16(xmm5, xmm1);          
  1094.     _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); 
  1095.     xmm0 = _mm_unpackhi_epi16(xmm0, xmm1);          
  1096.     _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
  1097. #define SSE2_UNPACK_32_ARGB_UNALIGNED               
  1098.     xmm3 = _mm_setzero_si128();                     
  1099.     xmm4 = xmm0;                                    
  1100.     xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);           
  1101.     xmm5 = xmm1;                                    
  1102.     xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);           
  1103.     xmm6 = xmm4;                                    
  1104.     xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);          
  1105.     _mm_storeu_si128((__m128i*)(p_buffer), xmm4);   
  1106.     xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);          
  1107.     _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); 
  1108.     xmm0 = _mm_unpackhi_epi8(xmm0, xmm2);           
  1109.     xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);           
  1110.     xmm5 = xmm0;                                    
  1111.     xmm5 = _mm_unpacklo_epi16(xmm5, xmm1);          
  1112.     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); 
  1113.     xmm0 = _mm_unpackhi_epi16(xmm0, xmm1);          
  1114.     _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
  1115. #define SSE2_UNPACK_32_RGBA_ALIGNED                 
  1116.     xmm3 = _mm_setzero_si128();                     
  1117.     xmm4 = xmm2;                                    
  1118.     xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           
  1119.     xmm3 = _mm_unpacklo_epi8(xmm3, xmm0);           
  1120.     xmm5 = xmm3;                                    
  1121.     xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);          
  1122.     _mm_stream_si128((__m128i*)(p_buffer), xmm3);   
  1123.     xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);          
  1124.     _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); 
  1125.     xmm6 = _mm_setzero_si128();                     
  1126.     xmm2 = _mm_unpackhi_epi8(xmm2, xmm1);           
  1127.     xmm6 = _mm_unpackhi_epi8(xmm6, xmm0);           
  1128.     xmm0 = xmm6;                                    
  1129.     xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);          
  1130.     _mm_stream_si128((__m128i*)(p_buffer+8), xmm6); 
  1131.     xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);          
  1132.     _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
  1133. #define SSE2_UNPACK_32_RGBA_UNALIGNED               
  1134.     xmm3 = _mm_setzero_si128();                     
  1135.     xmm4 = xmm2;                                    
  1136.     xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           
  1137.     xmm3 = _mm_unpacklo_epi8(xmm3, xmm0);           
  1138.     xmm5 = xmm3;                                    
  1139.     xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);          
  1140.     _mm_storeu_si128((__m128i*)(p_buffer), xmm3);   
  1141.     xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);          
  1142.     _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); 
  1143.     xmm6 = _mm_setzero_si128();                     
  1144.     xmm2 = _mm_unpackhi_epi8(xmm2, xmm1);           
  1145.     xmm6 = _mm_unpackhi_epi8(xmm6, xmm0);           
  1146.     xmm0 = xmm6;                                    
  1147.     xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);          
  1148.     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); 
  1149.     xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);          
  1150.     _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
  1151. #define SSE2_UNPACK_32_BGRA_ALIGNED                 
  1152.     xmm3 = _mm_setzero_si128();                     
  1153.     xmm4 = xmm2;                                    
  1154.     xmm4 = _mm_unpacklo_epi8(xmm4, xmm0);           
  1155.     xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);           
  1156.     xmm5 = xmm3;                                    
  1157.     xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);          
  1158.     _mm_stream_si128((__m128i*)(p_buffer), xmm3);   
  1159.     xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);          
  1160.     _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); 
  1161.     xmm6 = _mm_setzero_si128();                     
  1162.     xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);           
  1163.     xmm6 = _mm_unpackhi_epi8(xmm6, xmm1);           
  1164.     xmm0 = xmm6;                                    
  1165.     xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);          
  1166.     _mm_stream_si128((__m128i*)(p_buffer+8), xmm6); 
  1167.     xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);          
  1168.     _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
  1169. #define SSE2_UNPACK_32_BGRA_UNALIGNED               
  1170.     xmm3 = _mm_setzero_si128();                     
  1171.     xmm4 = xmm2;                                    
  1172.     xmm4 = _mm_unpacklo_epi8(xmm4, xmm0);           
  1173.     xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);           
  1174.     xmm5 = xmm3;                                    
  1175.     xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);          
  1176.     _mm_storeu_si128((__m128i*)(p_buffer), xmm3);   
  1177.     xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);          
  1178.     _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); 
  1179.     xmm6 = _mm_setzero_si128();                     
  1180.     xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);           
  1181.     xmm6 = _mm_unpackhi_epi8(xmm6, xmm1);           
  1182.     xmm0 = xmm6;                                    
  1183.     xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);          
  1184.     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); 
  1185.     xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);          
  1186.     _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
  1187. #define SSE2_UNPACK_32_ABGR_ALIGNED                 
  1188.     xmm3 = _mm_setzero_si128();                     
  1189.     xmm4 = xmm1;                                    
  1190.     xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);           
  1191.     xmm5 = xmm0;                                    
  1192.     xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);           
  1193.     xmm6 = xmm4;                                    
  1194.     xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);          
  1195.     _mm_stream_si128((__m128i*)(p_buffer), xmm4);   
  1196.     xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);          
  1197.     _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); 
  1198.     xmm1 = _mm_unpackhi_epi8(xmm1, xmm2);           
  1199.     xmm0 = _mm_unpackhi_epi8(xmm0, xmm3);           
  1200.     xmm2 = xmm1;                                    
  1201.     xmm1 = _mm_unpacklo_epi16(xmm1, xmm0);          
  1202.     _mm_stream_si128((__m128i*)(p_buffer+8), xmm1); 
  1203.     xmm2 = _mm_unpackhi_epi16(xmm2, xmm0);          
  1204.     _mm_stream_si128((__m128i*)(p_buffer+12), xmm2);
  1205. #define SSE2_UNPACK_32_ABGR_UNALIGNED               
  1206.     xmm3 = _mm_setzero_si128();                     
  1207.     xmm4 = xmm1;                                    
  1208.     xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);           
  1209.     xmm5 = xmm0;                                    
  1210.     xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);           
  1211.     xmm6 = xmm4;                                    
  1212.     xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);          
  1213.     _mm_storeu_si128((__m128i*)(p_buffer), xmm4);   
  1214.     xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);          
  1215.     _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); 
  1216.     xmm1 = _mm_unpackhi_epi8(xmm1, xmm2);           
  1217.     xmm0 = _mm_unpackhi_epi8(xmm0, xmm3);           
  1218.     xmm2 = xmm1;                                    
  1219.     xmm1 = _mm_unpacklo_epi16(xmm1, xmm0);          
  1220.     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm1); 
  1221.     xmm2 = _mm_unpackhi_epi16(xmm2, xmm0);          
  1222.     _mm_storeu_si128((__m128i*)(p_buffer+12), xmm2);
  1223. #endif
  1224. #endif