blit_wmmx_fix.c
上传用户:wstnjxml
上传日期:2014-04-03
资源大小:7248k
文件大小:21k
源码类别:

Windows CE

开发平台:

C/C++

  1. /*****************************************************************************
  2.  *
  3.  * This program is free software ; you can redistribute it and/or modify
  4.  * it under the terms of the GNU General Public License as published by
  5.  * the Free Software Foundation; either version 2 of the License, or
  6.  * (at your option) any later version.
  7.  *
  8.  * This program is distributed in the hope that it will be useful,
  9.  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10.  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11.  * GNU General Public License for more details.
  12.  *
  13.  * You should have received a copy of the GNU General Public License
  14.  * along with this program; if not, write to the Free Software
  15.  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  16.  *
  17.  * $Id: blit_wmmx_fix.c 271 2005-08-09 08:31:35Z picard $
  18.  *
  19.  * The Core Pocket Media Player
  20.  * Copyright (c) 2004-2005 Gabor Kovacs
  21.  *
  22.  ****************************************************************************/
  23. #include "../common.h"
  24. #include "../dyncode/dyncode.h"
  25. #include "blit_soft.h"
  26. // RScale==16 && !SwapXY 8x2 -> 8x2
  27. // RScale==16 && SwapXY 2x8 -> 8x2
  28. // RScale==8  && !SwapXY 8x2 -> 16x4
  29. // RScale==8  && SwapXY 2x8 -> 16x4
  30. // RScale==32 && !SwapXY 16x4 -> 8x2
  31. // RScale==32 && SwapXY 4x16 -> 8x2
  32. #if defined(ARM) 
  33. typedef struct stack
  34. {
  35. int EndOfRect;
  36. int DstNext;
  37. int SrcNext;
  38. int UVNext;
  39. int StackFrame[STACKFRAME];
  40. //void* this   R0
  41. //char* Dst    R1
  42. //char* Src    R2
  43. //int DstPitch R3 can be signed
  44. int SrcPitch; //can be signed
  45. int Width; 
  46. int Height;
  47. int Src2SrcLast;
  48. } stack;
  49. // r0 bmask (if bpos!=0)
  50. // r1 temp
  51. // r2 dst1 (dst+dstpitch*DoubleY or dst+8*DoubleX*DirX)
  52. // r3 u (-=4 if !swapxy)
  53. // r4 v (-=4 if !swapxy)
  54. // r5..r7 temp
  55. // r8 endofline
  56. // r9 dst0
  57. // r10 dstpitch
  58. // r11 srcpitch
  59. // r12 y0
  60. // r14 y1 (y0+(srchalfy?2:1)*srcpitch)
  61. // wr0..wr2 r0,g0,b0 (from uv)
  62. // wr3..wr5 temp1 (r,g,b)
  63. // wr6..wr8 temp2 (r,g,b)
  64. // wr9,wr10 mask r,g
  65. // wr11 radd (if !dither)
  66. // wr12 gadd (if !dither)
  67. // wr13 badd (if !dither)
  68. // wr11 radd,gadd,badd (if dither)
  69. // wr12,wr13 dither
  70. // wr14 rvmul,gumul,gvmul,bumul
  71. // wr15 ymul
  72. // wcgr0 abs(rpos-8)
  73. // wcgr1 abs(gpos-8)
  74. // wcgr2 abs(bpos-8)
  75. // wcgr3 8 (if HalfMode)
  76. // using: wr0,wr1,wr2,wr6,wr7
  77. static void Fix_UVSwapXY(blit_soft* p,bool_t HalfMode,bool_t Part,bool_t Row,reg Src,reg Dst)
  78. {
  79. if (p->DstHalfY && Part)
  80. {
  81. Byte(); I3S(LDR_POSTSUB,R7,Src,R11,ASR,1); //3
  82. Byte(); I3S(LDR_POSTSUB,R1,Src,R11,ASR,1); //2
  83. Byte(); I3S(LDR_POSTSUB,R6,Src,R11,ASR,1); //1
  84. Byte(); I2C(LDR_POST,R5,Src,Row?1:-1); //0
  85. I2C(TINSRB,Dst,R7,p->DirX>0?7:1);
  86. I2C(TINSRB,Dst,R1,p->DirX>0?5:3);
  87. I2C(TINSRB,Dst,R6,p->DirX>0?3:5);
  88. I2C(TINSRB,Dst,R5,p->DirX>0?1:7);
  89. if (p->SrcHalfY)
  90. I3S(Row?SUB:ADD,Src,Src,R11,LSL,1); // 4 rows
  91. }
  92. else
  93. if (p->DstHalfY && !Part)
  94. {
  95. Byte(); I3S(LDR_POST,R5,Src,R11,ASR,1); //0
  96. Byte(); I3S(LDR_POST,R6,Src,R11,ASR,1); //1
  97. Byte(); I3S(LDR_POST,R1,Src,R11,ASR,1); //2
  98. Byte(); I2C(LDR_POST,R7,Src,1); //3
  99. I2C(TINSRB,Dst,R5,p->DirX>0?1:7);
  100. I2C(TINSRB,Dst,R6,p->DirX>0?3:5);
  101. I2C(TINSRB,Dst,R1,p->DirX>0?5:3);
  102. I2C(TINSRB,Dst,R7,p->DirX>0?7:1);
  103. }
  104. else
  105. {
  106. Byte(); I3S(LDR_POST,R5,Src,R11,ASR,1); //0
  107. Byte(); I3S(LDR_POST,R6,Src,R11,ASR,0); //1
  108. Byte(); I3S(LDR_POSTSUB,R7,Src,R11,ASR,1); //3
  109. Byte(); I3S(LDR_POSTSUB,R1,Src,R11,ASR,0); //2
  110. I2C(TINSRB,Dst,R5,p->DirX>0?1:3);
  111. I2C(TINSRB,Dst,R6,p->DirX>0?3:1);
  112. I2C(TINSRB,Dst,R7,p->DirX>0?7:5);
  113. I2C(TINSRB,Dst,R1,p->DirX>0?5:7);
  114. I2C(ADD,Src,Src,1);
  115. }
  116. }
  117. static void Fix_UV(blit_soft* p,bool_t HalfMode,bool_t Part,bool_t Row)
  118. {
  119. if (p->SwapXY)
  120. {
  121. Fix_UVSwapXY(p,HalfMode,Part,Row,R3,WR6);
  122. Fix_UVSwapXY(p,HalfMode,Part,Row,R4,WR7);
  123. }
  124. else
  125. {
  126. if (p->DstHalfY && Part)
  127. {
  128. // one uv row below
  129. MB(); I3S(ADD,R5,R3,R11,ASR,1);
  130. MB(); I3S(ADD,R6,R4,R11,ASR,1);
  131. MB(); I2C(WLDRW,WR6,R5,0);
  132. MB(); I2C(WLDRW,WR7,R6,0);
  133. }
  134. else
  135. {
  136. int Pre = (HalfMode && !p->SrcHalfX && Row)?0:4;
  137. MB(); I2C(WLDRW_PRE,WR6,R3,Pre);
  138. MB(); I2C(WLDRW_PRE,WR7,R4,Pre);
  139. }
  140. I3(WUNPCKILB,WR6,WR6,WR6);
  141. I3(WUNPCKILB,WR7,WR7,WR7);
  142. if (p->DirX<0)
  143. {
  144. I2C(WSHUFH,WR6,WR6,p->SrcHalfY?0x1B:0xB1); //swap order (2:2)
  145. I2C(WSHUFH,WR7,WR7,p->SrcHalfY?0x1B:0xB1); //swap order (2:2)
  146. }
  147. }
  148. if (HalfMode && !p->DstHalfX)
  149. {
  150. I3(Row==(p->DirX>0)?WUNPCKIHH:WUNPCKILH,WR6,WR6,WR6);
  151. I3(Row==(p->DirX>0)?WUNPCKIHH:WUNPCKILH,WR7,WR7,WR7);
  152. }
  153. //WR6 U
  154. //WR7 V
  155. I2C(WSHUFH,WR1,WR14,0x55); //gumul
  156. I3(WMULUM,WR1,WR1,WR6);
  157. I2C(WSHUFH,WR2,WR14,0xFF); //bumul
  158. I3(WMULUM,WR2,WR2,WR6);
  159. I2C(WSHUFH,WR6,WR14,0xAA); //gvmul
  160. I3(WMULUM,WR6,WR6,WR7);
  161. I2C(WSHUFH,WR0,WR14,0x00); //rvmul
  162. I3(WMULUM,WR0,WR0,WR7);
  163. I3((p->_GUMul<0)^(p->_GVMul<0)?WSUBH:WADDH,WR1,WR1,WR6);
  164. //WR0 R
  165. //WR1 G
  166. //WR2 B
  167. if (p->FX.Flags & BLITFX_DITHER)
  168. {
  169. I2C(WSHUFH,WR6,WR11,0x00); //radd
  170. I2C(WSHUFH,WR7,WR11,0x55); //gadd
  171. I3(p->_RVMul<0?WSUBH:WADDH,WR0,WR6,WR0);
  172. I2C(WSHUFH,WR6,WR11,0xAA); //badd
  173. I3(p->_GUMul<0?WSUBH:WADDH,WR1,WR7,WR1);
  174. I3(p->_BUMul<0?WSUBH:WADDH,WR2,WR6,WR2);
  175. }
  176. else
  177. {
  178. I3(p->_RVMul<0?WSUBH:WADDH,WR0,WR11,WR0);
  179. I3(p->_GUMul<0?WSUBH:WADDH,WR1,WR12,WR1);
  180. I3(p->_BUMul<0?WSUBH:WADDH,WR2,WR13,WR2);
  181. }
  182. }
  183. static void Fix_Y(blit_soft* p,bool_t Row,int Col,bool_t HalfMode)
  184. {
  185. // load to upper 8bits (doesn't matter what is in the lower 8 bits)
  186. // load y0 wr5
  187. // load y1 wr8
  188. reg Dither;
  189. int AddY;
  190. if (p->SwapXY)
  191. {
  192. Dither = (reg)(Col?WR13:WR12);
  193. if (p->ArithStretch && p->SrcHalfX && p->SrcHalfY)
  194. I3(LDR_POST,R5,R12,R11);
  195. I3(LDR_POST,R6,R12,R11);
  196. I3(LDR_POST,R7,R12,R11);
  197. I3(LDR_POST,R1,R12,R11);
  198. I2C(TINSRH,WR5,R5,p->DirX>0?0:3);
  199. I2C(TINSRH,WR6,R6,p->DirX>0?0:3);
  200. I2C(TINSRH,WR5,R7,p->DirX>0?1:2);
  201. I2C(TINSRH,WR6,R1,p->DirX>0?1:2);
  202. I3S(MOV,R5,NONE,R5,LSR,16);
  203. I3S(MOV,R6,NONE,R6,LSR,16);
  204. I3S(MOV,R7,NONE,R7,LSR,16);
  205. I3S(MOV,R1,NONE,R1,LSR,16);
  206. I2C(TINSRH,WR8,R5,p->DirX>0?0:3);
  207. I2C(TINSRH,WR7,R6,p->DirX>0?0:3);
  208. I2C(TINSRH,WR8,R7,p->DirX>0?1:2);
  209. I2C(TINSRH,WR7,R1,p->DirX>0?1:2);
  210. I3(LDR_POST,R5,R12,R11);
  211. I3(LDR_POST,R6,R12,R11);
  212. I3(LDR_POST,R7,R12,R11);
  213. I3(LDR_POST,R1,R12,R11);
  214. I2C(TINSRH,WR5,R5,p->DirX>0?2:1);
  215. I2C(TINSRH,WR6,R6,p->DirX>0?2:1);
  216. I2C(TINSRH,WR5,R7,p->DirX>0?3:0);
  217. I2C(TINSRH,WR6,R1,p->DirX>0?3:0);
  218. I3S(MOV,R5,NONE,R5,LSR,16);
  219. I3S(MOV,R6,NONE,R6,LSR,16);
  220. I3S(MOV,R7,NONE,R7,LSR,16);
  221. I3S(MOV,R1,NONE,R1,LSR,16);
  222. I2C(TINSRH,WR8,R5,p->DirX>0?2:1);
  223. I2C(TINSRH,WR7,R6,p->DirX>0?2:1);
  224. I2C(TINSRH,WR8,R7,p->DirX>0?3:0);
  225. I2C(TINSRH,WR7,R1,p->DirX>0?3:0);
  226. I3(WAVG2B,WR5,WR5,WR6);
  227. I3(WSLLHG,WR6,WR5,WCGR3);
  228. I3(WAVG2B,WR8,WR8,WR7);
  229. I3(WSLLHG,WR7,WR8,WCGR3);
  230. I3(WAVG2B,WR5,WR5,WR6);
  231. I3(WAVG2B,WR8,WR8,WR7);
  232. }
  233. else
  234. if (p->SrcHalfX)
  235. {
  236. reg Add = R11;
  237. if (p->SrcHalfY)
  238. {
  239. I3(ADD,R1,R11,R11);
  240. Add = R1;
  241. }
  242. I3(LDR_POST,R5,R12,Add);
  243. I3(LDR_POST,R6,R12,Add);
  244. I3(LDR_POST,R7,R12,Add);
  245. I3(LDR_POST,R1,R12,Add);
  246. I2C(TINSRH,WR5,R5,p->DirX>0?0:3);
  247. I2C(TINSRH,WR5,R6,p->DirX>0?1:2);
  248. I2C(TINSRH,WR5,R7,p->DirX>0?2:1);
  249. I2C(TINSRH,WR5,R1,p->DirX>0?3:0);
  250. I3S(MOV,R5,NONE,R5,LSR,16);
  251. I3S(MOV,R6,NONE,R6,LSR,16);
  252. I3S(MOV,R7,NONE,R7,LSR,16);
  253. I3S(MOV,R1,NONE,R1,LSR,16);
  254. I2C(TINSRH,WR8,R5,p->DirX>0?0:3);
  255. I2C(TINSRH,WR8,R6,p->DirX>0?1:2);
  256. I2C(TINSRH,WR8,R7,p->DirX>0?2:1);
  257. I2C(TINSRH,WR8,R1,p->DirX>0?3:0);
  258. }
  259. else
  260. {
  261. reg Add = R11;
  262. if (p->SrcHalfY)
  263. {
  264. I3(ADD,R1,R11,R11);
  265. Add = R1;
  266. }
  267. Half(); I3(LDR_POST,R5,R12,Add);
  268. Half(); I3(LDR_POST,R6,R12,Add);
  269. Half(); I3(LDR_POST,R7,R12,Add);
  270. Half(); I3(LDR_POST,R1,R12,Add);
  271. I2C(TINSRB,WR5,R5,p->DirX>0?1:7);
  272. I2C(TINSRH,WR8,R5,p->DirX>0?0:3);
  273. I2C(TINSRB,WR5,R6,p->DirX>0?3:5);
  274. I2C(TINSRH,WR8,R6,p->DirX>0?1:2);
  275. I2C(TINSRB,WR5,R7,p->DirX>0?5:3);
  276. I2C(TINSRH,WR8,R7,p->DirX>0?2:1);
  277. I2C(TINSRB,WR5,R1,p->DirX>0?7:1);
  278. I2C(TINSRH,WR8,R1,p->DirX>0?3:0);
  279. }
  280. if (Row)
  281. {
  282. I3S(SUB,R12,R12,R11,LSL,3+p->SrcHalfY);
  283. I2C(ADD,R12,R12,2 << p->SrcHalfX);
  284. }
  285. }
  286. else
  287. {
  288. Dither = WR12;
  289. if (p->ArithStretch && p->SrcHalfX && p->SrcHalfY)
  290. MB(); I3(ADD,R1,R12,R11);
  291. MB(); I3(ADD,R5,R14,R11);
  292. MB(); I2C(WLDRD_POST,WR5,R12,8);
  293. MB(); I2C(WLDRD,WR6,R1,0);
  294. MB(); I2C(WLDRD_POST,WR8,R14,8);
  295. MB(); I2C(WLDRD,WR7,R5,0);
  296. I3(WAVG2B,WR5,WR5,WR6);
  297. I3(WSLLHG,WR6,WR5,WCGR3);
  298. I3(WAVG2B,WR8,WR8,WR7);
  299. I3(WSLLHG,WR7,WR8,WCGR3);
  300. I3(WAVG2B,WR5,WR5,WR6);
  301. I3(WAVG2B,WR8,WR8,WR7);
  302. }
  303. else
  304. if (p->SrcHalfX)
  305. {
  306. MB(); I2C(WLDRD_POST,WR5,R12,8); // use only every second pixel
  307. MB(); I2C(WLDRD_POST,WR8,R14,8); // use only every second pixel
  308. }
  309. else
  310. {
  311. MB(); I2C(WLDRW_POST,WR5,R12,4);
  312. MB(); I2C(WLDRW_POST,WR8,R14,4);
  313. I3(WUNPCKILB,WR5,WR5,WR5);
  314. I3(WUNPCKILB,WR8,WR8,WR8);
  315. }
  316. if (p->DirX<0)
  317. {
  318. I2C(WSHUFH,WR8,WR8,0x1B); //swap order
  319. I2C(WSHUFH,WR5,WR5,0x1B); //swap order
  320. }
  321. }
  322. MB(); I3(WMULUM,WR5,WR5,WR15);
  323. MB(); I3(WMULUM,WR8,WR8,WR15);
  324. AddY = p->_YMul>=0?WADDH:WSUBH;
  325. if (HalfMode)
  326. {
  327. Fix_UV(p,HalfMode,0,Row); 
  328. I3(AddY,WR3,WR0,WR5);
  329. I3(AddY,WR4,WR1,WR5);
  330. I3(AddY,WR5,WR2,WR5);
  331. if (p->DstHalfY)
  332. Fix_UV(p,HalfMode,1,Row);
  333. I3(AddY,WR6,WR0,WR8);
  334. I3(AddY,WR7,WR1,WR8);
  335. I3(AddY,WR8,WR2,WR8);
  336. I3(WPACKHUS,WR3,WR3,WR6);
  337. I3(WPACKHUS,WR4,WR4,WR7);
  338. I3(WPACKHUS,WR5,WR5,WR8);
  339. }
  340. else
  341. {
  342. // WR0,WR1,WR2 already has UV information
  343. I3(Row?WUNPCKIHH:WUNPCKILH,WR6,WR0,WR0);
  344. I3(Row?WUNPCKIHH:WUNPCKILH,WR7,WR1,WR1);
  345. I3(AddY,WR3,WR6,WR5);
  346. I3(AddY,WR4,WR7,WR5);
  347. I3(AddY,WR6,WR6,WR8);
  348. I3(AddY,WR7,WR7,WR8);
  349. I3(WPACKHUS,WR3,WR3,WR6);
  350. I3(WPACKHUS,WR4,WR4,WR7);
  351. MB(); I3(Row?WUNPCKIHH:WUNPCKILH,WR6,WR2,WR2);
  352. I3(AddY,WR5,WR6,WR5);
  353. I3(AddY,WR8,WR6,WR8);
  354. I3(WPACKHUS,WR5,WR5,WR8);
  355. }
  356. if (p->FX.Flags & BLITFX_DITHER)
  357. {
  358. if (p->DstSize[0] > p->DitherSize || p->DstSize[1] > p->DitherSize || p->DstSize[2] > p->DitherSize)
  359. {
  360. MB(); I3(WXOR,WR6,WR6,WR6);
  361. I3(WAVG2B,WR6,WR6,Dither);
  362. }
  363. I3(WADDBUS,WR3,WR3,(reg)(p->DstSize[0] > p->DitherSize ? WR6:Dither));
  364. I3(WADDBUS,WR4,WR4,(reg)(p->DstSize[1] > p->DitherSize ? WR6:Dither));
  365. I3(WADDBUS,WR5,WR5,(reg)(p->DstSize[2] > p->DitherSize ? WR6:Dither));
  366. }
  367. if (p->DstPos[2]==0 && p->DstPos[0]+p->DstSize[0]==16)
  368. {
  369. // special optimized case for rgb565
  370. // (red and blue word mask in WR9)
  371. I3(WAND,WR4,WR4,WR10);
  372. I3(WSRLDG,WR5,WR5,WCGR2);
  373. I2(WUNPCKEHUB,WR7,WR4);
  374. I3(WUNPCKIHB,WR6,WR5,WR3);
  375. I3(WSLLWG,WR7,WR7,WCGR1);
  376. I3(WAND,WR6,WR6,WR9); 
  377. I3(WOR,WR6,WR6,WR7);
  378. I2(WUNPCKELUB,WR4,WR4);
  379. I3(WUNPCKILB,WR3,WR5,WR3);
  380. I3(WSLLWG,WR4,WR4,WCGR1);
  381. I3(WAND,WR3,WR3,WR9);
  382. I3(WOR,WR3,WR3,WR4);
  383. }
  384. else
  385. {
  386. if (p->DstPos[0]!=0) I3(WAND,WR3,WR3,WR9);
  387. if (p->DstPos[1]!=0) I3(WAND,WR4,WR4,WR10);
  388. if (p->DstPos[2]!=0) 
  389. {
  390. I2(TBCSTH,WR6,R0);
  391. I3(WAND,WR5,WR5,WR6);
  392. }
  393. I2(WUNPCKEHUB,WR6,WR3);
  394. I2(WUNPCKEHUB,WR7,WR4);
  395. I2(WUNPCKEHUB,WR8,WR5);
  396. if (p->DstPos[0]+p->DstSize[0]!=8) I3(p->DstPos[0]+p->DstSize[0]>8?WSLLWG:WSRLWG,WR6,WR6,WCGR0);
  397. if (p->DstPos[1]+p->DstSize[1]!=8) I3(p->DstPos[1]+p->DstSize[1]>8?WSLLWG:WSRLWG,WR7,WR7,WCGR1);
  398. if (p->DstPos[2]+p->DstSize[2]!=8) I3(p->DstPos[2]+p->DstSize[2]>8?WSLLWG:WSRLWG,WR8,WR8,WCGR2);
  399. I2(WUNPCKELUB,WR3,WR3);
  400. I2(WUNPCKELUB,WR4,WR4);
  401. I2(WUNPCKELUB,WR5,WR5);
  402. if (p->DstPos[0]+p->DstSize[0]!=8) I3(p->DstPos[0]+p->DstSize[0]>8?WSLLWG:WSRLWG,WR3,WR3,WCGR0);
  403. if (p->DstPos[1]+p->DstSize[1]!=8) I3(p->DstPos[1]+p->DstSize[1]>8?WSLLWG:WSRLWG,WR4,WR4,WCGR1);
  404. if (p->DstPos[2]+p->DstSize[2]!=8) I3(p->DstPos[2]+p->DstSize[2]>8?WSLLWG:WSRLWG,WR5,WR5,WCGR2);
  405. I3(WOR,WR6,WR6,WR7);
  406. I3(WOR,WR3,WR3,WR4);
  407. I3(WOR,WR6,WR6,WR8);
  408. I3(WOR,WR3,WR3,WR5);
  409. }
  410. if (p->DstDoubleX)
  411. {
  412. I2C(WSHUFH,WR7,WR6,0xFA); // 7 7 6 6
  413. I2C(WSHUFH,WR4,WR3,0xFA); // 3 3 2 2
  414. I2C(WSHUFH,WR6,WR6,0x50); // 5 5 4 4
  415. I2C(WSHUFH,WR3,WR3,0x50); // 1 1 0 0
  416. }
  417. if (p->SwapXY)
  418. {
  419. reg Dst = (reg)(Row?R2:R9);
  420. MB(); I3S(ADD,R1,Dst,R10,LSL,p->DstDoubleY);
  421. MB(); I2C(WSTRD,WR3,Dst,0);
  422. if (p->DstDoubleX)
  423. {
  424. MB(); I2C(WSTRD,WR4,Dst,8);
  425. }
  426. if (p->DstDoubleY)
  427. {
  428. MB(); I3(ADD,R5,Dst,R10);
  429. MB(); I2C(WSTRD,WR3,R5,0);
  430. if (p->DstDoubleX)
  431. {
  432. MB(); I2C(WSTRD,WR4,R5,8);
  433. }
  434. }
  435. MB(); I2C(WSTRD,WR6,R1,0);
  436. if (p->DstDoubleX)
  437. {
  438. MB(); I2C(WSTRD,WR7,R1,8);
  439. }
  440. if (p->DstDoubleY)
  441. {
  442. MB(); I3(ADD,R6,R1,R10);
  443. MB(); I2C(WSTRD,WR6,R6,0);
  444. if (p->DstDoubleX)
  445. {
  446. MB(); I2C(WSTRD,WR7,R6,8);
  447. }
  448. }
  449. MB(); I3S(ADD,Dst,Dst,R10,LSL,1+p->DstDoubleY);
  450. }
  451. else
  452. {
  453. if (p->DstDoubleY)
  454. {
  455. MB(); I3(ADD,R1,R2,R10);
  456. MB(); I3(ADD,R5,R9,R10);
  457. }
  458. if (p->DstDoubleX)
  459. {
  460. MB(); I2C(WSTRD_POST,WR6,R2,8);
  461. MB(); I2C(WSTRD_POST,WR3,R9,8);
  462. }
  463. else
  464. {
  465. MB(); I2C(WSTRD_POST,WR6,R2,8*p->DirX);
  466. MB(); I2C(WSTRD_POST,WR3,R9,8*p->DirX);
  467. }
  468. if (p->DstDoubleY)
  469. {
  470. MB(); I2C(WSTRD,WR6,R1,0);
  471. MB(); I2C(WSTRD,WR3,R5,0);
  472. }
  473. if (p->DstDoubleX)
  474. {
  475. MB(); I2C(WSTRD_POST,WR7,R2,p->DirX>0?8:-24);
  476. MB(); I2C(WSTRD_POST,WR4,R9,p->DirX>0?8:-24);
  477. if (p->DstDoubleY)
  478. {
  479. MB(); I2C(WSTRD,WR7,R1,8);
  480. MB(); I2C(WSTRD,WR4,R5,8);
  481. }
  482. }
  483. }
  484. }
  485. void WMMXFix_RGB_UV(blit_soft* p)
  486. {
  487. bool_t HalfMode = p->SrcHalfX || p->SrcHalfY;
  488. dyninst* LoopY;
  489. dyninst* LoopX;
  490. dyninst* EndLine;
  491. dyninst* Dither = NULL;
  492. p->SrcAlignPos = p->DstAlignPos = p->DstAlignSize = 8;
  493. if (p->RScaleX==8) p->DstAlignSize = 16;
  494. p->DstStepX = p->DirX * ((p->DstBPP*8) >> 3) << p->DstDoubleX;
  495. p->YMul = InstCreate16(abs(p->_YMul) >> 8,NONE,NONE,NONE,0,0);
  496. p->RVMul = InstCreate16(abs(p->_RVMul) >> 8,NONE,NONE,NONE,0,0);
  497. p->RAdd = InstCreate16((p->_RAdd) >> 16,NONE,NONE,NONE,0,0);
  498. p->GUMul = InstCreate16(abs(p->_GUMul) >> 8,NONE,NONE,NONE,0,0);
  499. p->GVMul = InstCreate16(abs(p->_GVMul) >> 8,NONE,NONE,NONE,0,0);
  500. p->GAdd = InstCreate16((p->_GAdd) >> 16,NONE,NONE,NONE,0,0);
  501. p->BUMul = InstCreate16(abs(p->_BUMul) >> 8,NONE,NONE,NONE,0,0);
  502. p->BAdd = InstCreate16((p->_BAdd) >> 16,NONE,NONE,NONE,0,0);
  503. CodeBegin();
  504. I2C(SUB,SP,SP,OFS(stack,StackFrame));
  505. I1P(WLDRD,WR11,p->RAdd,0);
  506. I1P(WLDRD,WR14,p->RVMul,0);
  507. I3(MOV,R10,NONE,R3); //DstPitch
  508. I2C(LDR,R9,R1,0); //Dst[0] RGB
  509. I2C(LDR,R3,R2,4); //Src[1] U
  510. I2C(LDR,R4,R2,8); //Src[2] V
  511. I2C(LDR,R12,R2,0); //Src[0] Y
  512. I2C(LDR,R11,SP,OFS(stack,SrcPitch));
  513. I2C(WSHUFH,WR15,WR11,0xFF); //ymul
  514. if (!(p->FX.Flags & BLITFX_DITHER))
  515. {
  516. I2C(WSHUFH,WR12,WR11,0x55); //gadd
  517. I2C(WSHUFH,WR13,WR11,0xAA); //badd
  518. I2C(WSHUFH,WR11,WR11,0x00); //radd
  519. }
  520. else
  521. {
  522. int i;
  523. static const uint8_t Matrix0[16] = 
  524. { 0,   8,  2, 10,
  525. 12,  4, 14,  6,
  526. 3,  11,  1,  9,
  527. 15,  7, 13,  5 };
  528. uint8_t Matrix[16];
  529. memcpy(Matrix,Matrix0,sizeof(Matrix));
  530. p->DitherSize = min(p->DstSize[0],min(p->DstSize[1],p->DstSize[2]));
  531. if (p->DitherSize>4)
  532. for (i=0;i<16;++i)
  533. Matrix[i] >>= p->DitherSize-4;
  534. // dither mask
  535. Dither = InstCreate(Matrix,16,NONE,NONE,NONE,0,0);
  536. I1P(WLDRD,WR12,Dither,0);
  537. I1P(WLDRD,WR13,Dither,8);
  538. }
  539.   I2C(LDR,R5,SP,OFS(stack,Height));
  540. I2C(LDR,R6,SP,OFS(stack,Width));
  541. //SrcNext = 2*(SrcHalfY?2:1)*(SwapXY?4:1)*Src->Pitch - (Width*(SrcHalfY?2:1) >> SrcDoubleX)
  542. I3S(MOV,R1,NONE,R11,LSL,1+p->SrcHalfY+p->SwapXY*2);
  543. I3S(SUB,R1,R1,R6,LSR,p->SrcDoubleX-p->SrcHalfX); 
  544. I2C(STR,R1,SP,OFS(stack,SrcNext));
  545. //UVNext = (Src->Pitch >> 1)*(SrcHalfY?2:1)*(SwapXY?4:1) - (Width*(SrcHalfY?2:1) >> SrcDoubleX >> 1);
  546. I3S(MOV,R2,NONE,R11,ASR,1-p->SrcHalfY-p->SwapXY*2);
  547. I3S(SUB,R2,R2,R6,LSR,p->SrcDoubleX+1-p->SrcHalfX); 
  548. I2C(STR,R2,SP,OFS(stack,UVNext));
  549. if (p->DirX<0) //adjust reversed destination for block size
  550. I2C(SUB,R9,R9,-(p->DstStepX >> 1)-(p->DstBPP >> 3));
  551. if (p->SwapXY)
  552. {
  553. // EndOfRect = Dst + ((Height * DstBPP * DirX) >> 3)
  554. I2C(MOV,R1,NONE,p->DstBPP * p->DirX);
  555. I3(MUL,R1,R5,R1);
  556. I3S(ADD,R1,R9,R1,ASR,3);
  557. I2C(STR,R1,SP,OFS(stack,EndOfRect));
  558. //DstNext = DstStepX - Width*DstPitch;
  559. MB(); I3(MUL,R2,R10,R6);
  560. I2C(MOV,R1,NONE,p->DstStepX); 
  561. I3(SUB,R1,R1,R2); 
  562. I2C(STR,R1,SP,OFS(stack,DstNext));
  563. }
  564. else
  565. {
  566. // EndOfRect = Dst + DstPitch * Height
  567. I3(MUL,R1,R10,R5);
  568. I3(ADD,R1,R9,R1);
  569. I2C(STR,R1,SP,OFS(stack,EndOfRect));
  570. //DstNext = ((DstPitch*2 << DstDoubleY) - DirX * Width << DstBPP2;
  571. I3S(MOV,R2,NONE,R10,LSL,p->DstDoubleY+1);
  572. I3S(p->DirX>0?SUB:ADD,R2,R2,R6,LSL,p->DstBPP2); 
  573. I2C(STR,R2,SP,OFS(stack,DstNext));
  574. }
  575. // setup shift registers
  576. // wcgr0 abs(rpos-8)
  577. // wcgr1 abs(gpos-8)
  578. // wcgr2 abs(bpos-8)
  579. // wcgr3 8 (if HalfMode)
  580. I2C(MOV,R5,NONE,abs(p->DstPos[0]+p->DstSize[0]-8));
  581. I2C(MOV,R6,NONE,abs(p->DstPos[1]+p->DstSize[1]-8));
  582. I2C(MOV,R7,NONE,abs(p->DstPos[2]+p->DstSize[2]-8));
  583. I2(TMCR,WCGR0,R5);
  584. I2(TMCR,WCGR1,R6);
  585. I2(TMCR,WCGR2,R7);
  586. if (HalfMode)
  587. {
  588. I2C(MOV,R1,NONE,8);
  589. I2(TMCR,WCGR3,R1);
  590. }
  591. // setup masks
  592. // r0 bmask (if bpos!=0)
  593. // wr9,wr10 mask r,g
  594. I2C(MOV,R1,NONE,((1 << p->DstSize[1])-1)<<(8-p->DstSize[1]));
  595. I2(TBCSTB,WR10,R1);
  596. if (p->DstPos[2]==0 && p->DstPos[0]+p->DstSize[0]==16)
  597. {
  598. // (red and blue word mask in R9)
  599. I2C(MOV,R1,NONE,((1 << p->DstSize[0])-1)<<(16-p->DstSize[0]));
  600. I2C(ORR,R1,R1,(1 << p->DstSize[2])-1);
  601. I2(TBCSTH,WR9,R1);
  602. }
  603. else
  604. {
  605. I2C(MOV,R1,NONE,((1 << p->DstSize[0])-1)<<(8-p->DstSize[0]));
  606. I2C(MOV,R0,NONE,((1 << p->DstSize[2])-1)<<(8-p->DstSize[2]));
  607. I2(TBCSTB,WR9,R1);
  608. }
  609. if (p->SwapXY)
  610. I2C(ADD,R2,R9,(8*p->DirX) << p->DstDoubleX);
  611. else
  612. I3S(ADD,R2,R9,R10,LSL,p->DstDoubleY);
  613. I3S(ADD,R14,R12,R11,LSL,p->SrcHalfY);
  614. if (!p->SwapXY)
  615. {
  616. I2C(SUB,R3,R3,4);
  617. I2C(SUB,R4,R4,4);
  618. }
  619. I2C(LDR,R5,SP,OFS(stack,Width));
  620. LoopY = Label(0);
  621. I0P(B,AL,LoopY);
  622. Align(8);
  623. InstPost(p->RVMul);
  624. InstPost(p->GUMul);
  625. InstPost(p->GVMul);
  626. InstPost(p->BUMul);
  627. InstPost(p->RAdd);
  628. InstPost(p->GAdd);
  629. InstPost(p->BAdd);
  630. InstPost(p->YMul);
  631. if (Dither)
  632. InstPost(Dither);
  633. InstPost(LoopY);
  634. if (p->SwapXY)
  635. {
  636. I3(MUL,R1,R10,R5); //dstpitch * width
  637. I3(ADD,R8,R9,R1);
  638. }
  639. else
  640. {
  641. if (p->DirX > 0)
  642. I3S(ADD,R8,R9,R5,LSL,p->DstBPP2);
  643. else
  644. I3S(SUB,R8,R9,R5,LSL,p->DstBPP2);
  645. }
  646. LoopX = Label(0);
  647. // preload
  648. if (!p->Slices)
  649. {
  650. dyninst* PreLoad1;
  651. dyninst* PreLoad2;
  652. dyninst* PreLoad3;
  653. dyninst* PreLoad4;
  654. int UVAdj = p->SwapXY?0:4;
  655. I3S(ADD,R1,R12,R5,ASR,(p->SrcDoubleX?1:0)-(p->SrcHalfX?1:0));
  656. I2C(ADD,R5,R12,32);
  657. I3(CMP,NONE,R5,R1);
  658. I0P(B,CS,LoopX);
  659. //y0
  660. PreLoad1 = Label(1);
  661. Byte(); I2C(LDR,R6,R5,-32);
  662. I2C(ADD,R5,R5,64);
  663. I3(CMP,NONE,R5,R1);
  664. Byte(); I2C(LDR,R7,R5,-64);
  665. I0P(B,CC,PreLoad1);
  666. I3(SUB,R1,R1,R12);
  667. I3(ADD,R1,R1,R14);
  668. I2C(ADD,R5,R14,32);
  669. //y1
  670. PreLoad2 = Label(1);
  671. Byte(); I2C(LDR,R6,R5,-32);
  672. I2C(ADD,R5,R5,64);
  673. I3(CMP,NONE,R5,R1);
  674. Byte(); I2C(LDR,R7,R5,-64);
  675. I0P(B,CC,PreLoad2);
  676. I3(SUB,R1,R1,R14);
  677. I3S(ADD,R1,R3,R1,ASR,p->SrcUVX2);
  678. I2C(ADD,R5,R3,32);
  679. I3(CMP,NONE,R5,R1);
  680. I0P(B,CS,LoopX);
  681. //u
  682. PreLoad3 = Label(1);
  683. Byte(); I2C(LDR,R6,R5,-32+UVAdj);
  684. I2C(ADD,R5,R5,64);
  685. I3(CMP,NONE,R5,R1);
  686. Byte(); I2C(LDR,R7,R5,-64+UVAdj);
  687. I0P(B,CC,PreLoad3);
  688. I3(SUB,R1,R1,R3);
  689. I3(ADD,R1,R1,R4);
  690. I2C(ADD,R5,R4,32);
  691. //v
  692. PreLoad4 = Label(1);
  693. Byte(); I2C(LDR,R6,R5,-32+UVAdj);
  694. I2C(ADD,R5,R5,64);
  695. I3(CMP,NONE,R5,R1);
  696. Byte(); I2C(LDR,R7,R5,-64+UVAdj);
  697. I0P(B,CC,PreLoad4);
  698. }
  699. else
  700. if (p->ARM5)
  701. {
  702. //preload next
  703. I3S(PLD,NONE,R12,R11,LSL,p->SrcHalfY+1); 
  704. I3S(PLD,NONE,R14,R11,LSL,p->SrcHalfY+1);
  705. I3S(PLD,NONE,R3,R11,ASR,p->SrcUVPitch2);
  706. I3S(PLD,NONE,R4,R11,ASR,p->SrcUVPitch2);
  707. }
  708. EndLine = Label(0);
  709. InstPost(LoopX);
  710. {
  711. if (!HalfMode) Fix_UV(p,0,0,0);
  712. Fix_Y(p,0,0,HalfMode);
  713. Fix_Y(p,1,0,HalfMode);
  714. if (p->SwapXY && (p->FX.Flags & BLITFX_DITHER))
  715. {
  716. I3(CMP,NONE,R9,R8);
  717. I0P(B,EQ,EndLine);
  718. if (!HalfMode) Fix_UV(p,0,0,0);
  719. Fix_Y(p,0,1,HalfMode);
  720. Fix_Y(p,1,1,HalfMode);
  721. }
  722. I3(CMP,NONE,R9,R8);
  723. I0P(B,NE,LoopX);
  724. }
  725. InstPost(EndLine);
  726. I2C(LDR,R5,SP,OFS(stack,SrcNext));
  727. I2C(LDR,R6,SP,OFS(stack,DstNext));
  728. I2C(LDR,R7,SP,OFS(stack,UVNext));
  729. I2C(LDR,R8,SP,OFS(stack,EndOfRect));
  730. //increment pointers
  731. I3(ADD,R12,R12,R5);
  732. I3(ADD,R14,R14,R5);
  733. I3(ADD,R2,R2,R6);
  734. I3(ADD,R9,R9,R6);
  735. I3(ADD,R3,R3,R7);
  736. I3(ADD,R4,R4,R7);
  737. if (!p->SwapXY && (p->FX.Flags & BLITFX_DITHER))
  738. {
  739. //swap WR12 and WR13
  740. I3(WOR,WR3,WR12,WR12);
  741. I3(WOR,WR12,WR13,WR13);
  742. I3(WOR,WR13,WR3,WR3);
  743. }
  744. //prepare registers for next row
  745. I2C(LDR,R5,SP,OFS(stack,Width));
  746. I3(CMP,NONE,R9,R8);
  747. I0P(B,NE,LoopY);
  748. I2C(ADD,SP,SP,OFS(stack,StackFrame));
  749. CodeEnd();
  750. }
  751. #endif