submul_1.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:8k
源码类别: 数学计算
开发平台: Unix_Linux

  1. dnl  Alpha ev6 nails mpn_submul_1.
  2. dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
  3. dnl
  4. dnl  This file is part of the GNU MP Library.
  5. dnl
  6. dnl  The GNU MP Library is free software; you can redistribute it and/or
  7. dnl  modify it under the terms of the GNU Lesser General Public License as
  8. dnl  published by the Free Software Foundation; either version 3 of the
  9. dnl  License, or (at your option) any later version.
  10. dnl
  11. dnl  The GNU MP Library is distributed in the hope that it will be useful,
  12. dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14. dnl  Lesser General Public License for more details.
  15. dnl
  16. dnl  You should have received a copy of the GNU Lesser General Public License
  17. dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  18. include(`../config.m4')
  19. C      cycles/limb
  20. C EV4:    42
  21. C EV5:    18
  22. C EV6:     4
  23. C TODO
  24. C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
  25. C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
  26. C    umulh.
  27. C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
  28. C    and would work since the loop structure is really regular.
  29. C  INPUT PARAMETERS
  C Argument registers used by the function body: destination pointer,
  C source pointer, limb count and the single multiplier limb.
  30. define(`rp',`r16')
  31. define(`up',`r17')
  32. define(`n', `r18')
  33. define(`vl0',`r19')
  C Mask register.  The function entry sets it to all ones shifted right
  C logically by NAIL_BITS and uses it to trim every limb before storing.
  34. define(`numb_mask',`r6')
  C Product registers for the four limb slots of the unrolled loop.  For
  C slot N the body writes the mulq result to mNa and the umulh result to
  C mNb.
  35. define(`m0a',`r0')
  36. define(`m0b',`r1')
  37. define(`m1a',`r2')
  38. define(`m1b',`r3')
  39. define(`m2a',`r20')
  40. define(`m2b',`r21')
  41. define(`m3a',`r22')
  42. define(`m3b',`r23')
  C Two accumulators that the body uses alternately for consecutive limbs.
  43. define(`acc0',`r25')
  44. define(`acc1',`r27')
  C Source limb registers.  Only two physical registers are used: ul0 and
  C ul2 are both r4, ul1 and ul3 are both r5, so a loaded limb must be
  C consumed before the slot two positions later is loaded.
  45. define(`ul0',`r4')
  46. define(`ul1',`r5')
  47. define(`ul2',`r4')
  48. define(`ul3',`r5')
  C Destination limb register.  All four names map to the same r24.
  49. define(`rl0',`r24')
  50. define(`rl1',`r24')
  51. define(`rl2',`r24')
  52. define(`rl3',`r24')
  C Temporaries.  In the body t0 receives a low product part shifted down
  C by NAIL_BITS and t1 receives an accumulator shifted right arithmetically
  C by NUMB_BITS.
  53. define(`t0',`r7')
  54. define(`t1',`r8')
  C Short aliases for the two GMP bit-count macros, which are not defined
  C in this file.
  55. define(`NAIL_BITS',`GMP_NAIL_BITS')
  56. define(`NUMB_BITS',`GMP_NUMB_BITS')
  C NOTE(review): the declaration below presumably states the range of nail
  C bit counts this code accepts - confirm against the configure script
  C before editing it.
  57. dnl  This declaration is munged by configure
  58. NAILS_SUPPORT(2-63)
  59. ASM_START()
  C mpn_submul_1, Alpha EV6 version for limbs with nail bits.
  C
  C Registers, per the defines at the top of the file: rp = r16, up = r17,
  C n = r18, vl0 = r19.  The code reads n limbs from up and n limbs from rp
  C and writes n limbs back to rp.
  C
  C Per limb i, with V = vl0 << NAIL_BITS:
  C   lo    = mulq V, up[i]  shifted right logically by NAIL_BITS
  C   hi    = umulh V, up[i]
  C   acc   = rp[i] - (lo + hi of limb i-1) + carry
  C   rp[i] = acc & numb_mask
  C   carry = acc shifted right arithmetically by NUMB_BITS, sign kept
  C The first limb uses zero for the previous hi and has no carry added.
  C The value left in r0 at the final ret is hi of the last limb minus the
  C last carry.
  C
  C Layout: the main loop at L(top) processes four limbs per pass.  The
  C entry code dispatches on n mod 4 to one of four feed-in sequences.  Each
  C feed-in starts the first products and then either jumps into the loop at
  C L(el0) .. L(el3) or, when the operand is too short, into the wind-down
  C code at L(ta2) .. L(ta6).  A one-limb operand is finished inside L(1m4).
  C The feed-ins bias rp by 0, -8, -16 or -24 so that the fixed store
  C offsets used by the shared code address the right limbs.
  C
  C NOTE(review): no test for n == 0 is visible.  With n == 0 the dispatch
  C goes to L(0m4), which loads four limbs unconditionally, so callers
  C presumably must pass n >= 1 - confirm.
  60. PROLOGUE(mpn_submul_1)
  C Pre-shift the multiplier by NAIL_BITS and build numb_mask as all ones
  C shifted right by NAIL_BITS.  r31 reads as zero, so the lda yields -1.
  61. sll vl0, NAIL_BITS, vl0
  62. lda numb_mask, -1(r31)
  63. srl numb_mask, NAIL_BITS, numb_mask
  C Dispatch on n mod 4: 1 goes to L(1m4), 2 to L(2m4), 0 to L(0m4) and 3
  C falls through to L(3m4).  r25 and r21 are acc0 and m2b, which hold no
  C live value yet.
  64. and n, 3, r25
  65. cmpeq r25, 1, r21
  66. bne r21, L(1m4)
  67. cmpeq r25, 2, r21
  68. bne r21, L(2m4)
  69. beq r25, L(0m4)
  C Feed-in for n mod 4 == 3.  Loads three source limbs, advances up by 24
  C and biases rp by -8.  If n - 4 is negative the operand has exactly three
  C limbs and the code continues at L(ta3), otherwise at L(ge3).
  70. L(3m4): ldq ul3, 0(up)
  71. lda n, -4(n)
  72. ldq ul0, 8(up)
  73. mulq vl0, ul3, m3a
  74. umulh vl0, ul3, m3b
  75. ldq ul1, 16(up)
  76. lda up, 24(up)
  77. lda rp, -8(rp)
  78. mulq vl0, ul0, m0a
  79. umulh vl0, ul0, m0b
  80. bge n, L(ge3)
  C Three limbs only.  The addq with r31 copies t0, since the first limb has
  C no previous high part to add.
  81. mulq vl0, ul1, m1a
  82. umulh vl0, ul1, m1b
  83. ldq rl3, 8(rp)
  84. srl m3a,NAIL_BITS, t0
  85. addq t0, r31, acc1
  86. subq rl3, acc1, acc1
  87. ldq rl0, 16(rp)
  88. srl m0a,NAIL_BITS, t0
  89. addq t0, m3b, acc0
  90. sra acc1,NUMB_BITS, t1
  91. br r31, L(ta3)
  C Seven or more limbs with n mod 4 == 3.  Same start as above plus the
  C loads and multiplies for the following limbs, then enter the loop at
  C L(el3).
  92. L(ge3): ldq ul2, 0(up)
  93. mulq vl0, ul1, m1a
  94. umulh vl0, ul1, m1b
  95. ldq rl3, 8(rp)
  96. srl m3a,NAIL_BITS, t0
  97. ldq ul3, 8(up)
  98. lda n, -4(n)
  99. mulq vl0, ul2, m2a
  100. addq t0, r31, acc1
  101. umulh vl0, ul2, m2b
  102. subq rl3, acc1, acc1
  103. ldq rl0, 16(rp)
  104. srl m0a,NAIL_BITS, t0
  105. ldq ul0, 16(up)
  106. mulq vl0, ul3, m3a
  107. addq t0, m3b, acc0
  108. sra acc1,NUMB_BITS, t1
  109. br r31, L(el3)
  C Feed-in for n mod 4 == 0.  Subtracts 8 from n, loads four source limbs
  C and advances up by 32.  rp is not biased here.  A negative n means
  C exactly four limbs and leads to L(ta4), otherwise L(ge4).
  110. L(0m4): lda n, -8(n)
  111. ldq ul2, 0(up)
  112. ldq ul3, 8(up)
  113. mulq vl0, ul2, m2a
  114. umulh vl0, ul2, m2b
  115. ldq ul0, 16(up)
  116. mulq vl0, ul3, m3a
  117. umulh vl0, ul3, m3b
  118. ldq ul1, 24(up)
  119. lda up, 32(up)
  120. mulq vl0, ul0, m0a
  121. umulh vl0, ul0, m0b
  122. bge n, L(ge4)
  C Four limbs only.
  123. ldq rl2, 0(rp)
  124. srl m2a,NAIL_BITS, t0
  125. mulq vl0, ul1, m1a
  126. addq t0, r31, acc0
  127. umulh vl0, ul1, m1b
  128. subq rl2, acc0, acc0
  129. ldq rl3, 8(rp)
  130. srl m3a,NAIL_BITS, t0
  131. addq t0, m2b, acc1
  132. sra acc0,NUMB_BITS, t1
  133. br r31, L(ta4)
  C Eight or more limbs with n mod 4 == 0.  Enters the loop at L(el0).
  134. L(ge4): ldq rl2, 0(rp)
  135. srl m2a,NAIL_BITS, t0
  136. ldq ul2, 0(up)
  137. mulq vl0, ul1, m1a
  138. addq t0, r31, acc0
  139. umulh vl0, ul1, m1b
  140. subq rl2, acc0, acc0
  141. ldq rl3, 8(rp)
  142. srl m3a,NAIL_BITS, t0
  143. ldq ul3, 8(up)
  144. lda n, -4(n)
  145. mulq vl0, ul2, m2a
  146. addq t0, m2b, acc1
  147. sra acc0,NUMB_BITS, t1
  148. br r31, L(el0)
  C Feed-in for n mod 4 == 2.  Loads two source limbs, advances up by 16
  C and biases rp by -16.  A negative n - 4 means exactly two limbs and
  C leads to L(ta2), otherwise L(ge2).
  149. L(2m4): lda n, -4(n)
  150. ldq ul0, 0(up)
  151. ldq ul1, 8(up)
  152. lda up, 16(up)
  153. lda rp, -16(rp)
  154. mulq vl0, ul0, m0a
  155. umulh vl0, ul0, m0b
  156. bge n, L(ge2)
  C Two limbs only.
  157. mulq vl0, ul1, m1a
  158. umulh vl0, ul1, m1b
  159. ldq rl0, 16(rp)
  160. srl m0a,NAIL_BITS, t0
  161. addq t0, r31, acc0
  162. subq rl0, acc0, acc0
  163. ldq rl1, 24(rp)
  164. srl m1a,NAIL_BITS, t0
  165. addq t0, m0b, acc1
  166. sra acc0,NUMB_BITS, t1
  167. br r31, L(ta2)
  C Six or more limbs with n mod 4 == 2.  After the second decrement of n a
  C non-negative value enters the loop at L(el2), a negative one means
  C exactly six limbs and goes to L(ta6).  up and rp both advance by 32.
  168. L(ge2): ldq ul2, 0(up)
  169. mulq vl0, ul1, m1a
  170. umulh vl0, ul1, m1b
  171. ldq ul3, 8(up)
  172. lda n, -4(n)
  173. mulq vl0, ul2, m2a
  174. umulh vl0, ul2, m2b
  175. ldq rl0, 16(rp)
  176. srl m0a,NAIL_BITS, t0
  177. ldq ul0, 16(up)
  178. mulq vl0, ul3, m3a
  179. addq t0, r31, acc0
  180. umulh vl0, ul3, m3b
  181. subq rl0, acc0, acc0
  182. ldq rl1, 24(rp)
  183. srl m1a,NAIL_BITS, t0
  184. ldq ul1, 24(up)
  185. lda up, 32(up)
  186. lda rp, 32(rp)
  187. mulq vl0, ul0, m0a
  188. addq t0, m0b, acc1
  189. sra acc0,NUMB_BITS, t1
  190. bge n, L(el2)
  191. br r31, L(ta6)
  C Feed-in for n mod 4 == 1.  Loads one source limb, advances up by 8 and
  C biases rp by -24.  A non-negative n - 4 goes to L(ge1).
  192. L(1m4): lda n, -4(n)
  193. ldq ul1, 0(up)
  194. lda up, 8(up)
  195. lda rp, -24(rp)
  196. bge n, L(ge1)
  C One limb only, completed here: subtract the low product part from the
  C destination limb, store the masked result, and leave the high product
  C part minus the carry in r0 before returning through r26.
  197. mulq vl0, ul1, m1a
  198. umulh vl0, ul1, m1b
  199. ldq rl1, 24(rp)
  200. srl m1a,NAIL_BITS, t0
  201. subq rl1, t0, acc1
  202. and acc1,numb_mask, r28
  203. sra acc1,NUMB_BITS, t1
  204. stq r28, 24(rp)
  205. subq m1b, t1, r0
  206. ret r31, (r26), 1
  C Five or more limbs with n mod 4 == 1.  After the second decrement of n
  C a negative value means exactly five limbs and goes to L(ta5), otherwise
  C L(ge5) loads one more limb and enters the loop at L(el1).
  207. L(ge1): ldq ul2, 0(up)
  208. mulq vl0, ul1, m1a
  209. umulh vl0, ul1, m1b
  210. ldq ul3, 8(up)
  211. lda n, -4(n)
  212. mulq vl0, ul2, m2a
  213. umulh vl0, ul2, m2b
  214. ldq ul0, 16(up)
  215. mulq vl0, ul3, m3a
  216. umulh vl0, ul3, m3b
  217. ldq rl1, 24(rp)
  218. srl m1a,NAIL_BITS, t0
  219. ldq ul1, 24(up)
  220. lda up, 32(up)
  221. lda rp, 32(rp)
  222. mulq vl0, ul0, m0a
  223. addq t0, r31, acc1
  224. umulh vl0, ul0, m0b
  225. subq rl1, acc1, acc1
  226. ldq rl2, 0(rp)
  227. srl m2a,NAIL_BITS, t0
  228. mulq vl0, ul1, m1a
  229. addq t0, m1b, acc0
  230. sra acc1,NUMB_BITS, t1
  231. blt n, L(ta5)
  232. L(ge5): ldq ul2, 0(up)
  233. br r31, L(el1)
  C Main loop, four limbs per pass.  The work for one limb is spread over
  C several groups: srl of the low product into t0, addq of the previous
  C high part, subq from the loaded destination limb, addq of the carry t1,
  C sra to form the next carry, and with numb_mask into r28, then stq of r28.
  C n drops by 4 once per pass after L(el1), up and rp advance by 32 at the
  C bottom, and the loop repeats while n is non-negative.
  C NOTE(review): the trailing U0, U1, L0, L1 tags are presumably the
  C intended EV6 issue slots - confirm against the EV6 documentation.
  234. ALIGN(16)
  235. L(top): mulq vl0, ul0, m0a C U1
  236. addq t0, m0b, acc1 C L0
  237. sra acc0,NUMB_BITS, t1 C U0
  238. stq r28, -24(rp) C L1
  239. C
  240. L(el2): umulh vl0, ul0, m0b C U1
  241. and acc0,numb_mask, r28 C L0
  242. subq rl1, acc1, acc1 C U0
  243. ldq rl2, 0(rp) C L1
  244. C
  245. unop C U1
  246. addq t1, acc1, acc1 C L0
  247. srl m2a,NAIL_BITS, t0 C U0
  248. ldq ul2, 0(up) C L1
  249. C
  250. mulq vl0, ul1, m1a C U1
  251. addq t0, m1b, acc0 C L0
  252. sra acc1,NUMB_BITS, t1 C U0
  253. stq r28, -16(rp) C L1
  254. C
  255. L(el1): umulh vl0, ul1, m1b C U1
  256. and acc1,numb_mask, r28 C L0
  257. subq rl2, acc0, acc0 C U0
  258. ldq rl3, 8(rp) C L1
  259. C
  260. lda n, -4(n) C L1
  261. addq t1, acc0, acc0 C L0
  262. srl m3a,NAIL_BITS, t0 C U0
  263. ldq ul3, 8(up) C L1
  264. C
  265. mulq vl0, ul2, m2a C U1
  266. addq t0, m2b, acc1 C L0
  267. sra acc0,NUMB_BITS, t1 C U0
  268. stq r28, -8(rp) C L1
  269. C
  270. L(el0): umulh vl0, ul2, m2b C U1
  271. and acc0,numb_mask, r28 C L0
  272. subq rl3, acc1, acc1 C U0
  273. ldq rl0, 16(rp) C L1
  274. C
  275. unop C U1
  276. addq t1, acc1, acc1 C L0
  277. srl m0a,NAIL_BITS, t0 C U0
  278. ldq ul0, 16(up) C L1
  279. C
  280. mulq vl0, ul3, m3a C U1
  281. addq t0, m3b, acc0 C L0
  282. sra acc1,NUMB_BITS, t1 C U0
  283. stq r28, 0(rp) C L1
  284. C
  285. L(el3): umulh vl0, ul3, m3b C U1
  286. and acc1,numb_mask, r28 C L0
  287. subq rl0, acc0, acc0 C U0
  288. ldq rl1, 24(rp) C L1
  289. C
  290. unop C U1
  291. addq t1, acc0, acc0 C L0
  292. srl m1a,NAIL_BITS, t0 C U0
  293. ldq ul1, 24(up) C L1
  294. C
  295. lda up, 32(up) C L0
  296. unop C U1
  297. lda rp, 32(rp) C L1
  298. bge n, L(top) C U0
  C Wind-down.  Reached by falling out of the loop or directly from a
  C feed-in path.  It repeats the per-limb sequence of the loop without
  C further loads from up and with fewer multiplies at each stage, falling
  C through from L(end) down to the final ret.  The number in each L(taN)
  C label equals the operand length for which a feed-in branches there.
  299. L(end): mulq vl0, ul0, m0a
  300. addq t0, m0b, acc1
  301. sra acc0,NUMB_BITS, t1
  302. stq r28, -24(rp)
  303. L(ta6): umulh vl0, ul0, m0b
  304. and acc0,numb_mask, r28
  305. subq rl1, acc1, acc1
  306. ldq rl2, 0(rp)
  307. addq t1, acc1, acc1
  308. srl m2a,NAIL_BITS, t0
  309. mulq vl0, ul1, m1a
  310. addq t0, m1b, acc0
  311. sra acc1,NUMB_BITS, t1
  312. stq r28, -16(rp)
  313. L(ta5): umulh vl0, ul1, m1b
  314. and acc1,numb_mask, r28
  315. subq rl2, acc0, acc0
  316. ldq rl3, 8(rp)
  317. addq t1, acc0, acc0
  318. srl m3a,NAIL_BITS, t0
  319. addq t0, m2b, acc1
  320. sra acc0,NUMB_BITS, t1
  321. stq r28, -8(rp)
  322. unop
  323. ALIGN(16)
  324. L(ta4): and acc0,numb_mask, r28
  325. subq rl3, acc1, acc1
  326. ldq rl0, 16(rp)
  327. addq t1, acc1, acc1
  328. srl m0a,NAIL_BITS, t0
  329. addq t0, m3b, acc0
  330. sra acc1,NUMB_BITS, t1
  331. stq r28, 0(rp)
  332. unop
  333. ALIGN(16)
  334. L(ta3): and acc1,numb_mask, r28
  335. subq rl0, acc0, acc0
  336. ldq rl1, 24(rp)
  337. addq t1, acc0, acc0
  338. srl m1a,NAIL_BITS, t0
  339. addq t0, m0b, acc1
  340. sra acc0,NUMB_BITS, t1
  341. stq r28, 8(rp)
  342. unop
  343. ALIGN(16)
  C Last two stores.  The final limb is masked and written to 24(rp), and
  C r0 receives the last high product part m1b minus the last carry t1
  C before the return through r26.
  344. L(ta2): and acc0,numb_mask, r28
  345. subq rl1, acc1, acc1
  346. addq t1, acc1, acc1
  347. sra acc1,NUMB_BITS, t1
  348. stq r28, 16(rp)
  349. and acc1,numb_mask, r28
  350. subq m1b, t1, r0
  351. stq r28, 24(rp)
  352. ret r31, (r26), 1
  353. EPILOGUE()
  354. ASM_END()