addmul_1.asm
Uploaded by: qaz666999
Upload date: 2022-08-06
Package size: 2570k
File size: 8k
Category: Mathematical computation
Platform: Unix_Linux

dnl  Alpha ev6 nails mpn_addmul_1.

dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:    42
C EV5:    18
C EV6:     4

C TODO
C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
C    umulh.
C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
C    and would work since the loop structure is really regular.

C  INPUT PARAMETERS
define(`rp',`r16')
define(`up',`r17')
define(`n', `r18')
define(`vl0',`r19')

define(`numb_mask',`r6')

define(`m0a',`r0')
define(`m0b',`r1')
define(`m1a',`r2')
define(`m1b',`r3')
define(`m2a',`r20')
define(`m2b',`r21')
define(`m3a',`r22')
define(`m3b',`r23')

define(`acc0',`r25')
define(`acc1',`r27')

define(`ul0',`r4')
define(`ul1',`r5')
define(`ul2',`r4')
define(`ul3',`r5')

define(`rl0',`r24')
define(`rl1',`r24')
define(`rl2',`r24')
define(`rl3',`r24')

define(`t0',`r7')
define(`t1',`r8')

define(`NAIL_BITS',`GMP_NAIL_BITS')
define(`NUMB_BITS',`GMP_NUMB_BITS')

dnl  This declaration is munged by configure
NAILS_SUPPORT(2-63)

ASM_START()
PROLOGUE(mpn_addmul_1)
	sll	vl0, NAIL_BITS, vl0
	lda	numb_mask, -1(r31)
	srl	numb_mask, NAIL_BITS, numb_mask

	and	n, 3, r25
	cmpeq	r25, 1, r21
	bne	r21, L(1m4)
	cmpeq	r25, 2, r21
	bne	r21, L(2m4)
	beq	r25, L(0m4)

L(3m4):	ldq	ul3, 0(up)
	lda	n, -4(n)
	ldq	ul0, 8(up)
	mulq	vl0, ul3, m3a
	umulh	vl0, ul3, m3b
	ldq	ul1, 16(up)
	lda	up, 24(up)
	lda	rp, -8(rp)
	mulq	vl0, ul0, m0a
	umulh	vl0, ul0, m0b
	bge	n, L(ge3)

	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	ldq	rl3, 8(rp)
	srl	m3a,NAIL_BITS, t0
	addq	t0, r31, acc1
	addq	rl3, acc1, acc1
	ldq	rl0, 16(rp)
	srl	m0a,NAIL_BITS, t0
	addq	t0, m3b, acc0
	srl	acc1,NUMB_BITS, t1
	br	r31, L(ta3)

L(ge3):	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	ldq	rl3, 8(rp)
	srl	m3a,NAIL_BITS, t0
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	addq	t0, r31, acc1
	umulh	vl0, ul2, m2b
	addq	rl3, acc1, acc1
	ldq	rl0, 16(rp)
	srl	m0a,NAIL_BITS, t0
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	addq	t0, m3b, acc0
	srl	acc1,NUMB_BITS, t1
	br	r31, L(el3)

L(0m4):	lda	n, -8(n)
	ldq	ul2, 0(up)
	ldq	ul3, 8(up)
	mulq	vl0, ul2, m2a
	umulh	vl0, ul2, m2b
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	umulh	vl0, ul3, m3b
	ldq	ul1, 24(up)
	lda	up, 32(up)
	mulq	vl0, ul0, m0a
	umulh	vl0, ul0, m0b
	bge	n, L(ge4)

	ldq	rl2, 0(rp)
	srl	m2a,NAIL_BITS, t0
	mulq	vl0, ul1, m1a
	addq	t0, r31, acc0
	umulh	vl0, ul1, m1b
	addq	rl2, acc0, acc0
	ldq	rl3, 8(rp)
	srl	m3a,NAIL_BITS, t0
	addq	t0, m2b, acc1
	srl	acc0,NUMB_BITS, t1
	br	r31, L(ta4)

L(ge4):	ldq	rl2, 0(rp)
	srl	m2a,NAIL_BITS, t0
	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	addq	t0, r31, acc0
	umulh	vl0, ul1, m1b
	addq	rl2, acc0, acc0
	ldq	rl3, 8(rp)
	srl	m3a,NAIL_BITS, t0
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	addq	t0, m2b, acc1
	srl	acc0,NUMB_BITS, t1
	br	r31, L(el0)

L(2m4):	lda	n, -4(n)
	ldq	ul0, 0(up)
	ldq	ul1, 8(up)
	lda	up, 16(up)
	lda	rp, -16(rp)
	mulq	vl0, ul0, m0a
	umulh	vl0, ul0, m0b
	bge	n, L(ge2)

	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	ldq	rl0, 16(rp)
	srl	m0a,NAIL_BITS, t0
	addq	t0, r31, acc0
	addq	rl0, acc0, acc0
	ldq	rl1, 24(rp)
	srl	m1a,NAIL_BITS, t0
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	br	r31, L(ta2)

L(ge2):	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	umulh	vl0, ul2, m2b
	ldq	rl0, 16(rp)
	srl	m0a,NAIL_BITS, t0
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	addq	t0, r31, acc0
	umulh	vl0, ul3, m3b
	addq	rl0, acc0, acc0
	ldq	rl1, 24(rp)
	srl	m1a,NAIL_BITS, t0
	ldq	ul1, 24(up)
	lda	up, 32(up)
	lda	rp, 32(rp)
	mulq	vl0, ul0, m0a
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	bge	n, L(el2)

	br	r31, L(ta6)

L(1m4):	lda	n, -4(n)
	ldq	ul1, 0(up)
	lda	up, 8(up)
	lda	rp, -24(rp)
	bge	n, L(ge1)

	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	ldq	rl1, 24(rp)
	srl	m1a,NAIL_BITS, t0
	addq	rl1, t0, acc1
	and	acc1,numb_mask, r28
	srl	acc1,NUMB_BITS, t1
	stq	r28, 24(rp)
	addq	t1, m1b, r0
	ret	r31, (r26), 1

L(ge1):	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	umulh	vl0, ul2, m2b
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	umulh	vl0, ul3, m3b
	ldq	rl1, 24(rp)
	srl	m1a,NAIL_BITS, t0
	ldq	ul1, 24(up)
	lda	up, 32(up)
	lda	rp, 32(rp)
	mulq	vl0, ul0, m0a
	addq	t0, r31, acc1
	umulh	vl0, ul0, m0b
	addq	rl1, acc1, acc1
	ldq	rl2, 0(rp)
	srl	m2a,NAIL_BITS, t0
	mulq	vl0, ul1, m1a
	addq	t0, m1b, acc0
	srl	acc1,NUMB_BITS, t1
	blt	n, L(ta5)

L(ge5):	ldq	ul2, 0(up)
	br	r31, L(el1)

	ALIGN(16)
L(top):	mulq	vl0, ul0, m0a		C U1
	addq	t0, m0b, acc1		C L0
	srl	acc0,NUMB_BITS, t1	C U0
	stq	r28, -24(rp)		C L1
C
L(el2):	umulh	vl0, ul0, m0b		C U1
	and	acc0,numb_mask, r28	C L0
	addq	rl1, acc1, acc1		C U0
	ldq	rl2, 0(rp)		C L1
C
	unop				C U1
	addq	t1, acc1, acc1		C L0
	srl	m2a,NAIL_BITS, t0	C U0
	ldq	ul2, 0(up)		C L1
C
	mulq	vl0, ul1, m1a		C U1
	addq	t0, m1b, acc0		C L0
	srl	acc1,NUMB_BITS, t1	C U0
	stq	r28, -16(rp)		C L1
C
L(el1):	umulh	vl0, ul1, m1b		C U1
	and	acc1,numb_mask, r28	C L0
	addq	rl2, acc0, acc0		C U0
	ldq	rl3, 8(rp)		C L1
C
	lda	n, -4(n)		C L1
	addq	t1, acc0, acc0		C L0
	srl	m3a,NAIL_BITS, t0	C U0
	ldq	ul3, 8(up)		C L1
C
	mulq	vl0, ul2, m2a		C U1
	addq	t0, m2b, acc1		C L0
	srl	acc0,NUMB_BITS, t1	C U0
	stq	r28, -8(rp)		C L1
C
L(el0):	umulh	vl0, ul2, m2b		C U1
	and	acc0,numb_mask, r28	C L0
	addq	rl3, acc1, acc1		C U0
	ldq	rl0, 16(rp)		C L1
C
	unop				C U1
	addq	t1, acc1, acc1		C L0
	srl	m0a,NAIL_BITS, t0	C U0
	ldq	ul0, 16(up)		C L1
C
	mulq	vl0, ul3, m3a		C U1
	addq	t0, m3b, acc0		C L0
	srl	acc1,NUMB_BITS, t1	C U0
	stq	r28, 0(rp)		C L1
C
L(el3):	umulh	vl0, ul3, m3b		C U1
	and	acc1,numb_mask, r28	C L0
	addq	rl0, acc0, acc0		C U0
	ldq	rl1, 24(rp)		C L1
C
	unop				C U1
	addq	t1, acc0, acc0		C L0
	srl	m1a,NAIL_BITS, t0	C U0
	ldq	ul1, 24(up)		C L1
C
	lda	up, 32(up)		C L0
	unop				C U1
	lda	rp, 32(rp)		C L1
	bge	n, L(top)		C U0

L(end):	mulq	vl0, ul0, m0a
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	stq	r28, -24(rp)
L(ta6):	umulh	vl0, ul0, m0b
	and	acc0,numb_mask, r28
	addq	rl1, acc1, acc1
	ldq	rl2, 0(rp)
	addq	t1, acc1, acc1
	srl	m2a,NAIL_BITS, t0
	mulq	vl0, ul1, m1a
	addq	t0, m1b, acc0
	srl	acc1,NUMB_BITS, t1
	stq	r28, -16(rp)
L(ta5):	umulh	vl0, ul1, m1b
	and	acc1,numb_mask, r28
	addq	rl2, acc0, acc0
	ldq	rl3, 8(rp)
	addq	t1, acc0, acc0
	srl	m3a,NAIL_BITS, t0
	addq	t0, m2b, acc1
	srl	acc0,NUMB_BITS, t1
	stq	r28, -8(rp)
	unop

	ALIGN(16)
L(ta4):	and	acc0,numb_mask, r28
	addq	rl3, acc1, acc1
	ldq	rl0, 16(rp)
	addq	t1, acc1, acc1
	srl	m0a,NAIL_BITS, t0
	addq	t0, m3b, acc0
	srl	acc1,NUMB_BITS, t1
	stq	r28, 0(rp)
	unop

	ALIGN(16)
L(ta3):	and	acc1,numb_mask, r28
	addq	rl0, acc0, acc0
	ldq	rl1, 24(rp)
	addq	t1, acc0, acc0
	srl	m1a,NAIL_BITS, t0
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	stq	r28, 8(rp)
	unop

	ALIGN(16)
L(ta2):	and	acc0,numb_mask, r28
	addq	rl1, acc1, acc1
	addq	t1, acc1, acc1
	srl	acc1,NUMB_BITS, t1
	stq	r28, 16(rp)
	and	acc1,numb_mask, r28
	addq	t1, m1b, r0
	stq	r28, 24(rp)
	ret	r31, (r26), 1
EPILOGUE()
ASM_END()
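
For readers unfamiliar with GMP's mpn layer: the routine above is mpn_addmul_1, which multiplies the n-limb operand {up, n} by the single limb vl0, adds the product into {rp, n} in place, and returns the carry limb out of the top. The C sketch below is only an illustrative model of that semantics under nails, written in terms of the same quantities the assembly manipulates (numb_mask, NAIL_BITS, NUMB_BITS). It is not GMP's actual code: the names limb_t and addmul_1_nails_model are hypothetical, and it uses 32-bit limbs so the double-width product fits in uint64_t, whereas the Alpha code works on 64-bit limbs.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical 32-bit limb type, chosen so the double-width product
   up[i] * v fits in a plain uint64_t; the Alpha routine uses 64-bit limbs. */
typedef uint32_t limb_t;

/* Model of mpn_addmul_1 with nails: {rp, n} += {up, n} * v, returning the
   carry out of the top limb.  Each limb holds numb_bits significant bits
   and nail_bits high zero bits.  The caller is assumed to pass
   nail_bits >= 2, mirroring the NAILS_SUPPORT(2-63) declaration above;
   that headroom keeps the three-way addition below within one limb.     */
static limb_t
addmul_1_nails_model (limb_t *rp, const limb_t *up, size_t n,
                      limb_t v, unsigned nail_bits)
{
  unsigned numb_bits = 32 - nail_bits;
  limb_t numb_mask = ((limb_t) 1 << numb_bits) - 1;
  limb_t high = 0;   /* high half of the previous product (umulh result) */
  limb_t carry = 0;  /* running carry, t1 in the assembly                */
  size_t i;

  for (i = 0; i < n; i++)
    {
      uint64_t p = (uint64_t) up[i] * v;        /* mulq + umulh            */
      limb_t lo = (limb_t) p & numb_mask;       /* srl m?a, NAIL_BITS      */
      limb_t acc = lo + high + rp[i] + carry;   /* the acc0/acc1 chain     */
      rp[i] = acc & numb_mask;                  /* and acc, numb_mask; stq */
      carry = acc >> numb_bits;                 /* srl acc, NUMB_BITS      */
      high = (limb_t) (p >> numb_bits);         /* high half for next limb */
    }
  return high + carry;                          /* addq t1, m?b, r0        */
}

With nail_bits of 0 or 1 the sum lo + high + rp[i] + carry could overflow a limb-sized register, which is presumably why the file declares NAILS_SUPPORT(2-63) rather than starting at a smaller nail size.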