rshift.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:5k
源码类别:

数学计算

开发平台:

Unix_Linux

  1. dnl  Intel Pentium mpn_rshift -- mpn right shift.
  2. dnl  Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
  3. dnl  Foundation, Inc.
  4. dnl
  5. dnl  This file is part of the GNU MP Library.
  6. dnl
  7. dnl  The GNU MP Library is free software; you can redistribute it and/or
  8. dnl  modify it under the terms of the GNU Lesser General Public License as
  9. dnl  published by the Free Software Foundation; either version 3 of the
  10. dnl  License, or (at your option) any later version.
  11. dnl
  12. dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13. dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15. dnl  Lesser General Public License for more details.
  16. dnl
  17. dnl  You should have received a copy of the GNU Lesser General Public License
  18. dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19. include(`../config.m4')
  20. C         cycles/limb
  21. C P5,P54:    6.0
  22. C P55:       5.375
  23. C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  24. C                       unsigned shift);
  25. C
  26. C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
  27. C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
  28. defframe(PARAM_SHIFT,16) C 4th argument: shift count
  29. defframe(PARAM_SIZE, 12) C 3rd argument: number of limbs
  30. defframe(PARAM_SRC,  8) C 2nd argument: source pointer
  31. defframe(PARAM_DST,  4) C 1st argument: destination pointer
  C NOTE(review): defframe is defined in the included m4 files; the numbers
  C are presumably byte offsets of the stack arguments at entry -- confirm.
  32. TEXT
  33. ALIGN(8)
  C mpn_rshift: shift the size-limb operand at src right by shift bits and
  C store the size result limbs at dst.  Limbs are 32 bits (4-byte stride).
  C Returns in eax the bits shifted out of the lowest limb, placed at the
  C top of the returned limb.  edi, esi, ebx and ebp are saved on entry and
  C restored before each ret.
  34. PROLOGUE(mpn_rshift)
  35. pushl %edi
  36. pushl %esi
  37. pushl %ebx
  38. pushl %ebp
  C NOTE(review): FRAME presumably tells the PARAM_ macros that 16 bytes
  C have been pushed since entry -- confirm against the m4 definitions.
  39. deflit(`FRAME',16)
  40. movl PARAM_DST,%edi
  41. movl PARAM_SRC,%esi
  42. movl PARAM_SIZE,%ebp
  43. movl PARAM_SHIFT,%ecx
  C Register use from here on:
  C   edi = dst pointer, esi = src pointer, ebp = limb count, cl = shift
  44. C We can use faster code for shift-by-1 under certain conditions.
  45. cmp $1,%ecx
  46. jne L(normal)
  47. leal 4(%edi),%eax
  48. cmpl %esi,%eax
  49. jnc L(special) C jump if res_ptr + 1 >= s_ptr
  50. leal (%edi,%ebp,4),%eax
  51. cmpl %eax,%esi
  52. jnc L(special) C jump if s_ptr >= res_ptr + size
  C General case: walk upward from the lowest-addressed limb using
  C double-register shifts.
  53. L(normal):
  54. movl (%esi),%edx C edx = first source limb
  55. addl $4,%esi
  56. xorl %eax,%eax C start from zero so only the shifted-out bits remain
  57. shrdl( %cl, %edx, %eax) C compute carry limb
  58. pushl %eax C push carry limb onto stack
  59. decl %ebp C size-1 limbs are stored by the loops, the last at the end2 label
  60. pushl %ebp C keep size-1 for the remainder loop
  61. shrl $3,%ebp C ebp = number of 8-limb blocks
  62. jz L(end) C no full block: go straight to the remainder
  63. movl (%edi),%eax C fetch destination cache line
  64. ALIGN(4)
  C Unrolled loop, 8 limbs per pass.  At the top of each pass edx holds the
  C previous source limb, still unshifted; each shrdl combines it with the
  C next limb to form one result limb.  eax, ebx and edx rotate through the
  C roles of pending limb and freshly loaded limb.
  65. L(oop): movl 28(%edi),%eax C fetch destination cache line
  66. movl %edx,%ebx C ebx = pending limb
  67. movl (%esi),%eax C the value loaded above is discarded here
  68. movl 4(%esi),%edx
  69. shrdl( %cl, %eax, %ebx)
  70. shrdl( %cl, %edx, %eax)
  71. movl %ebx,(%edi)
  72. movl %eax,4(%edi)
  73. movl 8(%esi),%ebx
  74. movl 12(%esi),%eax
  75. shrdl( %cl, %ebx, %edx)
  76. shrdl( %cl, %eax, %ebx)
  77. movl %edx,8(%edi)
  78. movl %ebx,12(%edi)
  79. movl 16(%esi),%edx
  80. movl 20(%esi),%ebx
  81. shrdl( %cl, %edx, %eax)
  82. shrdl( %cl, %ebx, %edx)
  83. movl %eax,16(%edi)
  84. movl %edx,20(%edi)
  85. movl 24(%esi),%eax
  86. movl 28(%esi),%edx C becomes the pending limb for the next pass
  87. shrdl( %cl, %eax, %ebx)
  88. shrdl( %cl, %edx, %eax)
  89. movl %ebx,24(%edi)
  90. movl %eax,28(%edi)
  91. addl $32,%esi C advance both pointers by 8 limbs
  92. addl $32,%edi
  93. decl %ebp
  94. jnz L(oop)
  95. L(end): popl %ebp C ebp = size-1
  96. andl $7,%ebp C limbs left over after the 8-limb blocks
  97. jz L(end2)
  C Remainder loop, one limb per pass; edx is again the pending limb.
  98. L(oop2):
  99. movl (%esi),%eax
  100. shrdl( %cl,%eax,%edx) C compute result limb
  101. movl %edx,(%edi)
  102. movl %eax,%edx C this limb is the pending one for the next pass
  103. addl $4,%esi
  104. addl $4,%edi
  105. decl %ebp
  106. jnz L(oop2)
  107. L(end2):
  108. shrl %cl,%edx C compute most significant limb
  109. movl %edx,(%edi) C store it
  110. popl %eax C pop carry limb
  111. popl %ebp C restore the registers saved at entry
  112. popl %ebx
  113. popl %esi
  114. popl %edi
  115. ret
  116. C Shift-by-1 case.  We loop from the most significant end of the arrays,
  117. C which is only permissible when one of the two pointer checks above holds,
  118. C since the function is documented to work for overlapping source and destination.
  C Each limb is rotated right through the carry flag, so the bit dropped
  C from one limb becomes the top bit of the next lower limb.  The carry
  C has to stay live between those rotates, which is why pointer updates
  C use leal and counters use incl/decl.
  119. L(special):
  120. leal -4(%edi,%ebp,4),%edi C edi -> last destination limb
  121. leal -4(%esi,%ebp,4),%esi C esi -> last source limb
  122. movl (%esi),%edx C edx = top source limb
  123. subl $4,%esi
  124. decl %ebp C size-1 limbs are stored by the loops, the last at the L1 label
  125. pushl %ebp
  126. shrl $3,%ebp C ebp = number of 8-limb blocks
  127. shrl %edx C top limb >> 1, its low bit goes to carry
  128. incl %ebp C inc/dec pair re-tests ebp for zero
  129. decl %ebp C without disturbing the carry
  130. jz L(Lend)
  131. movl (%edi),%eax C fetch destination cache line
  132. ALIGN(4)
  C Unrolled loop, 8 limbs per pass, moving downward.  At the top of each
  C pass edx holds an already shifted limb waiting to be stored, and the
  C carry holds the bit that was shifted out of it.
  133. L(Loop):
  134. movl -28(%edi),%eax C fetch destination cache line
  135. movl %edx,%ebx C ebx = limb waiting to be stored
  136. movl (%esi),%eax C the value loaded above is discarded here
  137. movl -4(%esi),%edx
  138. rcrl %eax
  139. movl %ebx,(%edi)
  140. rcrl %edx
  141. movl %eax,-4(%edi)
  142. movl -8(%esi),%ebx
  143. movl -12(%esi),%eax
  144. rcrl %ebx
  145. movl %edx,-8(%edi)
  146. rcrl %eax
  147. movl %ebx,-12(%edi)
  148. movl -16(%esi),%edx
  149. movl -20(%esi),%ebx
  150. rcrl %edx
  151. movl %eax,-16(%edi)
  152. rcrl %ebx
  153. movl %edx,-20(%edi)
  154. movl -24(%esi),%eax
  155. movl -28(%esi),%edx
  156. rcrl %eax
  157. movl %ebx,-24(%edi)
  158. rcrl %edx C edx is stored by the next pass or a later block
  159. movl %eax,-28(%edi)
  160. leal -32(%esi),%esi C use leal not to clobber carry
  161. leal -32(%edi),%edi
  162. decl %ebp C decl leaves the carry alone
  163. jnz L(Loop)
  164. L(Lend):
  165. popl %ebp C ebp = size-1
  166. sbbl %eax,%eax C save carry in %eax
  167. andl $7,%ebp C limbs left over; this andl overwrites the carry
  168. jz L(Lend2)
  169. addl %eax,%eax C restore carry from eax
  C Remainder loop, one limb per pass, same invariant as the loop above.
  170. L(Loop2):
  171. movl %edx,%ebx
  172. movl (%esi),%edx
  173. rcrl %edx
  174. movl %ebx,(%edi)
  175. leal -4(%esi),%esi C use leal not to clobber carry
  176. leal -4(%edi),%edi
  177. decl %ebp
  178. jnz L(Loop2)
  179. jmp L(L1) C carry is already live, skip the restore
  180. L(Lend2):
  181. addl %eax,%eax C restore carry from eax
  182. L(L1): movl %edx,(%edi) C store last limb
  183. movl $0,%eax C movl rather than xorl, so the carry survives
  184. rcrl %eax C return value: final carry moved into the top bit
  185. popl %ebp C restore the registers saved at entry
  186. popl %ebx
  187. popl %esi
  188. popl %edi
  189. ret
  190. EPILOGUE()