invert_limb.asm
上传用户:qaz666999
上传日期:2022-08-06
资源大小:2570k
文件大小:6k
源码类别:

数学计算

开发平台:

Unix_Linux

  1.  dnl  x86 mpn_invert_limb
  2. dnl  Contributed to the GNU project by Niels Möller
  3. dnl  Copyright 2009 Free Software Foundation, Inc.
  4. dnl
  5. dnl  This file is part of the GNU MP Library.
  6. dnl
  7. dnl  The GNU MP Library is free software; you can redistribute it and/or
  8. dnl  modify it under the terms of the GNU Lesser General Public License as
  9. dnl  published by the Free Software Foundation; either version 3 of the
  10. dnl  License, or (at your option) any later version.
  11. dnl
  12. dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13. dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15. dnl  Lesser General Public License for more details.
  16. dnl
  17. dnl  You should have received a copy of the GNU Lesser General Public License
  18. dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19. include(`../config.m4')
  20. C      cycles (approx) div
  21. C K7:  46 53
  22. C Register usage:
  23. C Input D in %edi
  24. C Current approximation is in %eax and/or %ecx
  25. C %ebx and %edx are temporaries.
  26. C %esi and %ebp are unused.
  27. defframe(PARAM_DIVISOR,4) C Names the divisor argument; mpn_invert_limb loads it into %edi at entry. NOTE(review): defframe is defined outside this file, presumably 4 is the stack offset at entry - confirm.
  28. ASM_START()
  29. C Make approx_tab global to work around Apple relocation bug: when DARWIN is defined the table name becomes MPN(invert_limb_tab) and is exported with GLOBL.
  30. ifdef(`DARWIN',`
  31. define(`approx_tab', MPN(invert_limb_tab))
  32. GLOBL approx_tab')
  33. TEXT
  34. ALIGN(16) C Alignment request for the function entry that follows.
  35. PROLOGUE(mpn_invert_limb) C Builds successive approximations v0 (table lookup), v1, v2, v3 from the divisor d and leaves v3 in %eax at ret. NOTE(review): by its name this is the limb reciprocal of a normalized divisor; the lookup index (d >> 22) - 512 only falls inside the 512-entry table when bit 31 of d is set.
  36. deflit(`FRAME', 0)
  37. C Adding the push of %ebp and the corresponding pop seems to
  38. C reduce running time from 46 to 43 cycles on K7. Don't know
  39. C if this is a benchmark artefact or some alignment issue.
  40. push %ebx FRAME_pushl() C Save %ebx (used as a temporary); restored by the pop before ret.
  41. C push %ebp FRAME_pushl()
  42. push %edi FRAME_pushl() C Save %edi, which holds the divisor for the whole routine.
  43. mov PARAM_DIVISOR, %edi C %edi = d (the divisor)
  44. mov %edi, %eax
  45. shr $22, %eax C %eax = d >> 22, the top 10 bits of d, used as the table index
  46. ifdef(`PIC',`
  47. LEA( approx_tab, %ebx) C %ebx = address of approx_tab (LEA is a macro defined outside this file)
  48. movzwl -1024(%ebx, %eax, 2), %eax C %eax = v0, the zero-extended 16-bit entry at approx_tab + 2*(d >> 22) - 1024, i.e. entry (d >> 22) - 512
  49. ',`
  50. movzwl -1024+approx_tab(%eax, %eax), %eax C %eax = v0, same entry as the PIC branch: approx_tab - 1024 + 2*(d >> 22)
  51. ')
  52. C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1, where d_21 = (d >> 11) + 1
  53. mov %eax, %ecx C %ecx = v0, kept for the shift below
  54. imul %eax, %eax C %eax = v0*v0
  55. mov %edi, %ebx
  56. shr $11, %ebx
  57. inc %ebx C %ebx = d_21 = (d >> 11) + 1
  58. mul %ebx C %edx:%eax = v0*v0*d_21, so %edx is the >> 32 part
  59. mov %edi, %ebx C Prepare d_31 and mask for the v2 step
  60. shr %ebx C %ebx = d >> 1, carry flag = low bit of d
  61. sbb %eax, %eax C %eax = mask: all ones if d is odd, zero if d is even
  62. sub %eax, %ebx C %ebx = d_31 = (d >> 1) + (d & 1), %eax = mask
  63. shl $4, %ecx
  64. dec %ecx
  65. sub %edx, %ecx C %ecx = v1
  66. C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33); the code below works modulo 2^32, so no 2^48 term is added explicitly
  67. imul %ecx, %ebx C %ebx = low 32 bits of v1 * d_31
  68. and %ecx, %eax
  69. shr %eax C %eax = (v1 & mask) >> 1
  70. sub %ebx, %eax C %eax = ((v1 & mask) >> 1) - v1*d_31, modulo 2^32
  71. mul %ecx C %edx = high 32 bits of (that value * v1)
  72. mov %edi, %eax C Prepare for next mul
  73. shl $15, %ecx
  74. shr %edx C One more bit, completing the >> 33
  75. add %edx, %ecx C %ecx = v2
  76. mul %ecx C %edx:%eax = d * v2
  77. add %edi, %eax C Only the carry out of low(d*v2) + d is needed; %eax is overwritten by the next instruction
  78. mov %ecx, %eax C %eax = v2 (mov leaves the carry flag intact for the adc)
  79. adc %edi, %edx C %edx = high(d*v2) + d + carry
  80. sub %edx, %eax C %eax = v3
  81. pop %edi
  82. C pop %ebp
  83. pop %ebx
  84. ret C v3 is in %eax here
  85. EPILOGUE()
  86. DEF_OBJECT(approx_tab,2) C Table of 512 16-bit values (64 rows of 8); mpn_invert_limb reads entry (d >> 22) - 512 as its initial approximation v0. NOTE(review): entries 0, 1, 2 and 511 equal floor((2^24 - 2^14 + 2^9) / i) for i = 512, 513, 514, 1023; presumably every entry follows that formula - verify before regenerating.
  87. .value 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27 C entries 0..7, i.e. d >> 22 = 512..519
  88. .value 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
  89. .value 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
  90. .value 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
  91. .value 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
  92. .value 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
  93. .value 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
  94. .value 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
  95. .value 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
  96. .value 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
  97. .value 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
  98. .value 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
  99. .value 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
  100. .value 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
  101. .value 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
  102. .value 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
  103. .value 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
  104. .value 0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
  105. .value 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
  106. .value 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
  107. .value 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
  108. .value 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
  109. .value 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
  110. .value 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
  111. .value 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
  112. .value 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
  113. .value 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
  114. .value 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
  115. .value 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
  116. .value 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
  117. .value 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
  118. .value 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
  119. .value 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
  120. .value 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
  121. .value 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
  122. .value 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
  123. .value 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
  124. .value 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
  125. .value 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
  126. .value 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
  127. .value 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
  128. .value 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
  129. .value 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
  130. .value 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
  131. .value 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
  132. .value 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
  133. .value 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
  134. .value 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
  135. .value 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
  136. .value 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
  137. .value 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
  138. .value 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
  139. .value 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
  140. .value 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
  141. .value 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
  142. .value 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
  143. .value 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
  144. .value 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
  145. .value 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
  146. .value 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
  147. .value 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
  148. .value 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
  149. .value 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
  150. .value 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000 C entries 504..511, i.e. d >> 22 = 1016..1023
  151. END_OBJECT(approx_tab)