invert_limb.asm
Uploaded by: qaz666999
Upload date: 2022-08-06
Archive size: 2570k
File size: 4k
Source category: Mathematical computation
Development platform: Unix_Linux

dnl  AMD64 mpn_invert_limb -- Invert a normalized limb.
dnl  Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
dnl  Copyright 2004, 2007, 2008, 2009 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
include(`../config.m4')

C                     cycles/limb (approx)   div
C K8,K9:                   48                 71
C K10:                     48                 77
C P4:                     135                161
C P6 core2:                69                116
C P6 corei7:               55                 89
C P6 atom:                129                191

C Registers used: rax rcx rdx rdi rsi r8
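For reference, mpn_invert_limb(d) takes a normalized divisor d (bit 63 set) and returns the reciprocal limb v = floor((2^128 - 1)/d) - 2^64 that GMP's division code needs. The three-column comments on the instructions below (Kn, C2, Ci) appear to track approximate issue cycles on K8/K9, Core 2 and Core i7. A minimal C sketch of the function's contract, assuming a compiler with unsigned __int128 (the name invert_limb_ref is ours, not GMP's):

#include <stdint.h>

/* Reference semantics of mpn_invert_limb: for d with bit 63 set,
   return floor((2^128 - 1) / d) - 2^64.  The quotient lies in
   [2^64, 2^65), so truncating to 64 bits subtracts exactly 2^64.  */
static uint64_t
invert_limb_ref (uint64_t d)
{
  return (uint64_t) (~(unsigned __int128) 0 / d);
}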
ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_invert_limb)               C Kn C2 Ci
        mov     %rdi, %rax              C  0  0  0
        shr     $55, %rax               C  1  1  1
ifdef(`PIC',`
ifdef(`DARWIN',`
        mov     approx_tab@GOTPCREL(%rip), %r8
        add     $-512, %r8
',`
        lea     -512+approx_tab(%rip), %r8
')',`
        movabs  $-512+approx_tab, %r8
')
        movzwl  (%r8,%rax,2), R32(%rcx) C %rcx = v0
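The lookup above fetches the 11-bit seed v0 = approx_tab[(d >> 55) - 256]; the -512 folded into %r8 accounts for the index bias of 256 entries of 2 bytes each. The entries match floor((2^19 - 3*2^8)/d9), the seed formula from Möller and Granlund's division paper, so a C sketch can compute the same value in closed form (our naming, same normalization assumption):

#include <stdint.h>

/* Step 1: 11-bit seed from the top 9 bits of d.  The closed form
   reproduces approx_tab[d9 - 256] exactly.  */
static uint64_t
v0_seed (uint64_t d)
{
  uint64_t d9 = d >> 55;                   /* 256 <= d9 <= 511 */
  return ((1 << 19) - 3 * (1 << 8)) / d9;  /* == approx_tab lookup */
}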
C v1 = (v0 << 11) - ((v0*v0*d40) >> 40) - 1
        mov     %rdi, %rsi              C  0  0  0
        mov     R32(%rcx), R32(%rax)    C  4  5  5
        imul    R32(%rcx), R32(%rcx)    C  4  5  5
        shr     $24, %rsi               C  1  1  1
        inc     %rsi                    C %rsi = d40
        imul    %rsi, %rcx              C  8 10  8
        shr     $40, %rcx               C 12 15 11
        sal     $11, R32(%rax)          C  5  6  6
        dec     R32(%rax)
        sub     R32(%rcx), R32(%rax)    C %rax = v1
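In C, this first Newton step reads as follows, a sketch under the same assumptions, with d40 = (d >> 24) + 1 as the inc above computes:

#include <stdint.h>

/* Step 2: one Newton iteration lifts the ~11-bit seed to ~21 bits.
   v0 < 2^11 and d40 <= 2^40, so v0*v0*d40 < 2^62 fits in 64 bits.  */
static uint64_t
v1_step (uint64_t d, uint64_t v0)
{
  uint64_t d40 = (d >> 24) + 1;   /* d truncated to 40 bits, rounded up */
  return (v0 << 11) - ((v0 * v0 * d40) >> 40) - 1;
}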
C v2 = (v1 << 13) + ((v1 * (2^60 - v1*d40)) >> 47)
        mov     $0x1000000000000000, %rcx
        imul    %rax, %rsi              C 14 17 13
        sub     %rsi, %rcx
        imul    %rax, %rcx
        sal     $13, %rax
        shr     $47, %rcx
        add     %rax, %rcx              C %rcx = v2
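The second step is analogous; the error term 2^60 - v1*d40 is a small positive quantity, so the whole computation stays within plain 64-bit arithmetic, matching the 64-bit imul above (sketch, our naming):

#include <stdint.h>

/* Step 3: lift v1 (~21 bits) to v2 (~33 bits).  2^60 - v1*d40 is
   positive and small enough that v1 * e stays below 2^64.  */
static uint64_t
v2_step (uint64_t d, uint64_t v1)
{
  uint64_t d40 = (d >> 24) + 1;
  uint64_t e = (UINT64_C(1) << 60) - v1 * d40;
  return (v1 << 13) + ((v1 * e) >> 47);
}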
C v3 = (v2 << 31) + ((v2 * (2^96 - v2 * d63 + ((v2 >> 1) & mask))) >> 65)
        mov     %rdi, %rsi              C  0  0  0
        shr     $1, %rsi                C d/2
        sbb     %rax, %rax              C -d0 = -(d mod 2)
        sub     %rax, %rsi              C d63 = ceil(d/2)
        imul    %rcx, %rsi              C v2 * d63
        and     %rcx, %rax              C v2 * d0
        shr     $1, %rax                C (v2 >> 1) * d0
        sub     %rsi, %rax              C (v2 >> 1) * d0 - v2 * d63
        mul     %rcx
        sal     $31, %rcx
        shr     $1, %rdx
        add     %rdx, %rcx              C %rcx = v3
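The third step needs the high half of a 64x64-bit product, supplied by mul above and by unsigned __int128 in this sketch. Note that v2 << 31 may wrap mod 2^64, exactly as sal does; the wrap cancels in the final sum, since v3 itself fits in 64 bits:

#include <stdint.h>

/* Step 4: lift v2 (~33 bits) to the full 64-bit v3.  mask = -(d & 1);
   64-bit arithmetic is mod 2^64, so the comment's 2^96 term vanishes
   from the low limb of e.  */
static uint64_t
v3_step (uint64_t d, uint64_t v2)
{
  uint64_t d0  = d & 1;
  uint64_t d63 = (d >> 1) + d0;                      /* ceil(d/2) */
  uint64_t e   = ((v2 >> 1) & (0 - d0)) - v2 * d63;  /* mod 2^64 */
  unsigned __int128 p = (unsigned __int128) v2 * e;
  return (v2 << 31) + (uint64_t) (p >> 65);
}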
C Final adjustment: v = v3 - floor(d * (2^64 + v3 + 1) / 2^64)
        mov     %rdi, %rax
        mul     %rcx                    C d * v3
        add     %rdi, %rax
        mov     %rcx, %rax
        adc     %rdi, %rdx              C %rdx = high limb of d*(v3+1) + d
        sub     %rdx, %rax              C %rax = v
        ret
EPILOGUE()
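The tail applies a final one-ulp correction: it subtracts from v3 the high limb of d * (2^64 + v3 + 1). A C transcription of that tail (sketch, same __int128 assumption), followed by the identity the whole chain should satisfy:

#include <stdint.h>

/* Final adjustment, mirroring the mul/add/adc/sub tail above:
   v = v3 - floor(d * (2^64 + v3 + 1) / 2^64), all mod 2^64,
   matching the dropped carry of adc.  */
static uint64_t
v4_step (uint64_t d, uint64_t v3)
{
  unsigned __int128 p = (unsigned __int128) d * v3 + d;  /* d * (v3 + 1) */
  uint64_t hi = (uint64_t) (p >> 64) + d;                /* + d from the 2^64 term */
  return v3 - hi;
}

/* Chaining the step sketches should reproduce the reference value,
   e.g. for d = 0x8000000000000000 (where v = 2^64 - 1):
   v4_step (d, v3_step (d, v2_step (d, v1_step (d, v0_seed (d)))))
     == invert_limb_ref (d)  */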
        RODATA
        ALIGN(2)
approx_tab:
        .value  0x7fd,0x7f5,0x7ed,0x7e5,0x7dd,0x7d5,0x7ce,0x7c6
        .value  0x7bf,0x7b7,0x7b0,0x7a8,0x7a1,0x79a,0x792,0x78b
        .value  0x784,0x77d,0x776,0x76f,0x768,0x761,0x75b,0x754
        .value  0x74d,0x747,0x740,0x739,0x733,0x72c,0x726,0x720
        .value  0x719,0x713,0x70d,0x707,0x700,0x6fa,0x6f4,0x6ee
        .value  0x6e8,0x6e2,0x6dc,0x6d6,0x6d1,0x6cb,0x6c5,0x6bf
        .value  0x6ba,0x6b4,0x6ae,0x6a9,0x6a3,0x69e,0x698,0x693
        .value  0x68d,0x688,0x683,0x67d,0x678,0x673,0x66e,0x669
        .value  0x664,0x65e,0x659,0x654,0x64f,0x64a,0x645,0x640
        .value  0x63c,0x637,0x632,0x62d,0x628,0x624,0x61f,0x61a
        .value  0x616,0x611,0x60c,0x608,0x603,0x5ff,0x5fa,0x5f6
        .value  0x5f1,0x5ed,0x5e9,0x5e4,0x5e0,0x5dc,0x5d7,0x5d3
        .value  0x5cf,0x5cb,0x5c6,0x5c2,0x5be,0x5ba,0x5b6,0x5b2
        .value  0x5ae,0x5aa,0x5a6,0x5a2,0x59e,0x59a,0x596,0x592
        .value  0x58e,0x58a,0x586,0x583,0x57f,0x57b,0x577,0x574
        .value  0x570,0x56c,0x568,0x565,0x561,0x55e,0x55a,0x556
        .value  0x553,0x54f,0x54c,0x548,0x545,0x541,0x53e,0x53a
        .value  0x537,0x534,0x530,0x52d,0x52a,0x526,0x523,0x520
        .value  0x51c,0x519,0x516,0x513,0x50f,0x50c,0x509,0x506
        .value  0x503,0x500,0x4fc,0x4f9,0x4f6,0x4f3,0x4f0,0x4ed
        .value  0x4ea,0x4e7,0x4e4,0x4e1,0x4de,0x4db,0x4d8,0x4d5
        .value  0x4d2,0x4cf,0x4cc,0x4ca,0x4c7,0x4c4,0x4c1,0x4be
        .value  0x4bb,0x4b9,0x4b6,0x4b3,0x4b0,0x4ad,0x4ab,0x4a8
        .value  0x4a5,0x4a3,0x4a0,0x49d,0x49b,0x498,0x495,0x493
        .value  0x490,0x48d,0x48b,0x488,0x486,0x483,0x481,0x47e
        .value  0x47c,0x479,0x477,0x474,0x472,0x46f,0x46d,0x46a
        .value  0x468,0x465,0x463,0x461,0x45e,0x45c,0x459,0x457
        .value  0x455,0x452,0x450,0x44e,0x44b,0x449,0x447,0x444
        .value  0x442,0x440,0x43e,0x43b,0x439,0x437,0x435,0x432
        .value  0x430,0x42e,0x42c,0x42a,0x428,0x425,0x423,0x421
        .value  0x41f,0x41d,0x41b,0x419,0x417,0x414,0x412,0x410
        .value  0x40e,0x40c,0x40a,0x408,0x406,0x404,0x402,0x400
ASM_END()
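The 256 table entries follow the closed form noted above, floor((2^19 - 3*2^8)/d9) for d9 = 256..511. A throwaway generator that reprints the .value lines (our helper, not part of GMP):

#include <stdio.h>

/* Print approx_tab: entry i is floor((2^19 - 3*2^8) / (256 + i)),
   eight entries per .value line, matching the layout above.  */
int
main (void)
{
  for (int i = 0; i < 256; i++)
    {
      if (i % 8 == 0)
        printf ("%s.value ", i == 0 ? "" : "\n");
      else
        putchar (',');
      printf ("0x%x", ((1 << 19) - 3 * (1 << 8)) / (256 + i));
    }
  putchar ('\n');
  return 0;
}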