invert_limb.asm
dnl  AMD64 mpn_invert_limb -- Invert a normalized limb.

dnl  Contributed to the GNU project by Torbjorn Granlund and Niels Möller.

dnl  Copyright 2004, 2007, 2008, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')
C                 cycles/limb (approx)    div
C K8,K9:                  48               71
C K10:                    48               77
C P4:                    135              161
C P6 core2:               69              116
C P6 corei7:              55               89
C P6 atom:               129              191

C Register usage: rax rcx rdx rdi rsi r8
ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_invert_limb)               C Kn  C2  Ci
        mov     %rdi, %rax              C  0   0   0
        shr     $55, %rax               C  1   1   1
ifdef(`PIC',`
ifdef(`DARWIN',`
        mov     approx_tab@GOTPCREL(%rip), %r8
        add     $-512, %r8
',`
        lea     -512+approx_tab(%rip), %r8
')',`
        movabs  $-512+approx_tab, %r8
')
        movzwl  (%r8,%rax,2), R32(%rcx) C %rcx = v0
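C d is normalized (bit 63 set), so d>>55 lies in [256,511]; the -512
C byte bias and 2-byte entries make this approx_tab[(d>>55) - 256],
C an 11-bit first estimate v0 of the reciprocal, refined below by
C three Newton iterations that roughly double the correct bits each.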

C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
        mov     %rdi, %rsi              C  0   0   0
        mov     R32(%rcx), R32(%rax)    C  4   5   5
        imul    R32(%rcx), R32(%rcx)    C  4   5   5
        shr     $24, %rsi               C  1   1   1
        inc     %rsi                    C %rsi = d40
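C d40 = floor(d/2^24) + 1, a 40-bit quantity with d/2^24 < d40 <= 2^40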
        imul    %rsi, %rcx              C  8  10   8
        shr     $40, %rcx               C 12  15  11
        sal     $11, R32(%rax)          C  5   6   6
        dec     R32(%rax)
        sub     R32(%rcx), R32(%rax)    C %rax = v1

C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
        mov     $0x1000000000000000, %rcx
        imul    %rax, %rsi              C 14  17  13
        sub     %rsi, %rcx
        imul    %rax, %rcx
        sal     $13, %rax
        shr     $47, %rcx
        add     %rax, %rcx              C %rcx = v2
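C v1 is a 21-bit estimate and v2 a 34-bit estimate, each with the
C high bit set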

C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + ((v2 >> 1) & mask)) >> 65)
        mov     %rdi, %rsi              C  0   0   0
        shr     $1, %rsi                C d/2
        sbb     %rax, %rax              C -d0 = -(d mod 2)
        sub     %rax, %rsi              C d63 = ceil(d/2)
        imul    %rcx, %rsi              C v2 * d63
        and     %rcx, %rax              C v2 * d0
        shr     $1, %rax                C (v2>>1) * d0
        sub     %rsi, %rax              C (v2>>1) * d0 - v2 * d63
        mul     %rcx
        sal     $31, %rcx
        shr     $1, %rdx
        add     %rdx, %rcx              C %rcx = v3
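C The v2 << 31 above wraps mod 2^64, dropping the implicit leading
C one bit of B + v3 (B = 2^64).
C Final fixup: rdx:rax = d*v3; adding d into both halves makes %rdx
C the high limb of d*(v3 + 1 + B), and v = v3 - %rdx is exactly
C floor((B^2 - 1)/d) - B.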
        mov     %rdi, %rax
        mul     %rcx
        add     %rdi, %rax
        mov     %rcx, %rax
        adc     %rdi, %rdx
        sub     %rdx, %rax
        ret
EPILOGUE()

        RODATA
        ALIGN(2)
approx_tab:
        .value  0x7fd,0x7f5,0x7ed,0x7e5,0x7dd,0x7d5,0x7ce,0x7c6
        .value  0x7bf,0x7b7,0x7b0,0x7a8,0x7a1,0x79a,0x792,0x78b
        .value  0x784,0x77d,0x776,0x76f,0x768,0x761,0x75b,0x754
        .value  0x74d,0x747,0x740,0x739,0x733,0x72c,0x726,0x720
        .value  0x719,0x713,0x70d,0x707,0x700,0x6fa,0x6f4,0x6ee
        .value  0x6e8,0x6e2,0x6dc,0x6d6,0x6d1,0x6cb,0x6c5,0x6bf
        .value  0x6ba,0x6b4,0x6ae,0x6a9,0x6a3,0x69e,0x698,0x693
        .value  0x68d,0x688,0x683,0x67d,0x678,0x673,0x66e,0x669
        .value  0x664,0x65e,0x659,0x654,0x64f,0x64a,0x645,0x640
        .value  0x63c,0x637,0x632,0x62d,0x628,0x624,0x61f,0x61a
        .value  0x616,0x611,0x60c,0x608,0x603,0x5ff,0x5fa,0x5f6
        .value  0x5f1,0x5ed,0x5e9,0x5e4,0x5e0,0x5dc,0x5d7,0x5d3
        .value  0x5cf,0x5cb,0x5c6,0x5c2,0x5be,0x5ba,0x5b6,0x5b2
        .value  0x5ae,0x5aa,0x5a6,0x5a2,0x59e,0x59a,0x596,0x592
        .value  0x58e,0x58a,0x586,0x583,0x57f,0x57b,0x577,0x574
        .value  0x570,0x56c,0x568,0x565,0x561,0x55e,0x55a,0x556
        .value  0x553,0x54f,0x54c,0x548,0x545,0x541,0x53e,0x53a
        .value  0x537,0x534,0x530,0x52d,0x52a,0x526,0x523,0x520
        .value  0x51c,0x519,0x516,0x513,0x50f,0x50c,0x509,0x506
        .value  0x503,0x500,0x4fc,0x4f9,0x4f6,0x4f3,0x4f0,0x4ed
        .value  0x4ea,0x4e7,0x4e4,0x4e1,0x4de,0x4db,0x4d8,0x4d5
        .value  0x4d2,0x4cf,0x4cc,0x4ca,0x4c7,0x4c4,0x4c1,0x4be
        .value  0x4bb,0x4b9,0x4b6,0x4b3,0x4b0,0x4ad,0x4ab,0x4a8
        .value  0x4a5,0x4a3,0x4a0,0x49d,0x49b,0x498,0x495,0x493
        .value  0x490,0x48d,0x48b,0x488,0x486,0x483,0x481,0x47e
        .value  0x47c,0x479,0x477,0x474,0x472,0x46f,0x46d,0x46a
        .value  0x468,0x465,0x463,0x461,0x45e,0x45c,0x459,0x457
        .value  0x455,0x452,0x450,0x44e,0x44b,0x449,0x447,0x444
        .value  0x442,0x440,0x43e,0x43b,0x439,0x437,0x435,0x432
        .value  0x430,0x42e,0x42c,0x42a,0x428,0x425,0x423,0x421
        .value  0x41f,0x41d,0x41b,0x419,0x417,0x414,0x412,0x410
        .value  0x40e,0x40c,0x40a,0x408,0x406,0x404,0x402,0x400
ASM_END()
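
To make the arithmetic easier to follow than the register-level trace, here is a minimal C sketch of the same computation, assuming the GCC/Clang unsigned __int128 extension; the function name and the approx_tab parameter (holding the 256 .value entries above) are illustrative, not GMP's C interface. Unsigned 64-bit arithmetic in C wraps mod 2^64 exactly like the 64-bit register operations in the assembly.

#include <stdint.h>

/* Sketch: for a normalized d (bit 63 set), return
   v = floor((B^2 - 1)/d) - B with B = 2^64. */
static uint64_t
invert_limb_sketch (uint64_t d, const uint16_t approx_tab[256])
{
  uint64_t d0, d40, d63, v0, v1, v2, v3, e;
  unsigned __int128 p;

  v0  = approx_tab[(d >> 55) - 256];                 /* ~11 bits  */
  d40 = (d >> 24) + 1;
  v1  = (v0 << 11) - ((v0 * v0 * d40) >> 40) - 1;    /* ~21 bits  */
  v2  = (v1 << 13)
        + ((v1 * ((UINT64_C(1) << 60) - v1 * d40)) >> 47);  /* ~34 bits */
  d0  = d & 1;
  d63 = (d >> 1) + d0;                               /* ceil(d/2) */
  e   = ((v2 >> 1) & (0 - d0)) - v2 * d63;           /* mod 2^64  */
  p   = (unsigned __int128) v2 * e;
  v3  = (v2 << 31) + (uint64_t) (p >> 65);           /* wraps mod 2^64 */
  p   = (unsigned __int128) v3 * d + d;              /* d * (v3 + 1)   */
  return v3 - ((uint64_t) (p >> 64) + d);
}

A quick cross-check under the same assumptions: for any d with the top bit set, (uint64_t)((~(unsigned __int128)0) / d) must return the same value, since floor((2^128 - 1)/d) lies in [2^64, 2^65) and the cast discards the leading 2^64.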