dive_1.c
/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2000, 2001, 2003 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#include "mpn/sparc64/sparc64.h"
/*                  64-bit divisor   32-bit divisor
                     cycles/limb      cycles/limb
                       (approx)         (approx)
   Ultrasparc 2i:        110               70
*/
/* There are two key ideas here to reduce mulx's.  Firstly when the divisor
   is 32-bits the high of q*d can be calculated without the two 32x32->64
   cross-products involving the high 32-bits of the divisor, that being zero
   of course.  Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save
   one mulx (each) knowing the low of q*d is equal to the input limb l.

   For size==1, a simple udivx is used.  This is faster than calculating an
   inverse.

   For a 32-bit divisor and small sizes, an attempt was made at a simple
   udivx loop (two per 64-bit limb), but it turned out to be slower than
   mul-by-inverse.  At size==2 the inverse is about 260 cycles total
   compared to a udivx at 291.  Perhaps the latter would suit when size==2
   but the high 32-bits of the second limb is zero (saving one udivx), but
   it doesn't seem worth a special case just for that.  */
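
/* A rough sketch of the mul-by-inverse idea used below (illustration only,
   not part of the library; plain uint64_t stands in for mp_limb_t and the
   GCC-style unsigned __int128 for the double-limb product):

     uint64_t i = d;                  // Newton iteration for i = d^-1 mod 2^64,
     for (int k = 0; k < 5; k++)      // valid for odd d, correct bits double
       i *= 2 - d * i;                //   each step (3 -> 6 -> ... -> 96 >= 64)
     uint64_t q = l * i;              // exact quotient limb: q*d == l mod 2^64
     uint64_t borrow =                // high half of q*d, to be subtracted
       (uint64_t) (((unsigned __int128) q * d) >> 64);   //   from the next limb

   The loops below do the same, but compute the high half of q*d with the
   umul_ppmm_*lowequal macros instead of a full 64x64->128 multiply.  */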
void
mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
{
  mp_limb_t  inverse, s, s_next, c, l, ls, q;
  unsigned   rshift, lshift;
  mp_limb_t  lshift_mask;
  mp_limb_t  divisor_h;

  ASSERT (size >= 1);
  ASSERT (divisor != 0);
  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
  ASSERT_MPN (src, size);
  ASSERT_LIMB (divisor);

  s = *src++;                 /* src low limb */
  size--;
  if (size == 0)
    {
      *dst = s / divisor;
      return;
    }
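
  /* An even divisor 2^rshift * d with d odd is handled by shifting the
     dividend right by rshift on the fly and dividing by the odd part d.
     The division being exact, the rshift low bits of the dividend are
     zero, so nothing is lost by the shift.  */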
  if ((divisor & 1) == 0)
    {
      count_trailing_zeros (rshift, divisor);
      divisor >>= rshift;
    }
  else
    rshift = 0;

  binvert_limb (inverse, divisor);
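
  /* inverse is the multiplicative inverse of the (now odd) divisor modulo
     2^64, i.e. divisor * inverse == 1 mod 2^64, so each exact quotient
     limb is obtained with a single multiply by inverse.  */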
  lshift = 64 - rshift;

  /* lshift==64 means no shift, so must mask out other part in this case */
  lshift_mask = (rshift == 0 ? 0 : MP_LIMB_T_MAX);

  c = 0;
  divisor_h = HIGH32 (divisor);

  if (divisor_h == 0)
    {
      /* 32-bit divisor */
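      /* With the divisor fitting in 32 bits, only HIGH32(q)*divisor need be
         formed explicitly: since the low 64 bits of q*divisor are known to
         equal l, the carry into the high half can be deduced from l, which
         is roughly what umul_ppmm_half_lowequal exploits.  */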
      do
        {
          s_next = *src++;
          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
          s = s_next;

          SUBC_LIMB (c, l, ls, c);

          q = l * inverse;
          *dst++ = q;

          umul_ppmm_half_lowequal (l, q, divisor, l);
          c += l;

          size--;
        }
      while (size != 0);

      ls = s >> rshift;
      l = ls - c;
      q = l * inverse;
      *dst = q;
    }
  else
    {
      /* 64-bit divisor */
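      /* For a full 64-bit divisor, knowing the low 64 bits of q*divisor
         equal l lets umul_ppmm_lowequal skip one of the four 32x32->64
         cross-products that a complete 64x64->128 multiply would need.  */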
      mp_limb_t  divisor_l = LOW32 (divisor);

      do
        {
          s_next = *src++;
          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
          s = s_next;

          SUBC_LIMB (c, l, ls, c);

          q = l * inverse;
          *dst++ = q;

          umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l);
          c += l;

          size--;
        }
      while (size != 0);

      ls = s >> rshift;
      l = ls - c;
      q = l * inverse;
      *dst = q;
    }
}
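
/* Usage sketch (illustration only, assuming a 64-bit limb build without
   nails): mpn_divexact_1 requires {src,size} to be exactly divisible by
   divisor, e.g.

     mp_limb_t  src[2] = { 21, 0 };   -- least significant limb first, a multiple of 7
     mp_limb_t  dst[2];
     mpn_divexact_1 (dst, src, 2, CNST_LIMB (7));   -- {dst,2} is now {3, 0}

   Results are undefined if the division is not exact.  */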