/* Mulders' MulHigh function (short product) Copyright 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. Contributed by the AriC and Caramel projects, INRIA. This file is part of the GNU MPFR Library. The GNU MPFR Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The GNU MPFR Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ /* References: [1] Short Division of Long Integers, David Harvey and Paul Zimmermann, Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20), July 25-27, 2011, pages 7-14. */ #define MPFR_NEED_LONGLONG_H #include "mpfr-impl.h" #ifndef MUL_FFT_THRESHOLD #define MUL_FFT_THRESHOLD 8448 #endif /* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */ #ifdef MPFR_MULHIGH_TAB_SIZE static short mulhigh_ktab[MPFR_MULHIGH_TAB_SIZE]; #else static short mulhigh_ktab[] = {MPFR_MULHIGH_TAB}; #define MPFR_MULHIGH_TAB_SIZE \ ((mp_size_t) (sizeof(mulhigh_ktab) / sizeof(mulhigh_ktab[0]))) #endif /* Put in rp[n..2n-1] an approximation of the n high limbs of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the approximation is always less or equal to the truncated full product). Assume 2n limbs are allocated at rp. Implements Algorithm ShortMulNaive from [1]. */ static void mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up, mpfr_limb_srcptr vp, mp_size_t n) { mp_size_t i; rp += n - 1; umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0], which is less than B^n */ for (i = 1 ; i < n ; i++) /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */ rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]); /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */ } /* Put in rp[0..n] the n+1 low limbs of {up, n} * {vp, n}. Assume 2n limbs are allocated at rp. */ static void mpfr_mullow_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up, mpfr_limb_srcptr vp, mp_size_t n) { mp_size_t i; rp[n] = mpn_mul_1 (rp, up, n, vp[0]); for (i = 1 ; i < n ; i++) mpn_addmul_1 (rp + i, up, n - i + 1, vp[i]); } /* Put in rp[n..2n-1] an approximation of the n high limbs of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the approximation is always less or equal to the truncated full product). Implements Algorithm ShortMul from [1]. */ void mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp, mp_size_t n) { mp_size_t k; MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */ k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4); /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates into k >= (n+4)/2 in the C language. */ MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n)); if (k < 0) mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */ else if (k == 0) mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */ else if (n > MUL_FFT_THRESHOLD) mpn_mul_n (rp, np, mp, n); /* result is exact, no error */ else { mp_size_t l = n - k; mp_limb_t cy; mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */ mpfr_mulhigh_n (rp, np + k, mp, l); /* fills rp[l-1..2l-1] */ cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1); mpfr_mulhigh_n (rp, np, mp + k, l); /* fills rp[l-1..2l-1] */ cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1); mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */ } } /* Put in rp[0..n] the n+1 low limbs of {np, n} * {mp, n}. Assume 2n limbs are allocated at rp. */ void mpfr_mullow_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp, mp_size_t n) { mp_size_t k; MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */ k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4); MPFR_ASSERTD (k == -1 || k == 0 || (2 * k >= n && k < n)); if (k < 0) mpn_mul_basecase (rp, np, n, mp, n); else if (k == 0) mpfr_mullow_n_basecase (rp, np, mp, n); else if (n > MUL_FFT_THRESHOLD) mpn_mul_n (rp, np, mp, n); else { mp_size_t l = n - k; mpn_mul_n (rp, np, mp, k); /* fills rp[0..2k] */ mpfr_mullow_n (rp + n, np + k, mp, l); /* fills rp[n..n+2l] */ mpn_add_n (rp + k, rp + k, rp + n, l + 1); mpfr_mullow_n (rp + n, np, mp + k, l); /* fills rp[n..n+2l] */ mpn_add_n (rp + k, rp + k, rp + n, l + 1); } } #ifdef MPFR_SQRHIGH_TAB_SIZE static short sqrhigh_ktab[MPFR_SQRHIGH_TAB_SIZE]; #else static short sqrhigh_ktab[] = {MPFR_SQRHIGH_TAB}; #define MPFR_SQRHIGH_TAB_SIZE (sizeof(sqrhigh_ktab) / sizeof(sqrhigh_ktab[0])) #endif /* Put in rp[n..2n-1] an approximation of the n high limbs of {np, n}^2. The error is less than n ulps of rp[n]. */ void mpfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n) { mp_size_t k; MPFR_ASSERTN (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */ k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n] : (n+4)/2; /* ensures that k >= (n+3)/2 */ MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n)); if (k < 0) /* we can't use mpn_sqr_basecase here, since it requires n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD is not exported by GMP */ mpn_sqr_n (rp, np, n); else if (k == 0) mpfr_mulhigh_n_basecase (rp, np, np, n); else { mp_size_t l = n - k; mp_limb_t cy; mpn_sqr_n (rp + 2 * l, np + l, k); /* fills rp[2l..2n-1] */ mpfr_mulhigh_n (rp, np, np + k, l); /* fills rp[l-1..2l-1] */ /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */ cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1); cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1); mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */ } } #ifdef MPFR_DIVHIGH_TAB_SIZE static short divhigh_ktab[MPFR_DIVHIGH_TAB_SIZE]; #else static short divhigh_ktab[] = {MPFR_DIVHIGH_TAB}; #define MPFR_DIVHIGH_TAB_SIZE (sizeof(divhigh_ktab) / sizeof(divhigh_ktab[0])) #endif #ifndef __GMPFR_GMP_H__ #define mpfr_pi1_t gmp_pi1_t /* with a GMP build */ #endif #if !(defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)) /* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n}, with the most significant limb of the quotient as return value (0 or 1). Assumes the most significant bit of D is set. Clobbers N. The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4. */ static mp_limb_t mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_srcptr dp, mp_size_t n) { mp_limb_t qh, d1, d0, dinv, q2, q1, q0; mpfr_pi1_t dinv2; np += n; if ((qh = (mpn_cmp (np, dp, n) >= 0))) mpn_sub_n (np, np, dp, n); /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */ d1 = dp[n - 1]; if (n == 1) { invert_limb (dinv, d1); umul_ppmm (q1, q0, np[0], dinv); qp[0] = np[0] + q1; return qh; } /* now n >= 2 */ d0 = dp[n - 2]; invert_pi1 (dinv2, d1, d0); /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */ while (n > 1) { /* Invariant: it remains to reduce n limbs from N (in addition to the initial low n limbs). Since n >= 2 here, necessarily we had n >= 2 initially, which means that in addition to the limb np[n-1] to reduce, we have at least 2 extra limbs, thus accessing np[n-3] is valid. */ /* warning: we can have np[n-1]=d1 and np[n-2]=d0, but since {np,n} < D, the largest possible partial quotient is B-1 */ if (MPFR_UNLIKELY(np[n - 1] == d1 && np[n - 2] == d0)) q2 = ~ (mp_limb_t) 0; else udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3], d1, d0, dinv2.inv32); /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)), we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0), thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0) and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1) thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0 which proves that at most one correction is needed */ q0 = mpn_submul_1 (np - 1, dp, n, q2); if (MPFR_UNLIKELY(q0 > np[n - 1])) { mpn_add_n (np - 1, np - 1, dp, n); q2 --; } qp[--n] = q2; dp ++; } /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1 q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1) <= floor((np[0]*B+np[1])/d1) thus q1 is not larger than the true quotient. q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2 For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0)) thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e., (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0) d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2 thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4. For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 > np[0]*B/d1 - 2. In all cases, if q = floor((np[0]*B+np[1])/d1), we have: q - 4 <= q1 <= q */ umul_ppmm (q1, q0, np[0], dinv2.inv32); qp[0] = np[0] + q1; return qh; } #endif /* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n}, with the most significant limb of the quotient as return value (0 or 1). Assumes the most significant bit of D is set. Clobbers N. This implements the ShortDiv algorithm from reference [1]. */ #if 1 mp_limb_t mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp, mp_size_t n) { mp_size_t k, l; mp_limb_t qh, cy; mpfr_limb_ptr tp; MPFR_TMP_DECL(marker); MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */ k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3); if (k == 0) #if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q) { mpfr_pi1_t dinv2; invert_pi1 (dinv2, dp[n - 1], dp[n - 2]); return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32); } #else /* use our own code for base-case short division */ return mpfr_divhigh_n_basecase (qp, np, dp, n); #endif else if (k == n) /* for k=n, we use a division with remainder (mpn_divrem), which computes the exact quotient */ return mpn_divrem (qp, 0, np, 2 * n, dp, n); MPFR_ASSERTD ((n+4)/2 <= k && k < n); /* bounds from [1] */ MPFR_TMP_MARK (marker); l = n - k; /* first divide the most significant 2k limbs from N by the most significant k limbs of D */ qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */ /* it remains {np,2l+k} = {np,n+l} as remainder */ /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and D0={dp,l} */ tp = MPFR_TMP_LIMBS_ALLOC (2 * l); mpfr_mulhigh_n (tp, qp + k, dp, l); /* we are only interested in the upper l limbs from {tp,2l} */ cy = mpn_sub_n (np + n, np + n, tp + l, l); if (qh) cy += mpn_sub_n (np + n, np + n, dp, l); while (cy > 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */ { qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE); cy -= mpn_add_n (np + l, np + l, dp, n); } /* now it remains {np,n+l} to divide by D */ cy = mpfr_divhigh_n (qp, np + k, dp + k, l); qh += mpn_add_1 (qp + l, qp + l, k, cy); MPFR_TMP_FREE(marker); return qh; } #else /* below is the FoldDiv(K) algorithm from [1] */ mp_limb_t mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp, mp_size_t n) { mp_size_t k, r; mpfr_limb_ptr ip, tp, up; mp_limb_t qh = 0, cy, cc; int count; MPFR_TMP_DECL(marker); #define K 3 if (n < K) return mpn_divrem (qp, 0, np, 2 * n, dp, n); k = (n - 1) / K + 1; /* ceil(n/K) */ MPFR_TMP_MARK (marker); ip = MPFR_TMP_LIMBS_ALLOC (k + 1); tp = MPFR_TMP_LIMBS_ALLOC (n + k); up = MPFR_TMP_LIMBS_ALLOC (2 * (k + 1)); mpn_invert (ip, dp + n - (k + 1), k + 1, NULL); /* takes about 13% for n=1000 */ /* {ip, k+1} = floor((B^(2k+2)-1)/D - B^(k+1) where D = {dp+n-(k+1),k+1} */ for (r = n, cc = 0UL; r > 0;) { /* cc is the carry at np[n+r] */ MPFR_ASSERTD(cc <= 1); /* FIXME: why can we have cc as large as say 8? */ count = 0; while (cc > 0) { count ++; MPFR_ASSERTD(count <= 1); /* subtract {dp+n-r,r} from {np+n,r} */ cc -= mpn_sub_n (np + n, np + n, dp + n - r, r); /* add 1 at qp[r] */ qh += mpn_add_1 (qp + r, qp + r, n - r, 1UL); } /* it remains r limbs to reduce, i.e., the remainder is {np, n+r} */ if (r < k) { ip += k - r; k = r; } /* now r >= k */ /* qp + r - 2 * k -> up */ mpfr_mulhigh_n (up, np + n + r - (k + 1), ip, k + 1); /* take into account the term B^k in the inverse: B^k * {np+n+r-k, k} */ cy = mpn_add_n (qp + r - k, up + k + 2, np + n + r - k, k); /* since we need only r limbs of tp (below), it suffices to consider r high limbs of dp */ if (r > k) { #if 0 mpn_mul (tp, dp + n - r, r, qp + r - k, k); #else /* use a short product for the low k x k limbs */ /* we know the upper k limbs of the r-limb product cancel with the remainder, thus we only need to compute the low r-k limbs */ if (r - k >= k) mpn_mul (tp + k, dp + n - r + k, r - k, qp + r - k, k); else /* r-k < k */ { /* #define LOW */ #ifndef LOW mpn_mul (tp + k, qp + r - k, k, dp + n - r + k, r - k); #else mpfr_mullow_n_basecase (tp + k, qp + r - k, dp + n - r + k, r - k); /* take into account qp[2r-2k] * dp[n - r + k] */ tp[r] += qp[2*r-2*k] * dp[n - r + k]; #endif /* tp[k..r] is filled */ } #if 0 mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k); #else /* compute one more limb. FIXME: we could add one limb of dp in the above, to save one mpn_addmul_1 call */ mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k - 1); /* {up,2k-2} */ /* add {qp + r - k, k - 1} * dp[n-r+k-1] */ up[2*k-2] = mpn_addmul_1 (up + k - 1, qp + r - k, k-1, dp[n-r+k-1]); /* add {dp+n-r, k} * qp[r-1] */ up[2*k-1] = mpn_addmul_1 (up + k - 1, dp + n - r, k, qp[r-1]); #endif #ifndef LOW cc = mpn_add_n (tp + k, tp + k, up + k, k); mpn_add_1 (tp + 2 * k, tp + 2 * k, r - k, cc); #else /* update tp[k..r] */ if (r - k + 1 <= k) mpn_add_n (tp + k, tp + k, up + k, r - k + 1); else /* r - k >= k */ { cc = mpn_add_n (tp + k, tp + k, up + k, k); mpn_add_1 (tp + 2 * k, tp + 2 * k, r - 2 * k + 1, cc); } #endif #endif } else /* last step: since we only want the quotient, no need to update, just propagate the carry cy */ { MPFR_ASSERTD(r < n); if (cy > 0) qh += mpn_add_1 (qp + r, qp + r, n - r, cy); break; } /* subtract {tp, n+k} from {np+r-k, n+k}; however we only want to update {np+n, n} */ /* we should have tp[r] = np[n+r-k] up to 1 */ MPFR_ASSERTD(tp[r] == np[n + r - k] || tp[r] + 1 == np[n + r - k]); #ifndef LOW cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r + 1); /* borrow at np[n+r] */ #else cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r - k + 2); #endif /* if cy = 1, subtract {dp, n} from {np+r, n}, thus {dp+n-r,r} from {np+n,r} */ if (cy) { if (r < n) cc += mpn_sub_n (np + n - 1, np + n - 1, dp + n - r - 1, r + 1); else cc += mpn_sub_n (np + n, np + n, dp + n - r, r); /* propagate cy */ if (r == n) qh = cy; else qh += mpn_add_1 (qp + r, qp + r, n - r, cy); } /* cc is the borrow at np[n+r] */ count = 0; while (cc > 0) /* quotient was too large */ { count++; MPFR_ASSERTD (count <= 1); cy = mpn_add_n (np + n, np + n, dp + n - (r - k), r - k); cc -= mpn_add_1 (np + n + r - k, np + n + r - k, k, cy); qh -= mpn_sub_1 (qp + r - k, qp + r - k, n - (r - k), 1UL); } r -= k; cc = np[n + r]; } MPFR_TMP_FREE(marker); return qh; } #endif