#ifndef MPU_MULMOD_H
#define MPU_MULMOD_H

#include "ptypes.h"

#if defined(__GNUC__)
  #define INLINE inline
#elif defined(_MSC_VER)
  #define INLINE __inline
#else
  #define INLINE
#endif

/* if n is smaller than this, you can multiply without overflow */
#define HALF_WORD (UVCONST(1) << (BITS_PER_WORD/2))

#if (BITS_PER_WORD == 32) && HAVE_STD_U64

  /* We have 64-bit available, but UV is 32-bit.  Do the math in 64-bit.
   * Even if it is emulated, it should be as fast or faster than us doing it.
   */
  #define addmod(n,a,m)  (UV)(((uint64_t)(n)+(uint64_t)(a)) % ((uint64_t)(m)))
  #define mulmod(a,b,m)  (UV)(((uint64_t)(a)*(uint64_t)(b)) % ((uint64_t)(m)))
  #define sqrmod(n,m)    (UV)(((uint64_t)(n)*(uint64_t)(n)) % ((uint64_t)(m)))

#elif defined(__GNUC__) && defined(__x86_64__)

  /* GCC on x86-64 */
  static INLINE UV _mulmod(UV a, UV b, UV c) {
    UV d; /* to hold the result of a*b mod c */
    /* Calculates a*b mod c, stores result in d.
     * Requires a < c and b < c: the 128-bit product a*b is then below c^2,
     * so the quotient fits in rax and div cannot raise a divide error. */
    asm ("mov %1, %%rax;"        /* put a into rax */
         "mul %2;"               /* mul a*b -> rdx:rax */
         "div %3;"               /* (a*b)/c -> quotient in rax, remainder in rdx */
         "mov %%rdx, %0;"        /* store result in d */
         :"=r"(d)                /* output */
         :"r"(a), "r"(b), "r"(c) /* input */
         :"%rax", "%rdx"         /* clobbered registers */
        );
    /* A version for 32-bit _MSC_VER (the 64-bit MSVC compiler does not
     * accept inline __asm at all; there one would use intrinsics such as
     * _umul128 instead):
     *
     *   __asm { mov rax, qword ptr a
     *           mul qword ptr b
     *           div qword ptr c
     *           mov qword ptr d, rdx }
     */
    return d;
  }
  #define mulmod(a,b,m)  _mulmod(a,b,m)
  #define sqrmod(n,m)    _mulmod(n,n,m)

  /* addmod from Kruppa 2010 page 67 */
  static INLINE UV _addmod(UV a, UV b, UV m) {
    UV r = a+b;  /* correct result unless a+b carried out of the word */
    UV t = a-m;  /* t+b wraps to a+b-m; its carry flags a+b >= m      */
    asm ("add %2, %1\n\t"    /* t := t + b */
         "cmovc %1, %0\n\t"  /* if (carry) r := t */
         :"+r" (r), "+&r" (t)
         :"g" (b)
         :"cc"
        );
    return r;
  }
  #define addmod(a,b,n)  _addmod(a,b,n)

#elif BITS_PER_WORD == 64 && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && (defined(__x86_64__) || defined(__powerpc64__))

  /* We're 64-bit, using a modern gcc, and the target has some 128-bit type.
   * The actual number of targets that have this implemented are limited. */
  #if __GNUC__ == 4 && __GNUC_MINOR__ >= 4 && __GNUC_MINOR__ < 6
    typedef unsigned int uint128_t __attribute__ ((__mode__ (TI)));
  #else
    typedef unsigned __int128 uint128_t;
  #endif
  #define mulmod(a,b,m)  (UV)(((uint128_t)(a)*(uint128_t)(b)) % ((uint128_t)(m)))
  #define sqrmod(n,m)    (UV)(((uint128_t)(n)*(uint128_t)(n)) % ((uint128_t)(m)))

#else

  /* UV is the largest integral type available (that we know of). */
  /* Do it by hand: shift-and-add, reducing mod m at every step. */
  static INLINE UV _mulmod(UV a, UV b, UV m) {
    UV r = 0;
    if (a >= m) a %= m;   /* Careful attention from the caller should make */
    if (b >= m) b %= m;   /* these unnecessary.                            */
    if (a < b) { UV t = a; a = b; b = t; }
    while (b > 0) {
      if (b & 1)  r = ((m-r) > a) ? r+a : r+a-m;   /* r = (r + a) % m */
      b >>= 1;
      if (b)      a = ((m-a) > a) ? a+a : a+a-m;   /* a = (a + a) % m */
    }
    return r;
  }
  #define mulmod(a,b,m)  ((((a)|(b)) < HALF_WORD) ? ((a)*(b))%(m) : _mulmod(a,b,m))
  #define sqrmod(n,m)    (((n) < HALF_WORD) ? ((n)*(n))%(m) : _mulmod(n,n,m))

#endif

#ifndef addmod
  #define addmod(a,b,m)  ((((m)-(a)) > (b))  ?  ((a)+(b))  :  ((a)+(b)-(m)))
#endif
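/* Usage sketch (illustrative only, not part of this header): a hypothetical
 * driver exercising mulmod/addmod.  UV and UVCONST come from ptypes.h; the
 * results fit in 32 bits here, so the casts below are safe on any build.
 *
 *   #include <stdio.h>
 *   #include "mulmod.h"
 *   int main(void) {
 *     UV m = UVCONST(1000000007);
 *     UV x = mulmod(UVCONST(999999999), UVCONST(999999998), m);
 *     UV y = addmod(x, UVCONST(10), m);
 *     printf("%lu %lu\n", (unsigned long)x, (unsigned long)y);
 *     return 0;
 *   }
 */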
/* We need to make sure a and b get evaluated into UVs, then do the
 * subtract into a UV before the addmod. */
static INLINE UV submod(UV a, UV b, UV m) {
  UV t1 = m - b;
  return addmod(a, t1, m);
}

/* n^2 + a mod m */
#define sqraddmod(n, a, m)     addmod(sqrmod(n,m),   a, m)
/* i*j + a mod m */
#define muladdmod(i, j, a, m)  addmod(mulmod(i,j,m), a, m)
/* i*j - a mod m */
#define mulsubmod(i, j, a, m)  submod(mulmod(i,j,m), a, m)

/* n^power mod m */
#ifndef HALF_WORD
  static INLINE UV powmod(UV n, UV power, UV m) {
    UV t = 1;
    n %= m;
    while (power) {
      if (power & 1)  t = mulmod(t, n, m);
      power >>= 1;
      if (power)      n = sqrmod(n, m);
    }
    return t;
  }
#else
  /* HALF_WORD is defined earlier in this file, so this is the version
   * normally compiled: small moduli take the fast native-multiply path. */
  static INLINE UV powmod(UV n, UV power, UV m) {
    UV t = 1;
    n %= m;
    if (m < HALF_WORD) {
      while (power) {
        if (power & 1)  t = (t*n)%m;
        power >>= 1;
        if (power)      n = (n*n)%m;
      }
    } else {
      while (power) {
        if (power & 1)  t = mulmod(t, n, m);
        power >>= 1;
        if (power)      n = sqrmod(n, m);
      }
    }
    return t;
  }
#endif

/* n^power + a mod m */
#define powaddmod(n, p, a, m)  addmod(powmod(n,p,m), a, m)

#endif
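/* Example of powmod in use (an illustrative sketch, not part of this
 * header): a base-2 Fermat probable-prime test, the classic consumer of
 * these primitives.  The function name is hypothetical.
 *
 *   static int is_fermat2_prp(UV n) {
 *     if (n < 5 || (n & 1) == 0)  return (n == 2 || n == 3);
 *     return powmod(2, n-1, n) == 1;
 *   }
 *
 * powmod reduces n internally, so the caller needs no pre-reduction. */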