The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/*
 * Utility functions, such as sqrt mod p, polynomial manipulation, etc.
 */

#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <gmp.h>
#include <math.h>

#include "ptypes.h"

/* includes mpz_mulmod(r, a, b, n, temp) */
#include "utility.h"
#include "factor.h"
#include "primality.h"
#include "isaac.h"

static int _verbose = 0;
int get_verbose_level(void) { return _verbose; }
void set_verbose_level(int level) { _verbose = level; }

static gmp_randstate_t _randstate;
gmp_randstate_t* get_randstate(void) { return &_randstate; }

#if __LITTLE_ENDIAN__ || (defined(BYTEORDER) && (BYTEORDER == 0x1234 || BYTEORDER == 0x12345678))
#define LESWAP(mem, val)
#else
#if !defined(__x86_64__)
#undef U8TO32_LE
#undef U32TO8_LE
#endif
#ifndef U32TO8_LE
#define U32TO8_LE(p, v) \
  do { \
    uint32_t _v = v; \
    (p)[0] = (((_v)      ) & 0xFFU); \
    (p)[1] = (((_v) >>  8) & 0xFFU); \
    (p)[2] = (((_v) >> 16) & 0xFFU); \
    (p)[3] = (((_v) >> 24) & 0xFFU); \
  } while (0)
#endif
#define LESWAP(mem, val) U32TO8_LE(mem,val)
#endif

void init_randstate(unsigned long seed) {
  unsigned char seedstr[8] = {0};
#if (__GNU_MP_VERSION > 4) || (__GNU_MP_VERSION == 4 && __GNU_MP_VERSION_MINOR >= 2)
  /* MT was added in GMP 4.2 released in 2006. */
  gmp_randinit_mt(_randstate);
#else
  gmp_randinit_default(_randstate);
#endif
  gmp_randseed_ui(_randstate, seed);

#if BITS_PER_WORD == 64
  if (seed > UVCONST(4294967295)) {
    LESWAP(seedstr, seed);
    LESWAP(seedstr + 4, (seed >> 16) >> 16);
    isaac_init(8, seedstr);
  } else
#endif
  {
    LESWAP(seedstr, seed);
    isaac_init(4, seedstr);
  }
}
void clear_randstate(void) {  gmp_randclear(_randstate);  }

UV irand64(int nbits)
{
  if (nbits ==  0) return 0;
  if (nbits <= 32) return( isaac_rand32() >> (32-nbits) );
#if BITS_PER_WORD == 64
  if (nbits <= 64) return( ((UV)isaac_rand32() | ((UV)isaac_rand32() << 32)) >> (64-nbits) );
#endif
  croak("irand64 too many bits for UV");
}
static NV _tonv_32 = -1.0;
static NV _tonv_64;
NV drand64(void)
{
  if (_tonv_32 < 0) {
    int i;
    NV t64, t32 = 1.0;
    for (i = 0; i < 32; i++)
      t32 /= 2.0;
    t64 = t32;
    for (i = 0; i < 32; i++)
      t64 /= 2.0;
    _tonv_64 = t64;
    _tonv_32 = t32;
  }
  return isaac_rand32() * _tonv_32 + isaac_rand32() * _tonv_64;
}

void mpz_isaac_urandomb(mpz_t rop, int nbits)
{
  if (nbits <= 32) {
    mpz_set_ui(rop, irand64(nbits));
  } else {
    unsigned char* d;
    int nbytes = (nbits+7)/8;
    New(0, d, nbytes, unsigned char);
    isaac_rand_bytes(nbytes, d);
    mpz_import(rop, nbytes, 1, sizeof(unsigned char), 0, 0, d);
    Safefree(d);
    if (nbits != nbytes*8)
      mpz_tdiv_r_2exp(rop, rop, nbits);
  }
}

void mpz_isaac_urandomm(mpz_t rop, mpz_t n)
{
  int count = 80;
  unsigned long nbits = mpz_sizeinbase(n,2);

  if (mpz_sgn(n) <= 0) {
    mpz_set_ui(rop,0);
    return;
  } else if (nbits <= 32) {
    mpz_set_ui(rop, isaac_rand(mpz_get_ui(n)));
  } else if (nbits < 3000) {
    /* Just like GMP, try until we're in range or we're tired. */
    while (count-- > 0) {
      mpz_isaac_urandomb(rop, nbits);
      if (mpz_cmp(rop,n) < 0)
        return;
    }
    mpz_mod(rop, rop, n);
  } else {
    /* Reduce tries needed by selecting from a range that is a multiple of n
     * (so no bias) and uses the max space inside the power-of-2 range.
     * Downside is that we do an alloc and two mods.  For large values
     * it can be much faster however. */
    mpz_t rmax;
    mpz_init(rmax);
    mpz_setbit(rmax, nbits+8);
    mpz_sub_ui(rmax,rmax,1);
    mpz_tdiv_q(rmax, rmax, n);
    mpz_mul(rmax, rmax, n);
    do {
      mpz_isaac_urandomb(rop, nbits+8);
    } while (mpz_cmp(rop, rmax) >= 0 && count-- > 0);
    mpz_mod(rop, rop, n);
  }
}

int is_primitive_root(mpz_t ina, mpz_t n, int nprime)
{
  mpz_t a, s, sreduced, t, *factors;
  int ret, i, nfactors, *exponents;

  if (mpz_sgn(n) == 0)
    return 0;
  if (mpz_sgn(n) < 0)
    mpz_neg(n,n);
  if (mpz_cmp_ui(n,1) == 0)
    return 1;

  mpz_init(a);
  mpz_mod(a,ina,n);
  mpz_init(s);
  mpz_gcd(s, a, n);

  if (mpz_cmp_ui(s,1) != 0) {
    mpz_clear(s);
    mpz_clear(a);
    return 0;
  }
  if (nprime) {
    mpz_sub_ui(s, n, 1);
  } else {
    totient(s, n);
  }
  mpz_init_set(sreduced, s);
  mpz_init(t);
  ret = 1;

#define IPR_TEST_UI(s, p, a, n, t, ret) \
  mpz_divexact_ui(t, s, p); \
  mpz_powm(t, a, t, n); \
  if (mpz_cmp_ui(t, 1) == 0) { ret = 0; }

#define IPR_TEST(s, p, a, n, t, ret) \
  mpz_divexact(t, s, p); \
  mpz_powm(t, a, t, n); \
  if (mpz_cmp_ui(t, 1) == 0) { ret = 0; }

  { /* Pull out small factors and test */
    UV p, fromp = 0;
    while (ret == 1 && (p = _GMP_trial_factor(sreduced, fromp, 60))) {
      if (mpz_cmp_ui(sreduced,p) == 0) break;
      IPR_TEST_UI(s, p, a, n, t, ret);
      mpz_set_ui(t, p);
      (void) mpz_remove(sreduced, sreduced, t);
      fromp = p+1;
    }
  }
  if (ret == 0 || mpz_cmp_ui(sreduced,1) == 0)
    goto DONE_IPR;
  if (_GMP_BPSW(sreduced)) {
    IPR_TEST(s, sreduced, a, n, t, ret);
    goto DONE_IPR;
  }

  /* Pull out more factors, noting they can be composites. */
  while (_GMP_pminus1_factor(sreduced, t, 100000, 100000)) {
    mpz_divexact(sreduced, sreduced, t);
    if (_GMP_BPSW(t)) {
      IPR_TEST(s, t, a, n, t, ret);
    } else {
      nfactors = factor(t, &factors, &exponents);
      for (i = 0; ret == 1 && i < nfactors; i++) {
        IPR_TEST(s, factors[i], a, n, t, ret);
      }
      clear_factors(nfactors, &factors, &exponents);
    }
    if (ret == 0 || mpz_cmp_ui(sreduced,1) == 0)
      goto DONE_IPR;
    if (_GMP_BPSW(sreduced)) {
      IPR_TEST(s, sreduced, a, n, t, ret);
      goto DONE_IPR;
    }
  }

  /* We have a composite and so far it could be a primitive root.  Factor. */
  nfactors = factor(sreduced, &factors, &exponents);
  for (i = 0; ret == 1 && i < nfactors; i++) {
    IPR_TEST(s, factors[i], a, n, t, ret);
  }
  clear_factors(nfactors, &factors, &exponents);

DONE_IPR:
  mpz_clear(sreduced);  mpz_clear(t);
  mpz_clear(s);  mpz_clear(a);
  return ret;
}


int mpz_divmod(mpz_t r, mpz_t a, mpz_t b, mpz_t n, mpz_t t)
{
  int invertible;
  invertible = mpz_invert(t, b, n);
  if (!invertible)
    return 0;
  mpz_mulmod(r, t, a, n, t); /* mpz_mul(t,t,a); mpz_mod(r,t,n); */
  return 1;
}

/* set x to sqrt(a) mod p.  Returns 0 if a is not a square root mod p
 * See Cohen section 1.5 and http://www.math.vt.edu/people/brown/doc/sqrts.pdf
 */
int sqrtmod(mpz_t s, mpz_t a, mpz_t p) {
  int res;
  mpz_t x, t1, t2, t3, t4;
  mpz_init(x); mpz_init(t1); mpz_init(t2); mpz_init(t3); mpz_init(t4);
  res = sqrtmod_t(x, a, p, t1, t2, t3, t4);
  mpz_set(s, x);
  mpz_clear(x); mpz_clear(t1); mpz_clear(t2); mpz_clear(t3); mpz_clear(t4);
  return res;
}

/* Returns 1 if x^2 = a mod p, otherwise set x to 0 and return 0. */
static int verify_sqrt(mpz_t x, mpz_t a, mpz_t p, mpz_t t, mpz_t t2) {
  /* reflect to get the smaller of +/- x */
  mpz_sub(t, p, x); if (mpz_cmp(t,x) < 0) mpz_set(x,t);

  mpz_mulmod(t, x, x, p, t2);
  mpz_mod(t2, a, p);
  if (mpz_cmp(t, t2) == 0) return 1;
  mpz_set_ui(x, 0);
  return 0;
}

/* Internal version that takes temp variables and x cannot overlap args */
int sqrtmod_t(mpz_t x, mpz_t a, mpz_t p,
              mpz_t t, mpz_t q, mpz_t b, mpz_t z) /* 4 temp variables */
{
  int r, e, m;

  if (mpz_cmp_ui(p,2) <= 0) {
    if (mpz_cmp_ui(p,0) <= 0) {
      mpz_set_ui(x,0);
      return 0;
    }
    mpz_mod(x, a, p);
    return verify_sqrt(x, a, p, t, q);
  }
  if (!mpz_cmp_ui(a,0) || !mpz_cmp_ui(a,1)) {
    mpz_set(x,a);
    return verify_sqrt(x, a, p, t, q);
  }

  /* Easy cases from page 31 (or Menezes 3.36, 3.37) */
  if (mpz_congruent_ui_p(p, 3, 4)) {
    mpz_add_ui(t, p, 1);
    mpz_tdiv_q_2exp(t, t, 2);
    mpz_powm(x, a, t, p);
    return verify_sqrt(x, a, p, t, q);
  }

  if (mpz_congruent_ui_p(p, 5, 8)) {
    mpz_sub_ui(t, p, 1);
    mpz_tdiv_q_2exp(t, t, 2);
    mpz_powm(q, a, t, p);
    if (mpz_cmp_si(q, 1) == 0) {  /* s = a^((p+3)/8) mod p */
      mpz_add_ui(t, p, 3);
      mpz_tdiv_q_2exp(t, t, 3);
      mpz_powm(x, a, t, p);
    } else {                      /* s = 2a * (4a)^((p-5)/8) mod p */
      mpz_sub_ui(t, p, 5);
      mpz_tdiv_q_2exp(t, t, 3);
      mpz_mul_ui(q, a, 4);
      mpz_powm(x, q, t, p);
      mpz_mul_ui(x, x, 2);
      mpz_mulmod(x, x, a, p, x);
    }
    return verify_sqrt(x, a, p, t, q);
  }

  if (mpz_kronecker(a, p) != 1) {
    /* Possible no solution exists.  Check Euler criterion. */
    mpz_sub_ui(t, p, 1);
    mpz_tdiv_q_2exp(t, t, 1);
    mpz_powm(x, a, t, p);
    if (mpz_cmp_si(x, 1) != 0) {
      mpz_set_ui(x, 0);
      return 0;
    }
  }

  mpz_sub_ui(q, p, 1);
  e = mpz_scan1(q, 0);                 /* Remove 2^e from q */
  mpz_tdiv_q_2exp(q, q, e);
  mpz_set_ui(t, 2);
  while (mpz_kronecker(t, p) != -1) {  /* choose t "at random" */
    mpz_add_ui(t, t, 1);
    if (!mpz_cmp_ui(t,133)) {
      /* If a root of p exists, then our chances are nearly 1/2 that
       * (t|p) = -1.  After 133 tries it seems dubious that a root
       * exists.  It's likely that p is not prime. */
      if (mpz_even_p(p)) { mpz_set_ui(x,0); return 0; }
      /* Euler probable prime test with base t.  (t|p) = 1 or t divides p */
      if (mpz_divisible_p(p, t)) { mpz_set_ui(x,0); return 0; }
      mpz_sub_ui(z, p, 1);  mpz_fdiv_q_2exp(b,z,1);  mpz_powm(z, t, b, p);
      if (mpz_cmp_ui(z,1)) { mpz_set_ui(x,0); return 0; }
      /* Fermat base 2 */
      mpz_set_ui(b,2);  mpz_sub_ui(z, p, 1);  mpz_powm(z, b, z, p);
      if (mpz_cmp_ui(z,1)) { mpz_set_ui(x,0); return 0; }
    }
    if (!mpz_cmp_ui(t,286)) {
      /* Another Euler probable prime test, p not even so t can't divide. */
      mpz_sub_ui(z, p, 1);  mpz_fdiv_q_2exp(b,z,1);  mpz_powm(z, t, b, p);
      if (mpz_cmp_ui(z,1)) { mpz_set_ui(x,0); return 0; }
    }
    if (!mpz_cmp_ui(t,20000)) { mpz_set_ui(x,0); return 0; }
  }
  mpz_powm(z, t, q, p);                     /* Step 1 complete */
  r = e;

  mpz_powm(b, a, q, p);
  mpz_add_ui(q, q, 1);
  mpz_divexact_ui(q, q, 2);
  mpz_powm(x, a, q, p);   /* Done with q, will use it for y now */

  while (mpz_cmp_ui(b, 1)) {
    /* calculate how many times b^2 mod p == 1 */
    mpz_set(t, b);
    m = 0;
    do {
      mpz_powm_ui(t, t, 2, p);
      m++;
    } while (m < r && mpz_cmp_ui(t, 1));
    if (m >= r) break;
    mpz_ui_pow_ui(t, 2, r-m-1);
    mpz_powm(t, z, t, p);
    mpz_mulmod(x, x, t, p, x);
    mpz_powm_ui(z, t, 2, p);
    mpz_mulmod(b, b, z, p, b);
    r = m;
  }
  return verify_sqrt(x, a, p, t, q);
}

/* Smith-Cornacchia: Solve x,y for x^2 + |D|y^2 = p given prime p */
/* See Cohen 1.5.2 */
int cornacchia(mpz_t x, mpz_t y, mpz_t D, mpz_t p)
{
  int result = 0;
  mpz_t a, b, c, d;

  if (mpz_jacobi(D, p) < 0)     /* No solution */
    return 0;

  mpz_init(a); mpz_init(b); mpz_init(c); mpz_init(d);

  sqrtmod_t(x, D, p, a, b, c, d);
  mpz_set(a, p);
  mpz_set(b, x);
  mpz_sqrt(c, p);

  while (mpz_cmp(b,c) > 0) {
    mpz_set(d, a);
    mpz_set(a, b);
    mpz_mod(b, d, b);
  }

  mpz_mul(a, b, b);
  mpz_sub(a, p, a);   /* a = p - b^2 */
  mpz_abs(d, D);      /* d = |D| */

  if (mpz_divisible_p(a, d)) {
    mpz_divexact(c, a, d);
    if (mpz_perfect_square_p(c)) {
      mpz_set(x, b);
      mpz_sqrt(y, c);
      result = 1;
    }
  }

  mpz_clear(a); mpz_clear(b); mpz_clear(c); mpz_clear(d);

  return result;
}

/* Modified Cornacchia, Solve x,y for x^2 + |D|y^2 = 4p given prime p */
/* See Cohen 1.5.3 */
int modified_cornacchia(mpz_t x, mpz_t y, mpz_t D, mpz_t p)
{
  int result = 0;
  mpz_t a, b, c, d;

  if (mpz_cmp_ui(p, 2) == 0) {
    mpz_add_ui(x, D, 8);
    if (mpz_perfect_square_p(x)) {
      mpz_sqrt(x, x);
      mpz_set_ui(y, 1);
      result = 1;
    }
    return result;
  }
  if (mpz_jacobi(D, p) == -1)     /* No solution */
    return 0;

  mpz_init(a); mpz_init(b); mpz_init(c); mpz_init(d);

  sqrtmod_t(x, D, p, a, b, c, d);
  if ( (mpz_even_p(D) && mpz_odd_p(x)) || (mpz_odd_p(D) && mpz_even_p(x)) )
    mpz_sub(x, p, x);

  mpz_mul_ui(a, p, 2);
  mpz_set(b, x);
  mpz_sqrt(c, p);
  mpz_mul_ui(c, c, 2);

  /* Euclidean algorithm */
  while (mpz_cmp(b, c) > 0) {
    mpz_set(d, a);
    mpz_set(a, b);
    mpz_mod(b, d, b);
  }

  mpz_mul_ui(c, p, 4);
  mpz_mul(a, b, b);
  mpz_sub(a, c, a);   /* a = 4p - b^2 */
  mpz_abs(d, D);      /* d = |D| */

  if (mpz_divisible_p(a, d)) {
    mpz_divexact(c, a, d);
    if (mpz_perfect_square_p(c)) {
      mpz_set(x, b);
      mpz_sqrt(y, c);
      result = 1;
    }
  }

  mpz_clear(a); mpz_clear(b); mpz_clear(c); mpz_clear(d);

  return result;
}


/* Modular inversion: invert a mod p.
 * This implementation from William Hart, using extended gcd.
 */
unsigned long modinverse(unsigned long a, unsigned long p)
{
  long u1 = 1;
  long u3 = a;
  long v1 = 0;
  long v3 = p;
  long t1 = 0;
  long t3 = 0;
  long quot;
  while (v3) {
    quot = u3 - v3;
    if (u3 < (v3<<2)) {
      if (quot < v3) {
        if (quot < 0) {
          t1 = u1; u1 = v1; v1 = t1;
          t3 = u3; u3 = v3; v3 = t3;
        } else {
          t1 = u1 - v1; u1 = v1; v1 = t1;
          t3 = u3 - v3; u3 = v3; v3 = t3;
        }
      } else if (quot < (v3<<1)) {
        t1 = u1 - (v1<<1); u1 = v1; v1 = t1;
        t3 = u3 - (v3<<1); u3 = v3; v3 = t3;
      } else {
        t1 = u1 - v1*3; u1 = v1; v1 = t1;
        t3 = u3 - v3*3; u3 = v3; v3 = t3;
      }
    } else {
      quot = u3 / v3;
      t1 = u1 - v1*quot; u1 = v1; v1 = t1;
      t3 = u3 - v3*quot; u3 = v3; v3 = t3;
    }
 }
 if (u1 < 0) u1 += p;
 return u1;
}

#if __GNU_MP_VERSION < 5
extern void gcdext(mpz_t g, mpz_t s, mpz_t t, const mpz_t ia, const mpz_t ib)
{
  mpz_t a, b;
  mpz_init_set(a, ia);
  mpz_init_set(b, ib);

  if (mpz_sgn(a) == 0 || mpz_cmp(a,b) == 0) {
    mpz_set_si(s, 0);
    mpz_set_si(t, mpz_sgn(b));
    mpz_abs(g, b);
  } else if (mpz_sgn(b) == 0) {
    mpz_set_si(s, mpz_sgn(a));
    mpz_set_si(t, 0);
    mpz_abs(g, a);
  } else {
    mpz_t os, ot, or, r, q, tmp;
    mpz_init(os);  mpz_init(ot);  mpz_init(or);
    mpz_init(r);  mpz_init(q);  mpz_init(tmp);
    mpz_set_ui(s,0);  mpz_set_ui(os,1);
    mpz_set_ui(t,1);  mpz_set_ui(ot,0);
    mpz_set(r,b);  mpz_set(or,a);
    while (mpz_sgn(r)) {
      mpz_tdiv_q(q, or, r);
      mpz_set(tmp, r); mpz_mul(r, r, q); mpz_sub(r, or, r); mpz_set(or, tmp);
      mpz_set(tmp, s); mpz_mul(s, s, q); mpz_sub(s, os, s); mpz_set(os, tmp);
      mpz_set(tmp, t); mpz_mul(t, t, q); mpz_sub(t, ot, t); mpz_set(ot, tmp);
    }
    mpz_set(s, os);
    mpz_set(t, ot);
    mpz_set(g, or);
    mpz_clear(r);  mpz_clear(q);  mpz_clear(tmp);
    mpz_clear(os);  mpz_clear(ot);  mpz_clear(or);

    if (mpz_sgn(g) < 0) {
      mpz_neg(s, s);
      mpz_neg(t, t);
      mpz_neg(g, g);
    }
  }
  mpz_clear(a); mpz_clear(b);
}
#endif

int chinese(mpz_t ret, mpz_t lcm, mpz_t *a, mpz_t *m, int items)
{
  mpz_t sum, gcd, u, v, s, t, temp1, temp2;
  int i, rval = 1;

#if 0
  if (items >= 128) {
    int first = items/2;
    mpz_t ca[2], cm[2];

    for (i = 0; i < 2; i++)
      { mpz_init(ca[i]); mpz_init(cm[i]); }
    rval = chinese(ca[0], cm[0], a, m, first);
    if (rval == 1)
      rval = chinese(ca[1], cm[1], a+first, m+first, items-first);
    if (rval == 1)
      rval = chinese(ret, lcm, ca, cm, 2);
    for (i = 0; i < 2; i++)
      { mpz_clear(ca[i]); mpz_clear(cm[i]); }
    return rval;
  }
#else
#define CRTN 8
  if (items >= 64) {
    int step = items/CRTN;
    mpz_t ca[CRTN], cm[CRTN];

    for (i = 0; i < CRTN; i++)
      { mpz_init(ca[i]); mpz_init(cm[i]); }
    for (i = 0; rval && i < CRTN; i++) {
      int citems = (i==CRTN-1) ? items-(CRTN-1)*step : step;
      rval = chinese(ca[i], cm[i], a+i*step, m+i*step, citems);
    }
    if (rval) rval = chinese(ret, lcm, ca, cm, CRTN);
    for (i = 0; i < CRTN; i++)
      { mpz_clear(ca[i]); mpz_clear(cm[i]); }
    return rval;
  }
#endif

  /* Avoid dividing by zero, which GMP doesn't enjoy. */
  for (i = 0; i < items; i++)
    if (mpz_sgn(m[i]) == 0)
      return 0;

  mpz_init(temp1); mpz_init(temp2);
  mpz_init(sum); mpz_init(gcd);
  mpz_init(s); mpz_init(t);
  mpz_init(u); mpz_init(v);

  mpz_set(lcm, m[0]);
  mpz_mod(sum, a[0], m[0]);
  for (i = 1; i < items; i++) {
    mpz_gcdext(gcd, u, v, lcm, m[i]);
    mpz_divexact(s, m[i], gcd);
    mpz_divexact(t, lcm, gcd);
    if (mpz_cmp_ui(gcd,1) != 0) {
      mpz_mod(temp1, sum, gcd);
      mpz_mod(temp2, a[i], gcd);
      if (mpz_cmp(temp1, temp2) != 0) {
        rval = 0;
        break;
      }
    }
    if (mpz_sgn(s) < 0) mpz_neg(s,s);
    if (mpz_sgn(t) < 0) mpz_neg(t,t);
    mpz_mul(lcm, lcm, s);
    if (mpz_sgn(u) < 0) mpz_add(u, u, lcm);
    if (mpz_sgn(v) < 0) mpz_add(v, v, lcm);
    mpz_mul(temp1, v, s);
    mpz_mul(v, temp1, sum);
    mpz_mul(temp1, u, t);
    mpz_mul(u, temp1, a[i]);
    mpz_add(temp1, v, u);
    mpz_mod(sum, temp1, lcm);
  }
  mpz_set(ret, sum);
  mpz_clear(sum); mpz_clear(gcd);
  mpz_clear(s); mpz_clear(t);
  mpz_clear(u); mpz_clear(v);
  mpz_clear(temp1); mpz_clear(temp2);
  return rval;
}


UV mpz_order_ui(UV r, mpz_t n, UV limit) {
  UV j;
  mpz_t t;

  /* If n < limit, set limit to n */
  if (mpz_cmp_ui(n, limit) < 0)
    limit = mpz_get_ui(n);
  mpz_init_set_ui(t, 1);
  for (j = 1; j <= limit; j++) {
    mpz_mul(t, t, n);
    mpz_mod_ui(t, t, r);
    if (!mpz_cmp_ui(t, 1))
      break;
  }
  mpz_clear(t);
  return j;
}

void mpz_arctan(mpz_t r, unsigned long base, mpz_t pow, mpz_t t1, mpz_t t2)
{
  unsigned long i = 1;
  mpz_tdiv_q_ui(r, pow, base);
  mpz_set(t1, r);
  do {
    if (base > 65535) { mpz_ui_pow_ui(t2, base, 2); mpz_tdiv_q(t1, t1, t2); }
    else               mpz_tdiv_q_ui(t1, t1, base*base);
    mpz_tdiv_q_ui(t2, t1, 2*i+1);
    if (i++ & 1) mpz_sub(r, r, t2); else mpz_add(r, r, t2);
  } while (mpz_sgn(t2));
}
void mpz_arctanh(mpz_t r, unsigned long base, mpz_t pow, mpz_t t1, mpz_t t2)
{
  unsigned long i = 1;
  mpz_tdiv_q_ui(r, pow, base);
  mpz_set(t1, r);
  do {
    if (base > 65535) { mpz_ui_pow_ui(t2, base, 2); mpz_tdiv_q(t1, t1, t2); }
    else               mpz_tdiv_q_ui(t1, t1, base*base);
    mpz_tdiv_q_ui(t2, t1, 1 + (i++ << 1));
    mpz_add(r, r, t2);
  } while (mpz_sgn(t2));
}

void mpz_product(mpz_t* A, UV a, UV b) {
  if (b <= a) {
    /* nothing */
  } else if (b == a+1) {
    mpz_mul(A[a], A[a], A[b]);
  } else if (b == a+2) {
    mpz_mul(A[a+1], A[a+1], A[a+2]);
    mpz_mul(A[a], A[a], A[a+1]);
  } else {
    UV c = a + (b-a+1)/2;
    mpz_product(A, a, c-1);
    mpz_product(A, c, b);
    mpz_mul(A[a], A[a], A[c]);
  }
}

UV logint(mpz_t n, UV base) {
  double logn, logbn, coreps;
  UV res, nbits, logn_red;

  if (mpz_cmp_ui(n,0) <= 0 || base <= 1)
    croak("mpz_logint: bad input\n");

  /* If base is a small power of 2, then this is exact */
  if (base <= 62 && (base & (base-1)) == 0)
    return mpz_sizeinbase(n, base)-1;

#if 0  /* example using mpf_log for high precision.  Slow. */
  {
    mpf_t fr, fn;
    mpf_init(fr); mpf_init(fn);
    mpf_set_z(fn, n);
    mpf_log(fr, fn);
    logn = mpf_get_d(fr);
    mpf_clear(fn); mpf_clear(fr);
    coreps = 1e-8;
  }
#endif

  /* Step 1, get an approximation of log(n) */
  nbits = mpz_sizeinbase(n,2);
  if (nbits < 768) {
    logn = log(mpz_get_d(n));
    coreps = 1e-8;
  } else {
    long double logn_adj = 45426.093625176575797967724311883L;/* log(2^65536) */
    mpz_t nr;

    for ( mpz_init_set(nr,n),  logn_red = 65536,  logn = 0;
          logn_red >= 128;
          logn_red /= 2,  logn_adj /= 2) {
      while (nbits >= 512+logn_red) {
        mpz_tdiv_q_2exp(nr, nr, logn_red);
        nbits -= logn_red;
        logn += logn_adj;
      }
    }
    logn += log(mpz_get_d(nr));
    mpz_clear(nr);
    coreps = 1e-4;
  }
  /* Step 2, approximate log_base(n) */
  logbn = logn / log(base);
  res = (UV) logbn;

  /* Step 3, ensure exact if logbn might be rounded wrong */
  if (res != (UV)(logbn+coreps) || res != (UV)(logbn-coreps)) {
    mpz_t be;
    mpz_init(be);
    /* Decrease until <= n */
    while (mpz_ui_pow_ui(be, base, res),mpz_cmp(be,n) > 0)
      res--;
    /* Increase until n+1 > n */
    while (mpz_ui_pow_ui(be, base, res+1),mpz_cmp(be,n) <= 0)
      res++;
    mpz_clear(be);
  }
  /* res is largest res such that base^res <= n */
  return res;
}

/******************************************************************************/
/*
 * Floating point routines.
 * These are not rigorously accurate.  Use MPFR if possible.
 *
 * See also: http://fredrikj.net/math/elefun.pdf
 * for how to really look at this correctly.
 *
 * Many ideas from Brent's presentation:
 * https://pdfs.semanticscholar.org/8aec/ea97b8f2f23d4f09ec8f69025598f742ae9e.pdf
 */


extern void const_pi(mpf_t pi, unsigned long prec);
extern void const_log2(mpf_t logn, unsigned long prec);

/* Log using Brent's second AGM algorithm (Sasaki and Kanada theta) */
void mpf_log(mpf_t logn, mpf_t n)
{
  mpf_t N, t, q, theta2, theta3, logdn;
  unsigned long k, bits = mpf_get_prec(logn);
  int neg = (mpf_sgn(n) < 0);

  if (mpf_sgn(n) == 0)
    croak("mpf_log(0)");
  if (mpf_cmp_ui(n,2) == 0)
    { const_log2(logn,BITS2DIGS(bits)); return; }
  if ((neg && !mpf_cmp_si(n,-1)) || (!neg && !mpf_cmp_si(n,1)))
    { mpf_set_ui(logn,0); return; }

  mpf_init2(N, bits);
  mpf_set(N, n);
  if (neg) mpf_neg(N, N);

  mpf_init2(t, 64 + bits);
  mpf_init2(q,      64 + bits);
  mpf_init2(theta2, 64 + bits);
  mpf_init2(theta3, 64 + bits);
  mpf_init2(logdn,  64 + bits);
  mpf_set_ui(logn, 0);

  /* ensure N >> 1 */
  mpf_set_ui(t, 1);  mpf_mul_2exp(t, t, 1+(35+bits)/36);
  if (mpf_cmp(N, t) <= 0) {
    /* log(n) = log(n*2^k) - k*log(2) */
    for (k = 0; mpf_cmp(N, t) <= 0; k += 16)
      mpf_mul_2exp(N, N, 16);
    if (k > 0) {
      const_log2(t, BITS2DIGS(bits));
      mpf_mul_ui(logn, t, k);
      mpf_neg(logn, logn);
    }
  }

  mpf_ui_div(q, 1, N);
  mpf_pow_ui(t,q,9); mpf_add(theta2, q, t);
  mpf_pow_ui(t,q,25); mpf_add(theta2, theta2, t);
  mpf_mul_2exp(theta2, theta2, 1);
  mpf_pow_ui(theta3,q,4);
  mpf_pow_ui(t,q,16); mpf_add(theta3, theta3, t);
  mpf_mul_2exp(theta3, theta3, 1);
  mpf_add_ui(theta3, theta3, 1);

  /* Normally we would do:
   *   mpf_mul(theta2, theta2, theta2); mpf_mul(theta3, theta3, theta3);
   *   mpf_agm(t, theta2, theta3); mpf_mul_2exp(t, t, 2);
   * but Brent points out the one term acceleration:
   *   AGM(t2^2,t3^2)  =>  AGM(2*t2*t3,t2^2+t3^2)/2
   */
  mpf_mul(t, theta2, theta3);
  mpf_mul_2exp(q, t, 1);  /* q = 2*t2*t3 */
  mpf_add(t, theta2, theta3);
  mpf_mul(t, t, t);       /* t = (t2+t3)^2 = t2^2 + t3^2 + 2*t2*t3 */
  mpf_sub(theta3, t, q);
  mpf_set(theta2, q);
  mpf_agm(t, theta2, theta3);
  mpf_mul_2exp(t, t, 1);

  const_pi(logdn, BITS2DIGS(bits));
  mpf_div(logdn, logdn, t);

  mpf_add(logn, logn, logdn);
  mpf_clear(logdn); mpf_clear(theta3); mpf_clear(theta2); mpf_clear(q);
  mpf_clear(t); mpf_clear(N);
  if (neg) mpf_neg(logn, logn);
}

/* x should be 0 < x < 1 */
static void _exp_sinh(mpf_t expx, mpf_t x, unsigned long bits)
{
  unsigned long k;
  mpf_t t, s, N, D, X;

  mpf_init2(t, 10 + bits);
  mpf_init2(s, 10 + bits);
  mpf_init2(N, 10 + bits);
  mpf_init2(D, 10 + bits);
  mpf_init2(X, 10 + bits);

  /* 1. Compute s =~ sinh(x). */
  mpf_set(s,x);
  mpf_set(N,x);
  mpf_mul(X,x,x);
  mpf_set_ui(D,1);
  for (k = 1; k < bits; k++) {
    mpf_mul(N, N, X);
    mpf_mul_ui(D, D, 2*k);
    mpf_mul_ui(D, D, 2*k+1);
    mpf_div(t, N, D);
    mpf_add(s, s, t);

    mpf_abs(t, t);
    mpf_mul_2exp(t, t, bits);
    if (mpf_cmp_d(t, .5) < 0)
      break;
  }
  mpf_clear(X); mpf_clear(D); mpf_clear(N);

  /* 2. Compute s =~ e(x) from sinh(x). */
  mpf_mul(t, s, s);
  mpf_add_ui(t, t, 1);
  mpf_sqrt(t, t);
  mpf_add(s, s, t);

  mpf_set(expx, s);
  mpf_clear(s); mpf_clear(t);
}

static void _exp_lift(mpf_t expx, mpf_t x, unsigned long bits)
{
  mpf_t s, t1, t2;
  unsigned long k;

  mpf_init2(s,  10 + bits);
  mpf_init2(t1, 10 + bits);
  mpf_init2(t2, 10 + bits);

  mpf_set(s, expx);
  mpf_log(t1, s);
  mpf_sub(t2, x, t1);     /* t2 = s-ln(x) */
  mpf_mul(t1, s, t2);     /* t1 = s(s-ln(x) */
  mpf_add(s, s, t1);
  /* third and higher orders */
  for (k = 3; k <= 8; k++) {
    mpf_mul(t1, t1, t2);
    mpf_div_ui(t1, t1, k-1);
    mpf_add(s, s, t1);
  }
  mpf_set(expx, s);
  mpf_clear(t2); mpf_clear(t1); mpf_clear(s);
}

void mpf_exp(mpf_t expn, mpf_t x)
{
  mpf_t t;
  unsigned long k, r, rbits, bits = mpf_get_prec(expn);

  if (mpf_sgn(x) == 0) { mpf_set_ui(expn, 1); return; }

  mpf_init2(t, 10 + bits);

  if (mpf_sgn(x) < 0) { /* As per Brent, exp(x) = 1/exp(-x) */
    mpf_neg(t, x);
    mpf_exp(t, t);
    if (mpf_sgn(t) != 0) mpf_ui_div(expn, 1, t);
    else                 mpf_set_ui(expn, 0);
    mpf_clear(t);
    return;
  }

  /* Doubling rule, to make -.25 < x < .25.  Speeds convergence. */
  mpf_set(t, x);
  for (k = 0; mpf_cmp_d(t, 1.0L/8192.0L) > 0; k++)
    mpf_div_2exp(t, t, 1);

  /* exp with sinh method, then Newton lift to final bits */
  for (rbits = bits, r = 0;  rbits > 4000;  rbits = (rbits+7)/8)
    r++;
  _exp_sinh(expn, t, rbits);
  while (r-- > 0) {
    rbits *= 8;
    _exp_lift(expn, t, rbits);
  }
  if (rbits < bits)
    _exp_lift(expn, t, bits);

  if (k > 0) {
    const unsigned long maxred = 8*sizeof(unsigned long)-1;
    for ( ; k > maxred; k -= maxred)
      mpf_pow_ui(expn, expn, 1UL << maxred);
    mpf_pow_ui(expn, expn, 1UL << k);
  }
  mpf_clear(t);
}


/* Negative b with non-int x usually gives a complex result.
 * We try to at least give a consistent result. */

void mpf_pow(mpf_t powx, mpf_t b, mpf_t x)
{
  mpf_t t;
  int neg = 0;

  if (mpf_sgn(b) == 0) { mpf_set_ui(powx, 0); return; }
  if (mpf_sgn(b) <  0) { neg = 1; }
  if (mpf_cmp_ui(b,1) == 0) { mpf_set_ui(powx, 1-2*neg); return; }

  if (mpf_integer_p(x) && mpf_fits_ulong_p(x)) {
    mpf_pow_ui(powx, b, mpf_get_ui(x));
    return;
  }

  if (neg) mpf_neg(b,b);
  mpf_init2(t, mpf_get_prec(powx));
  mpf_log(t, b);
  mpf_mul(t, t, x);
  mpf_exp(powx, t);
  if (neg) mpf_neg(powx,powx);
  mpf_clear(t);
}

void mpf_agm(mpf_t r, mpf_t a, mpf_t b)
{
  mpf_t t;
  unsigned long bits = mpf_get_prec(r);

  if (mpf_cmp(a,b) > 0) mpf_swap(a,b);

  mpf_init2(t, 6+bits);
  while (1) {
    mpf_sub(t, b, a);
    mpf_abs(t, t);
    mpf_mul_2exp(t, t, bits);
    mpf_sub_ui(t,t,1);
    if (mpf_sgn(t) < 0)
      break;
    mpf_sub_ui(t,t,1);
    mpf_set(t, a);
    mpf_add(a, a, b);
    mpf_div_2exp(a, a, 1);
    mpf_mul(b, b, t);
    mpf_sqrt(b, b);
  }
  mpf_set(r, b);
  mpf_clear(t);
}

/******************************************************************************/


#if 0
/* Simple polynomial multiplication */
void poly_mod_mul(mpz_t* px, mpz_t* py, mpz_t* ptmp, UV r, mpz_t mod)
{
  UV i, j, prindex;

  for (i = 0; i < r; i++)
    mpz_set_ui(ptmp[i], 0);
  for (i = 0; i < r; i++) {
    if (!mpz_sgn(px[i])) continue;
    for (j = 0; j < r; j++) {
      if (!mpz_sgn(py[j])) continue;
      prindex = (i+j) % r;
      mpz_addmul( ptmp[prindex], px[i], py[j] );
    }
  }
  /* Put ptmp into px and mod n */
  for (i = 0; i < r; i++)
    mpz_mod(px[i], ptmp[i], mod);
}
void poly_mod_sqr(mpz_t* px, mpz_t* ptmp, UV r, mpz_t mod)
{
  UV i, d, s;
  UV degree = r-1;

  for (i = 0; i < r; i++)
    mpz_set_ui(ptmp[i], 0);
  for (d = 0; d <= 2*degree; d++) {
    UV prindex = d % r;
    for (s = (d <= degree) ? 0 : d-degree; s <= (d/2); s++) {
      if (s*2 == d) {
        mpz_addmul( ptmp[prindex], px[s], px[s] );
      } else {
        mpz_addmul( ptmp[prindex], px[s], px[d-s] );
        mpz_addmul( ptmp[prindex], px[s], px[d-s] );
      }
    }
  }
  /* Put ptmp into px and mod n */
  for (i = 0; i < r; i++)
    mpz_mod(px[i], ptmp[i], mod);
}
#endif

#if (__GNU_MP_VERSION < 4) || (__GNU_MP_VERSION == 4 && __GNU_MP_VERSION_MINOR < 1)
/* Binary segmentation, using simple shift+add method for processing p.
 * Faster than twiddling bits, but not nearly as fast as import/export.
 * mpz_import and mpz_export were added in GMP 4.1 released in 2002.
 */
void poly_mod_mul(mpz_t* px, mpz_t* py, UV r, mpz_t mod, mpz_t p, mpz_t p2, mpz_t t)
{
  UV i, d, bits;
  UV degree = r-1;

  mpz_mul(t, mod, mod);
  mpz_mul_ui(t, t, r);
  bits = mpz_sizeinbase(t, 2);

  mpz_set_ui(p, 0);
  for (i = 0; i < r; i++) {
    mpz_mul_2exp(p, p, bits);
    mpz_add(p, p, px[r-i-1]);
  }

  if (px == py) {
    mpz_mul(p, p, p);
  } else {
    mpz_set_ui(p2, 0);
    for (i = 0; i < r; i++) {
      mpz_mul_2exp(p2, p2, bits);
      mpz_add(p2, p2, py[r-i-1]);
    }
    mpz_mul(p, p, p2);
  }

  for (d = 0; d <= 2*degree; d++) {
    mpz_tdiv_r_2exp(t, p, bits);
    mpz_tdiv_q_2exp(p, p, bits);
    if (d < r)
      mpz_set(px[d], t);
    else
      mpz_add(px[d-r], px[d-r], t);
  }
  for (i = 0; i < r; i++)
    mpz_mod(px[i], px[i], mod);
}

#else

/* Binary segmentation, using import/export method for processing p.
 * Thanks to Dan Bernstein's 2007 Quartic paper.
 */
void poly_mod_mul(mpz_t* px, mpz_t* py, UV r, mpz_t mod, mpz_t p, mpz_t p2, mpz_t t)
{
  UV i, bytes;
  char* s;

  mpz_mul(t, mod, mod);
  mpz_mul_ui(t, t, r);
  bytes = mpz_sizeinbase(t, 256);
  mpz_set_ui(p, 0);
  mpz_set_ui(p2, 0);

  /* 1. Create big integers from px and py with padding. */
  {
    Newz(0, s, r*bytes, char);
    for (i = 0; i < r; i++)
      mpz_export(s + i*bytes, NULL, -1, 1, 0, 0, px[i]);
    mpz_import(p, r*bytes, -1, 1, 0, 0, s);
    Safefree(s);
  }
  if (px != py) {
    Newz(0, s, r*bytes, char);
    for (i = 0; i < r; i++)
      mpz_export(s + i*bytes, NULL, -1, 1, 0, 0, py[i]);
    mpz_import(p2, r*bytes, -1, 1, 0, 0, s);
    Safefree(s);
  }

  /* 2. Multiply using the awesomeness that is GMP. */
  mpz_mul( p, p, (px == py) ? p : p2 );

  /* 3. Pull out the parts of the result, add+mod, and put in px. */
  {
    Newz(0, s, 2*r*bytes, char);
    /* fill s with data from p */
    mpz_export(s, NULL, -1, 1, 0, 0, p);
    for (i = 0; i < r; i++) {
      /* Set px[i] to the top part, t to the bottom. */
      mpz_import(px[i], bytes, -1, 1, 0, 0, s + (i+r)*bytes);
      mpz_import(t,     bytes, -1, 1, 0, 0, s +     i*bytes);
      /* Add and mod */
      mpz_add(px[i], px[i], t);
      mpz_mod(px[i], px[i], mod);
    }
    Safefree(s);
  }
}
#endif

void poly_mod_pow(mpz_t *pres, mpz_t *pn, mpz_t power, UV r, mpz_t mod)
{
  UV i;
  mpz_t mpow, t1, t2, t3;

  for (i = 0; i < r; i++)
    mpz_set_ui(pres[i], 0);
  mpz_set_ui(pres[0], 1);

  mpz_init_set(mpow, power);
  mpz_init(t1);  mpz_init(t2);  mpz_init(t3);

  while (mpz_cmp_ui(mpow, 0) > 0) {
    if (mpz_odd_p(mpow))            poly_mod_mul(pres, pn, r, mod, t1, t2, t3);
    mpz_tdiv_q_2exp(mpow, mpow, 1);
    if (mpz_cmp_ui(mpow, 0) > 0)    poly_mod_mul(pn, pn, r, mod, t1, t2, t3);
  }
  mpz_clear(t1);  mpz_clear(t2);  mpz_clear(t3);
  mpz_clear(mpow);
}

void poly_mod(mpz_t *pres, mpz_t *pn, UV *dn, mpz_t mod)
{
  UV i;
  for (i = 0; i < *dn; i++) {
    mpz_mod(pres[i], pn[i], mod);
  }
  while (*dn > 0 && mpz_sgn(pres[*dn-1]) == 0)
    *dn -= 1;
}
void polyz_mod(mpz_t *pres, mpz_t *pn, long *dn, mpz_t mod)
{
  long i;
  for (i = 0; i <= *dn; i++) {
    mpz_mod(pres[i], pn[i], mod);
  }
  while (*dn > 0 && mpz_sgn(pres[*dn]) == 0)
    *dn -= 1;
}

void polyz_set(mpz_t* pr, long* dr, mpz_t* ps, long ds)
{
  *dr = ds;
  do {
    mpz_set(pr[ds], ps[ds]);
  } while (ds-- > 0);
}

void polyz_print(const char* header, mpz_t* pn, long dn)
{
  gmp_printf("%s", header);
  do { gmp_printf("%Zd ", pn[dn]); } while (dn-- > 0);
  gmp_printf("\n");
}

/* Multiply polys px and py to create new poly pr, all modulo 'mod' */
#if 0
void polyz_mulmod(mpz_t* pr, mpz_t* px, mpz_t *py, long *dr, long dx, long dy, mpz_t mod)
{
  long i, j;
  *dr = dx + dy;
  for (i = 0; i <= *dr; i++)
    mpz_set_ui(pr[i], 0);
  for (i = 0; i <= dx; i++) {
    if (!mpz_sgn(px[i])) continue;
    for (j = 0; j <= dy; j++) {
      if (!mpz_sgn(py[j])) continue;
      mpz_addmul( pr[i+j], px[i], py[j] );
    }
  }
  for (i = 0; i <= *dr; i++)
    mpz_mod(pr[i], pr[i], mod);
  while (*dr > 0 && mpz_sgn(pr[*dr]) == 0)  dr[0]--;
}
#endif
#if 1
void polyz_mulmod(mpz_t* pr, mpz_t* px, mpz_t *py, long *dr, long dx, long dy, mpz_t mod)
{
  UV i, bits, r;
  mpz_t p, p2, t;

  mpz_init(p); mpz_init(t);
  *dr = dx+dy;
  r = *dr+1;
  mpz_mul(t, mod, mod);
  mpz_mul_ui(t, t, r);
  bits = mpz_sizeinbase(t, 2);
  mpz_set_ui(p, 0);

  /* Create big integers p and p2 from px and py, with padding */
  {
    for (i = 0; i <= (UV)dx; i++) {
      mpz_mul_2exp(p, p, bits);
      mpz_add(p, p, px[dx-i]);
    }
  }
  if (px == py) {
    mpz_pow_ui(p, p, 2);
  } else {
    mpz_init_set_ui(p2, 0);
    for (i = 0; i <= (UV)dy; i++) {
      mpz_mul_2exp(p2, p2, bits);
      mpz_add(p2, p2, py[dy-i]);
    }
    mpz_mul(p, p, p2);
    mpz_clear(p2);
  }

  /* Pull out parts of result p to pr */
  for (i = 0; i < r; i++) {
    mpz_tdiv_r_2exp(t, p, bits);
    mpz_tdiv_q_2exp(p, p, bits);
    mpz_mod(pr[i], t, mod);
  }

  mpz_clear(p); mpz_clear(t);
}
#endif
#if 0
void polyz_mulmod(mpz_t* pr, mpz_t* px, mpz_t *py, long *dr, long dx, long dy, mpz_t mod)
{
  UV i, bytes, r;
  char* s;
  mpz_t p, p2, t;

  mpz_init(p); mpz_init(p2); mpz_init(t);
  *dr = dx+dy;
  r = *dr+1;
  mpz_mul(t, mod, mod);
  mpz_mul_ui(t, t, r);
  bytes = mpz_sizeinbase(t, 256);
  mpz_set_ui(p, 0);
  mpz_set_ui(p2, 0);

  /* Create big integers p and p2 from px and py, with padding */
  {
    Newz(0, s, (dx+1)*bytes, char);
    for (i = 0; i <= dx; i++)
      mpz_export(s + i*bytes, NULL, -1, 1, 0, 0, px[i]);
    mpz_import(p, (dx+1)*bytes, -1, 1, 0, 0, s);
    Safefree(s);
  }
  if (px != py) {
    Newz(0, s, (dy+1)*bytes, char);
    for (i = 0; i <= dy; i++)
      mpz_export(s + i*bytes, NULL, -1, 1, 0, 0, py[i]);
    mpz_import(p2, (dy+1)*bytes, -1, 1, 0, 0, s);
    Safefree(s);
  }

  /* Multiply! */
  mpz_mul( p ,p, (px == py) ? p : p2 );

  /* Pull out parts of result p to pr */
  {
    Newz(0, s, r*bytes, char);
    /* fill s with data from p */
    mpz_export(s, NULL, -1, 1, 0, 0, p);
    for (i = 0; i < r; i++) {
      mpz_import(t,     bytes, -1, 1, 0, 0, s +     i*bytes);
      mpz_mod(pr[i], t, mod);
    }
    Safefree(s);
  }

  mpz_clear(p); mpz_clear(p2); mpz_clear(t);
}
#endif

/* Polynomial division modulo N.
 * This is Cohen algorithm 3.1.2 "pseudo-division". */
void polyz_div(mpz_t *pq, mpz_t *pr, mpz_t *pn, mpz_t *pd,
               long *dq, long *dr, long dn, long dd, mpz_t NMOD)
{
  long i, j;
  mpz_t t;

  /* Ensure n and d are reduced */
  while (dn > 0 && mpz_sgn(pn[dn]) == 0)   dn--;
  while (dd > 0 && mpz_sgn(pd[dd]) == 0)   dd--;
  if (dd == 0 && mpz_sgn(pd[0]) == 0)
    croak("polyz_divmod: divide by zero\n");

  /* Q = 0 */
  *dq = 0;
  mpz_set_ui(pq[0], 0);

  /* R = N */
  *dr = dn;
  for (i = 0; i <= dn; i++)
    mpz_set(pr[i], pn[i]);  /* pn should already be mod */

  if (*dr < dd)
    return;
  if (dd == 0) {
    *dq = 0; *dr = 0;
    mpz_tdiv_qr( pq[0], pr[0], pn[0], pd[0] );
    return;
  }

  *dq = dn - dd;
  *dr = dd-1;

  if (mpz_cmp_ui(pd[dd], 1) == 0) {
    for (i = *dq; i >= 0; i--) {
      long di = dd + i;
      mpz_set(pq[i], pr[di]);
      for (j = di-1; j >= i; j--) {
        mpz_submul(pr[j], pr[di], pd[j-i]);
        mpz_mod(pr[j], pr[j], NMOD);
      }
    }
  } else {
    mpz_init(t);
    for (i = *dq; i >= 0; i--) {
      long di = dd + i;
      mpz_powm_ui(t, pd[dd], i, NMOD);
      mpz_mul(t, t, pr[di]);
      mpz_mod(pq[i], t, NMOD);
      for (j = di-1; j >= 0; j--) {
        mpz_mul(pr[j], pr[j], pd[dd]);   /* j != di so this is safe */
        if (j >= i)
          mpz_submul(pr[j], pr[di], pd[j-i]);
        mpz_mod(pr[j], pr[j], NMOD);
      }
    }
    mpz_clear(t);
  }
  /* Reduce R and Q. */
  while (*dr > 0 && mpz_sgn(pr[*dr]) == 0)  dr[0]--;
  while (*dq > 0 && mpz_sgn(pq[*dq]) == 0)  dq[0]--;
}

/* Raise poly pn to the power, modulo poly pmod and coefficient NMOD. */
void polyz_pow_polymod(mpz_t* pres,  mpz_t* pn,  mpz_t* pmod,
                              long *dres,   long   dn,  long   dmod,
                              mpz_t power, mpz_t NMOD)
{
  mpz_t mpow;
  long dProd, dQ, dX, maxd, i;
  mpz_t *pProd, *pQ, *pX;

  /* Product = res*x.  With a prediv this would be dmod+dmod, but without it
   * is max(dmod,dn)+dmod. */
  maxd = (dn > dmod) ? dn+dmod : dmod+dmod;
  New(0, pProd, maxd+1, mpz_t);
  New(0, pQ, maxd+1, mpz_t);
  New(0, pX, maxd+1, mpz_t);
  for (i = 0; i <= maxd; i++) {
    mpz_init(pProd[i]);
    mpz_init(pQ[i]);
    mpz_init(pX[i]);
  }

  *dres = 0;
  mpz_set_ui(pres[0], 1);

  dX = dn;
  for (i = 0; i <= dX; i++)
    mpz_set(pX[i], pn[i]);

  mpz_init_set(mpow, power);
  while (mpz_cmp_ui(mpow, 0) > 0) {
    if (mpz_odd_p(mpow)) {
      polyz_mulmod(pProd, pres, pX, &dProd, *dres, dX, NMOD);
      polyz_div(pQ, pres,  pProd, pmod,  &dQ, dres, dProd, dmod, NMOD);
    }
    mpz_tdiv_q_2exp(mpow, mpow, 1);
    if (mpz_cmp_ui(mpow, 0) > 0) {
      polyz_mulmod(pProd, pX, pX, &dProd, dX, dX, NMOD);
      polyz_div(pQ, pX,  pProd, pmod,  &dQ, &dX, dProd, dmod, NMOD);
    }
  }
  mpz_clear(mpow);
  for (i = 0; i <= maxd; i++) {
    mpz_clear(pProd[i]);
    mpz_clear(pQ[i]);
    mpz_clear(pX[i]);
  }
  Safefree(pProd);
  Safefree(pQ);
  Safefree(pX);
}

void polyz_gcd(mpz_t* pres, mpz_t* pa, mpz_t* pb, long* dres, long da, long db, mpz_t MODN)
{
  long i;
  long dr1, dq, dr, maxd;
  mpz_t *pr1, *pq, *pr;

  /* Early reduction so we're not wasteful with messy input. */
  while (da > 0 && mpz_sgn(pa[da]) == 0)   da--;
  while (db > 0 && mpz_sgn(pb[db]) == 0)   db--;

  /* Swap a/b so degree a >= degree b */
  if (db > da) {
    mpz_t* mtmp;
    long ltmp;
    mtmp = pa; pa = pb; pb = mtmp;
    ltmp = da; da = db; db = ltmp;
  }
  /* TODO: Should set c=pa[da], then after Euclid loop, res = c^-1 * res */

  /* Allocate temporary polys */
  maxd = da;
  New(0, pr1, maxd+1, mpz_t);
  New(0, pq, maxd+1, mpz_t);
  New(0, pr, maxd+1, mpz_t);
  for (i = 0; i <= maxd; i++) {
    mpz_init(pr1[i]);
    mpz_init(pq[i]);
    mpz_init(pr[i]);
  }

  /* R0 = a (we use pres) */
  *dres = da;
  for (i = 0; i <= da; i++)
    mpz_mod(pres[i], pa[i], MODN);
  while (*dres > 0 && mpz_sgn(pres[*dres]) == 0)   dres[0]--;

  /* R1 = b */
  dr1 = db;
  for (i = 0; i <= db; i++)
    mpz_mod(pr1[i], pb[i], MODN);
  while (dr1 > 0 && mpz_sgn(pr1[dr1]) == 0)   dr1--;

  while (dr1 > 0 || mpz_sgn(pr1[dr1]) != 0) {
    polyz_div(pq, pr,  pres, pr1,  &dq, &dr,  *dres, dr1, MODN);
    if (dr < 0 || dq < 0 || dr > maxd || dq > maxd)
      croak("division error: dq %ld dr %ld maxd %ld\n", dq, dr, maxd);
    /* pr0 = pr1.  pr1 = pr */
    *dres = dr1;
    for (i = 0; i <= dr1; i++)
      mpz_set(pres[i], pr1[i]);  /* pr1 is mod MODN already */
    dr1 = dr;
    for (i = 0; i <= dr; i++)
      mpz_set(pr1[i], pr[i]);
  }
  /* return pr0 */
  while (*dres > 0 && mpz_sgn(pres[*dres]) == 0)   dres[0]--;

  for (i = 0; i <= maxd; i++) {
    mpz_clear(pr1[i]);
    mpz_clear(pq[i]);
    mpz_clear(pr[i]);
  }
  Safefree(pr1);
  Safefree(pq);
  Safefree(pr);
}



void polyz_root_deg1(mpz_t root, mpz_t* pn, mpz_t NMOD)
{
  mpz_invert(root, pn[1], NMOD);
  mpz_mul(root, root, pn[0]);
  mpz_neg(root, root);
  mpz_mod(root, root, NMOD);
}
void polyz_root_deg2(mpz_t root1, mpz_t root2, mpz_t* pn, mpz_t NMOD)
{
  mpz_t e, d, t, t2, t3, t4;

  mpz_init(e); mpz_init(d);
  mpz_init(t); mpz_init(t2); mpz_init(t3); mpz_init(t4);

  mpz_mul(t, pn[0], pn[2]);
  mpz_mul_ui(t, t, 4);
  mpz_mul(d, pn[1], pn[1]);
  mpz_sub(d, d, t);
  sqrtmod_t(e, d, NMOD, t, t2, t3, t4);

  mpz_neg(t4, pn[1]);                    /* t4 = -a_1      */
  mpz_mul_ui(t3, pn[2], 2);              /* t3 = 2a_2      */
  mpz_add(t, t4, e);                     /* t = -a_1 + e   */
  mpz_divmod( root1, t, t3, NMOD, t2);   /* (-a_1+e)/2a_2  */
  mpz_sub(t, t4, e);                     /* t = -a_1 - e   */
  mpz_divmod( root2, t, t3, NMOD, t2);   /* (-a_1-e)/2a_2  */
  mpz_clear(e); mpz_clear(d);
  mpz_clear(t); mpz_clear(t2); mpz_clear(t3); mpz_clear(t4);
}

/* Algorithm 2.3.10 procedure "roots" from Crandall & Pomerance.
 * Step 3/4 of Cohen Algorithm 1.6.1.
 * Uses some hints from Pate Williams (1997-1998) for the poly math */
static void polyz_roots(mpz_t* roots, long *nroots,
                        long maxroots, mpz_t* pg, long dg, mpz_t NMOD)
{
  long i, ntries, maxtries, maxd, dxa, dt, dh, dq, dup;
  mpz_t t, power;
  mpz_t pxa[2];
  mpz_t *pt, *ph, *pq;

  if (*nroots >= maxroots || dg <= 0)  return;

  mpz_init(t);
  mpz_init(pxa[0]);  mpz_init(pxa[1]);

  if (dg <= 2) {
    if (dg == 1) polyz_root_deg1( pxa[0], pg, NMOD );
    else         polyz_root_deg2( pxa[0], pxa[1], pg, NMOD );
    for (dt = 0; dt < dg; dt++) {
      mpz_set(t, pxa[dt]);
      dup = 0; /* don't add duplicate roots */
      for (i = 0; i < *nroots; i++)
        if (mpz_cmp(t, roots[i]) == 0)
          { dup = 1; break; }
      if (!dup)
        mpz_set(roots[ (*nroots)++ ], t);
    }
    mpz_clear(t);
    mpz_clear(pxa[0]);  mpz_clear(pxa[1]);
    return;
  }

  /* If not a monic polynomial, divide by leading coefficient */
  if (mpz_cmp_ui(pg[dg], 1) != 0) {
    if (!mpz_invert(t, pg[dg], NMOD)) {
      mpz_clear(t);
      return;
    }
    for (i = 0; i <= dg; i++)
      mpz_mulmod(pg[i], pg[i], t, NMOD, pg[i]);
  }

  /* Try hard to find a single root, work less after we got one or two.
   * In a generic routine we would want to try hard all the time. */
  ntries = 0;
  maxtries = (*nroots == 0) ? 200 : (*nroots == 1) ? 50 : 10;

  mpz_init(power);
  mpz_set_ui(pxa[1], 1);
  dxa = 1;

  maxd = 2 * dg;
  New(0, pt, maxd+1, mpz_t);
  New(0, ph, maxd+1, mpz_t);
  New(0, pq, maxd+1, mpz_t);
  for (i = 0; i <= maxd; i++) {
    mpz_init(pt[i]);
    mpz_init(ph[i]);
    mpz_init(pq[i]);
  }

  mpz_sub_ui(t, NMOD, 1);
  mpz_tdiv_q_2exp(power, t, 1);
  /* We'll pick random "a" values from 1 to 1000M */
  mpz_set_ui(t, 1000000000UL);
  if (mpz_cmp(t, NMOD) > 0) mpz_set(t, NMOD);

  while (ntries++ < maxtries) {
    /* pxa = X+a for randomly selected a */
    if (ntries <= 2)  mpz_set_ui(pxa[0], ntries);  /* Simple small values */
    else              mpz_isaac_urandomm(pxa[0], t);

    /* Raise pxa to (NMOD-1)/2, all modulo NMOD and g(x) */
    polyz_pow_polymod(pt, pxa, pg, &dt, dxa, dg, power, NMOD);

    /* Subtract 1 and gcd */
    mpz_sub_ui(pt[0], pt[0], 1);
    polyz_gcd(ph, pt, pg, &dh, dt, dg, NMOD);

    if (dh >= 1 && dh < dg)
      break;
  }

  if (dh >= 1 && dh < dg) {
    /* Pick the smaller of the two splits to process first */
    if (dh <= 2 || dh <= (dg-dh)) {
      polyz_roots(roots, nroots, maxroots, ph, dh, NMOD);
      if (*nroots < maxroots) {
        /* q = g/h, and recurse */
        polyz_div(pq, pt,  pg, ph,  &dq, &dt, dg, dh, NMOD);
        polyz_roots(roots, nroots, maxroots, pq, dq, NMOD);
      }
    } else {
      polyz_div(pq, pt,  pg, ph,  &dq, &dt, dg, dh, NMOD);
      polyz_roots(roots, nroots, maxroots, pq, dq, NMOD);
      if (*nroots < maxroots) {
        polyz_roots(roots, nroots, maxroots, ph, dh, NMOD);
      }
    }
  }

  mpz_clear(t);
  mpz_clear(power);
  mpz_clear(pxa[0]);  mpz_clear(pxa[1]);

  for (i = 0; i <= maxd; i++) {
    mpz_clear(pt[i]);
    mpz_clear(ph[i]);
    mpz_clear(pq[i]);
  }
  Safefree(pt);
  Safefree(ph);
  Safefree(pq);
}


/* Algorithm 1.6.1 from Cohen, minus step 1. */
void polyz_roots_modp(mpz_t** roots, long *nroots, long maxroots,
                      mpz_t *pP, long dP, mpz_t NMOD)
{
  long i;

  *nroots = 0;
  *roots = 0;

  if (dP == 0)
    return;

  /* Do degree 1 or 2 explicitly */
  if (dP == 1) {
    New(0, *roots, 1, mpz_t);
    mpz_init((*roots)[0]);
    polyz_root_deg1( (*roots)[0], pP, NMOD );
    *nroots = 1;
    return;
  }
  if (dP == 2) {
    New(0, *roots, 2, mpz_t);
    mpz_init((*roots)[0]);
    mpz_init((*roots)[1]);
    polyz_root_deg2( (*roots)[0], (*roots)[1], pP, NMOD );
    *nroots = 2;
    return;
  }

  /* Allocate space for the maximum number of roots (plus 1 for safety) */
  New(0, *roots, dP+1, mpz_t);
  for (i = 0; i <= dP; i++)
    mpz_init((*roots)[i]);

  if (maxroots > dP || maxroots == 0)
    maxroots = dP;

  polyz_roots(*roots, nroots, maxroots, pP, dP, NMOD);
  /* This could be just really bad luck.  Let the caller handle it. */
  /* if (*nroots == 0) croak("failed to find roots\n"); */

  /* Clear out space for roots we didn't find */
  for (i = *nroots; i <= dP; i++)
    mpz_clear((*roots)[i]);
}


#include "class_poly_data.h"

int* poly_class_nums(void)
{
  int* dlist;
  UV i;
  int degree_offset[256] = {0};

  for (i = 1; i < NUM_CLASS_POLYS; i++)
    if (_class_poly_data[i].D < _class_poly_data[i-1].D)
      croak("Problem with data file, out of order at D=%d\n", (int)_class_poly_data[i].D);

  Newz(0, dlist, NUM_CLASS_POLYS + 1, int);
  /* init degree_offset to total number of this degree */
  for (i = 0; i < NUM_CLASS_POLYS; i++)
    degree_offset[_class_poly_data[i].degree]++;
  /* set degree_offset to sum of this and all previous degrees. */
  for (i = 1; i < 256; i++)
    degree_offset[i] += degree_offset[i-1];
  /* Fill in dlist, sorted */
  for (i = 0; i < NUM_CLASS_POLYS; i++) {
    int position = degree_offset[_class_poly_data[i].degree-1]++;
    dlist[position] = i+1;
  }
  /* Null terminate */
  dlist[NUM_CLASS_POLYS] = 0;
  return dlist;
}

UV poly_class_poly_num(int i, int *D, mpz_t**T, int* type)
{
  UV degree, j;
  int ctype;
  mpz_t t;
  const char* s;

  if (i < 1 || i > (int)NUM_CLASS_POLYS) { /* Invalid number */
     if (D != 0) *D = 0;
     if (T != 0) *T = 0;
     return 0;
  }
  i--; /* i now is the index into our table */

  degree = _class_poly_data[i].degree;
  ctype  = _class_poly_data[i].type;
  s = _class_poly_data[i].coefs;

  if (D != 0)  *D = -_class_poly_data[i].D;
  if (type != 0)  *type = ctype;
  if (T == 0) return degree;

  New(0, *T, degree+1, mpz_t);
  mpz_init(t);
  for (j = 0; j < degree; j++) {
    unsigned char signcount = (*s++) & 0xFF;
    unsigned char sign = signcount >> 7;
    unsigned long count = signcount & 0x7F;
    if (count == 127) {
      do {
        signcount = (*s++) & 0xFF;
        count += signcount;
      } while (signcount == 127);
    }
    mpz_set_ui(t, 0);
    while (count-- > 0) {
      mpz_mul_2exp(t, t, 8);
      mpz_add_ui(t, t, (unsigned long) (*s++) & 0xFF);
    }
    /* Cube the last coefficient of Hilbert polys */
    if (j == 0 && ctype == 1) mpz_pow_ui(t, t, 3);
    if (sign) mpz_neg(t, t);
    mpz_init_set( (*T)[j], t );
  }
  mpz_clear(t);
  mpz_init_set_ui( (*T)[degree], 1 );
  return degree;
}