ce54336801
notes: - support for thumb-less ARM chips was in our port of 5.1.3, but a similar method has been provided upstream now - someone should look at the AVX failure reports, and fix them Changes between GMP version 6.1.0 and 6.1.1 FEATURES * Work around faulty cpuid on some recent Intel chips (this allows GMP to run on Skylake Pentiums). * Support thumb-less ARM chips. Changes between GMP version 6.0.* and 6.1.0 BUGS FIXED * The public function mpn_com is now correctly declared in gmp.h. * Healed possible failures of mpn_sec_sqr for non-cryptographic sizes for some obsolete CPUs. * Various problems related to precision for mpf have been fixed. * Fixed ABI incompatible stack alignment in calls from assembly code. * Fixed PIC bug in popcount affecting Intel processors using the 32-bit ABI. SPEEDUPS * Speedup for Intel Broadwell and Skylake through assembly code making use of new ADX instructions. * Square root is now faster when the remainder is not needed. Also the speed to compute the k-th root improved, for small sizes. FEATURES * New C++ functions gcd and lcm for mpz_class. * New public mpn functions mpn_divexact_1, mpn_zero_p, and mpn_cnd_swap. * New public mpq_cmp_z function, to efficiently compare rationals with integers. * Support for more 32-bit arm processors. * Support for AVX-less modern x86 CPUs. (Such support might be missing either because the CPU vendor chose to disable AVX, or because the running kernel lacks AVX context switch support.) * Support for NetBSD under Xen; we switch off AVX unconditionally under NetBSD since a bug in NetBSD makes AVX fail under Xen. MISC * Tuned values for FFT multiplications are provided for larger number on many platforms. Changes between GMP version 5.1.* and 6.0.0 BUGS FIXED * The function mpz_invert now considers any number invertible in Z/1Z. * The mpn multiply code now handles operands of more than 2^31 limbs correctly. (Note however that the mpz code is limited to 2^32 bits on 32-bit hosts and 2^37 bits on 64-bit hosts.) SPEEDUPS * Plain division of large operands is faster and more monotonous in operand size. * Major speedup for ARM, in particular ARM Cortex-A15, thanks to improved assembly. * Speedup for Intel Sandy Bridge, Ivy Bridge, Haswell, thanks to rewritten and vastly expanded assembly support. Speedup also for the older Core 2 and Nehalem. * Faster mixed arithmetic between mpq_class and double. FEATURES * Support for new Intel and AMD CPUs. * New public functions mpn_sec_mul and mpn_sec_sqr, implementing side-channel silent multiplication and squaring. * New public functions mpn_sec_div_qr and mpn_sec_div_r, implementing side-channel silent division. * New public functions mpn_cnd_add_n and mpn_cnd_sub_n. Side-channel silent conditional addition and subtraction. * New public function mpn_sec_powm, implementing side-channel silent modexp. * New public function mpn_sec_invert, implementing side-channel silent modular inversion. * Better support for applications which use the mpz_t type, but nevertheless need to call some of the lower-level mpn functions. See the documentation for mpz_limbs_read and related functions.
586 lines
17 KiB
C
586 lines
17 KiB
C
/* Generate perfect square testing data.
|
|
|
|
Copyright 2002-2004, 2012, 2014 Free Software Foundation, Inc.
|
|
|
|
This file is part of the GNU MP Library.
|
|
|
|
The GNU MP Library is free software; you can redistribute it and/or modify
|
|
it under the terms of either:
|
|
|
|
* the GNU Lesser General Public License as published by the Free
|
|
Software Foundation; either version 3 of the License, or (at your
|
|
option) any later version.
|
|
|
|
or
|
|
|
|
* the GNU General Public License as published by the Free Software
|
|
Foundation; either version 2 of the License, or (at your option) any
|
|
later version.
|
|
|
|
or both in parallel, as here.
|
|
|
|
The GNU MP Library is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
for more details.
|
|
|
|
You should have received copies of the GNU General Public License and the
|
|
GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
see https://www.gnu.org/licenses/. */
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "bootstrap.c"
|
|
|
|
|
|
/* The aim of this program is to choose either mpn_mod_34lsub1 or mpn_mod_1
|
|
(plus a PERFSQR_PP modulus), and generate tables indicating quadratic
|
|
residues and non-residues modulo small factors of that modulus.
|
|
|
|
For the usual 32 or 64 bit cases mpn_mod_34lsub1 gets used. That
|
|
function exists specifically because 2^24-1 and 2^48-1 have nice sets of
|
|
prime factors. For other limb sizes it's considered, but if it doesn't
|
|
have good factors then mpn_mod_1 will be used instead.
|
|
|
|
When mpn_mod_1 is used, the modulus PERFSQR_PP is created from a
|
|
selection of small primes, chosen to fill PERFSQR_MOD_BITS of a limb,
|
|
with that bit count chosen so (2*GMP_LIMB_BITS)*2^PERFSQR_MOD_BITS <=
|
|
GMP_LIMB_MAX, allowing PERFSQR_MOD_IDX in mpn/generic/perfsqr.c to do its
|
|
calculation within a single limb.
|
|
|
|
In either case primes can be combined to make divisors. The table data
|
|
then effectively indicates remainders which are quadratic residues mod
|
|
all the primes. This sort of combining reduces the number of steps
|
|
needed after mpn_mod_34lsub1 or mpn_mod_1, saving code size and time.
|
|
Nothing is gained or lost in terms of detections, the same total fraction
|
|
of non-residues will be identified.
|
|
|
|
Nothing particularly sophisticated is attempted for combining factors to
|
|
make divisors. This is probably a kind of knapsack problem so it'd be
|
|
too hard to attempt anything completely general. For the usual 32 and 64
|
|
bit limbs we get a good enough result just pairing the biggest and
|
|
smallest which fit together, repeatedly.
|
|
|
|
Another aim is to get powerful combinations, ie. divisors which identify
|
|
biggest fraction of non-residues, and have those run first. Again for
|
|
the usual 32 and 64 bits it seems good enough just to pair for big
|
|
divisors then sort according to the resulting fraction of non-residues
|
|
identified.
|
|
|
|
Also in this program, a table sq_res_0x100 of residues modulo 256 is
|
|
generated. This simply fills bits into limbs of the appropriate
|
|
build-time GMP_LIMB_BITS each.
|
|
|
|
*/
|
|
|
|
|
|
/* Normally we aren't using const in gen*.c programs, so as not to have to
|
|
bother figuring out if it works, but using it with f_cmp_divisor and
|
|
f_cmp_fraction avoids warnings from the qsort calls. */
|
|
|
|
/* Same tests as gmp.h. */
|
|
#if defined (__STDC__) \
|
|
|| defined (__cplusplus) \
|
|
|| defined (_AIX) \
|
|
|| defined (__DECC) \
|
|
|| (defined (__mips) && defined (_SYSTYPE_SVR4)) \
|
|
|| defined (_MSC_VER) \
|
|
|| defined (_WIN32)
|
|
#define HAVE_CONST 1
|
|
#endif
|
|
|
|
#if ! HAVE_CONST
|
|
#define const
|
|
#endif
|
|
|
|
|
|
mpz_t *sq_res_0x100; /* table of limbs */
|
|
int nsq_res_0x100; /* elements in sq_res_0x100 array */
|
|
int sq_res_0x100_num; /* squares in sq_res_0x100 */
|
|
double sq_res_0x100_fraction; /* sq_res_0x100_num / 256 */
|
|
|
|
int mod34_bits; /* 3*GMP_NUMB_BITS/4 */
|
|
int mod_bits; /* bits from PERFSQR_MOD_34 or MOD_PP */
|
|
int max_divisor; /* all divisors <= max_divisor */
|
|
int max_divisor_bits; /* ceil(log2(max_divisor)) */
|
|
double total_fraction; /* of squares */
|
|
mpz_t pp; /* product of primes, or 0 if mod_34lsub1 used */
|
|
mpz_t pp_norm; /* pp shifted so NUMB high bit set */
|
|
mpz_t pp_inverted; /* invert_limb style inverse */
|
|
mpz_t mod_mask; /* 2^mod_bits-1 */
|
|
char mod34_excuse[128]; /* why mod_34lsub1 not used (if it's not) */
|
|
|
|
/* raw list of divisors of 2^mod34_bits-1 or pp, just to show in a comment */
|
|
struct rawfactor_t {
|
|
int divisor;
|
|
int multiplicity;
|
|
};
|
|
struct rawfactor_t *rawfactor;
|
|
int nrawfactor;
|
|
|
|
/* factors of 2^mod34_bits-1 or pp and associated data, after combining etc */
|
|
struct factor_t {
|
|
int divisor;
|
|
mpz_t inverse; /* 1/divisor mod 2^mod_bits */
|
|
mpz_t mask; /* indicating squares mod divisor */
|
|
double fraction; /* squares/total */
|
|
};
|
|
struct factor_t *factor;
|
|
int nfactor; /* entries in use in factor array */
|
|
int factor_alloc; /* entries allocated to factor array */
|
|
|
|
|
|
int
|
|
f_cmp_divisor (const void *parg, const void *qarg)
|
|
{
|
|
const struct factor_t *p, *q;
|
|
p = (const struct factor_t *) parg;
|
|
q = (const struct factor_t *) qarg;
|
|
if (p->divisor > q->divisor)
|
|
return 1;
|
|
else if (p->divisor < q->divisor)
|
|
return -1;
|
|
else
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
f_cmp_fraction (const void *parg, const void *qarg)
|
|
{
|
|
const struct factor_t *p, *q;
|
|
p = (const struct factor_t *) parg;
|
|
q = (const struct factor_t *) qarg;
|
|
if (p->fraction > q->fraction)
|
|
return 1;
|
|
else if (p->fraction < q->fraction)
|
|
return -1;
|
|
else
|
|
return 0;
|
|
}
|
|
|
|
/* Remove array[idx] by copying the remainder down, and adjust narray
|
|
accordingly. */
|
|
#define COLLAPSE_ELEMENT(array, idx, narray) \
|
|
do { \
|
|
memmove (&(array)[idx], \
|
|
&(array)[idx+1], \
|
|
((narray)-((idx)+1)) * sizeof (array[0])); \
|
|
(narray)--; \
|
|
} while (0)
|
|
|
|
|
|
/* return n*2^p mod m */
|
|
int
|
|
mul_2exp_mod (int n, int p, int m)
|
|
{
|
|
while (--p >= 0)
|
|
n = (2 * n) % m;
|
|
return n;
|
|
}
|
|
|
|
/* return -n mod m */
|
|
int
|
|
neg_mod (int n, int m)
|
|
{
|
|
assert (n >= 0 && n < m);
|
|
return (n == 0 ? 0 : m-n);
|
|
}
|
|
|
|
/* Set "mask" to a value such that "mask & (1<<idx)" is non-zero if
|
|
"-(idx<<mod_bits)" can be a square modulo m. */
|
|
void
|
|
square_mask (mpz_t mask, int m)
|
|
{
|
|
int p, i, r, idx;
|
|
|
|
p = mul_2exp_mod (1, mod_bits, m);
|
|
p = neg_mod (p, m);
|
|
|
|
mpz_set_ui (mask, 0L);
|
|
for (i = 0; i < m; i++)
|
|
{
|
|
r = (i * i) % m;
|
|
idx = (r * p) % m;
|
|
mpz_setbit (mask, (unsigned long) idx);
|
|
}
|
|
}
|
|
|
|
void
|
|
generate_sq_res_0x100 (int limb_bits)
|
|
{
|
|
int i, res;
|
|
|
|
nsq_res_0x100 = (0x100 + limb_bits - 1) / limb_bits;
|
|
sq_res_0x100 = (mpz_t *) xmalloc (nsq_res_0x100 * sizeof (*sq_res_0x100));
|
|
|
|
for (i = 0; i < nsq_res_0x100; i++)
|
|
mpz_init_set_ui (sq_res_0x100[i], 0L);
|
|
|
|
for (i = 0; i < 0x100; i++)
|
|
{
|
|
res = (i * i) % 0x100;
|
|
mpz_setbit (sq_res_0x100[res / limb_bits],
|
|
(unsigned long) (res % limb_bits));
|
|
}
|
|
|
|
sq_res_0x100_num = 0;
|
|
for (i = 0; i < nsq_res_0x100; i++)
|
|
sq_res_0x100_num += mpz_popcount (sq_res_0x100[i]);
|
|
sq_res_0x100_fraction = (double) sq_res_0x100_num / 256.0;
|
|
}
|
|
|
|
void
|
|
generate_mod (int limb_bits, int nail_bits)
|
|
{
|
|
int numb_bits = limb_bits - nail_bits;
|
|
int i, divisor;
|
|
|
|
mpz_init_set_ui (pp, 0L);
|
|
mpz_init_set_ui (pp_norm, 0L);
|
|
mpz_init_set_ui (pp_inverted, 0L);
|
|
|
|
/* no more than limb_bits many factors in a one limb modulus (and of
|
|
course in reality nothing like that many) */
|
|
factor_alloc = limb_bits;
|
|
factor = (struct factor_t *) xmalloc (factor_alloc * sizeof (*factor));
|
|
rawfactor = (struct rawfactor_t *) xmalloc (factor_alloc * sizeof (*rawfactor));
|
|
|
|
if (numb_bits % 4 != 0)
|
|
{
|
|
strcpy (mod34_excuse, "GMP_NUMB_BITS % 4 != 0");
|
|
goto use_pp;
|
|
}
|
|
|
|
max_divisor = 2*limb_bits;
|
|
max_divisor_bits = log2_ceil (max_divisor);
|
|
|
|
if (numb_bits / 4 < max_divisor_bits)
|
|
{
|
|
/* Wind back to one limb worth of max_divisor, if that will let us use
|
|
mpn_mod_34lsub1. */
|
|
max_divisor = limb_bits;
|
|
max_divisor_bits = log2_ceil (max_divisor);
|
|
|
|
if (numb_bits / 4 < max_divisor_bits)
|
|
{
|
|
strcpy (mod34_excuse, "GMP_NUMB_BITS / 4 too small");
|
|
goto use_pp;
|
|
}
|
|
}
|
|
|
|
{
|
|
/* Can use mpn_mod_34lsub1, find small factors of 2^mod34_bits-1. */
|
|
mpz_t m, q, r;
|
|
int multiplicity;
|
|
|
|
mod34_bits = (numb_bits / 4) * 3;
|
|
|
|
/* mpn_mod_34lsub1 returns a full limb value, PERFSQR_MOD_34 folds it at
|
|
the mod34_bits mark, adding the two halves for a remainder of at most
|
|
mod34_bits+1 many bits */
|
|
mod_bits = mod34_bits + 1;
|
|
|
|
mpz_init_set_ui (m, 1L);
|
|
mpz_mul_2exp (m, m, mod34_bits);
|
|
mpz_sub_ui (m, m, 1L);
|
|
|
|
mpz_init (q);
|
|
mpz_init (r);
|
|
|
|
for (i = 3; i <= max_divisor; i+=2)
|
|
{
|
|
if (! isprime (i))
|
|
continue;
|
|
|
|
mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
|
|
if (mpz_sgn (r) != 0)
|
|
continue;
|
|
|
|
/* if a repeated prime is found it's used as an i^n in one factor */
|
|
divisor = 1;
|
|
multiplicity = 0;
|
|
do
|
|
{
|
|
if (divisor > max_divisor / i)
|
|
break;
|
|
multiplicity++;
|
|
mpz_set (m, q);
|
|
mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
|
|
}
|
|
while (mpz_sgn (r) == 0);
|
|
|
|
assert (nrawfactor < factor_alloc);
|
|
rawfactor[nrawfactor].divisor = i;
|
|
rawfactor[nrawfactor].multiplicity = multiplicity;
|
|
nrawfactor++;
|
|
}
|
|
|
|
mpz_clear (m);
|
|
mpz_clear (q);
|
|
mpz_clear (r);
|
|
}
|
|
|
|
if (nrawfactor <= 2)
|
|
{
|
|
mpz_t new_pp;
|
|
|
|
sprintf (mod34_excuse, "only %d small factor%s",
|
|
nrawfactor, nrawfactor == 1 ? "" : "s");
|
|
|
|
use_pp:
|
|
/* reset to two limbs of max_divisor, in case the mpn_mod_34lsub1 code
|
|
tried with just one */
|
|
max_divisor = 2*limb_bits;
|
|
max_divisor_bits = log2_ceil (max_divisor);
|
|
|
|
mpz_init (new_pp);
|
|
nrawfactor = 0;
|
|
mod_bits = MIN (numb_bits, limb_bits - max_divisor_bits);
|
|
|
|
/* one copy of each small prime */
|
|
mpz_set_ui (pp, 1L);
|
|
for (i = 3; i <= max_divisor; i+=2)
|
|
{
|
|
if (! isprime (i))
|
|
continue;
|
|
|
|
mpz_mul_ui (new_pp, pp, (unsigned long) i);
|
|
if (mpz_sizeinbase (new_pp, 2) > mod_bits)
|
|
break;
|
|
mpz_set (pp, new_pp);
|
|
|
|
assert (nrawfactor < factor_alloc);
|
|
rawfactor[nrawfactor].divisor = i;
|
|
rawfactor[nrawfactor].multiplicity = 1;
|
|
nrawfactor++;
|
|
}
|
|
|
|
/* Plus an extra copy of one or more of the primes selected, if that
|
|
still fits in max_divisor and the total in mod_bits. Usually only
|
|
3 or 5 will be candidates */
|
|
for (i = nrawfactor-1; i >= 0; i--)
|
|
{
|
|
if (rawfactor[i].divisor > max_divisor / rawfactor[i].divisor)
|
|
continue;
|
|
mpz_mul_ui (new_pp, pp, (unsigned long) rawfactor[i].divisor);
|
|
if (mpz_sizeinbase (new_pp, 2) > mod_bits)
|
|
continue;
|
|
mpz_set (pp, new_pp);
|
|
|
|
rawfactor[i].multiplicity++;
|
|
}
|
|
|
|
mod_bits = mpz_sizeinbase (pp, 2);
|
|
|
|
mpz_set (pp_norm, pp);
|
|
while (mpz_sizeinbase (pp_norm, 2) < numb_bits)
|
|
mpz_add (pp_norm, pp_norm, pp_norm);
|
|
|
|
mpz_preinv_invert (pp_inverted, pp_norm, numb_bits);
|
|
|
|
mpz_clear (new_pp);
|
|
}
|
|
|
|
/* start the factor array */
|
|
for (i = 0; i < nrawfactor; i++)
|
|
{
|
|
int j;
|
|
assert (nfactor < factor_alloc);
|
|
factor[nfactor].divisor = 1;
|
|
for (j = 0; j < rawfactor[i].multiplicity; j++)
|
|
factor[nfactor].divisor *= rawfactor[i].divisor;
|
|
nfactor++;
|
|
}
|
|
|
|
combine:
|
|
/* Combine entries in the factor array. Combine the smallest entry with
|
|
the biggest one that will fit with it (ie. under max_divisor), then
|
|
repeat that with the new smallest entry. */
|
|
qsort (factor, nfactor, sizeof (factor[0]), f_cmp_divisor);
|
|
for (i = nfactor-1; i >= 1; i--)
|
|
{
|
|
if (factor[i].divisor <= max_divisor / factor[0].divisor)
|
|
{
|
|
factor[0].divisor *= factor[i].divisor;
|
|
COLLAPSE_ELEMENT (factor, i, nfactor);
|
|
goto combine;
|
|
}
|
|
}
|
|
|
|
total_fraction = 1.0;
|
|
for (i = 0; i < nfactor; i++)
|
|
{
|
|
mpz_init (factor[i].inverse);
|
|
mpz_invert_ui_2exp (factor[i].inverse,
|
|
(unsigned long) factor[i].divisor,
|
|
(unsigned long) mod_bits);
|
|
|
|
mpz_init (factor[i].mask);
|
|
square_mask (factor[i].mask, factor[i].divisor);
|
|
|
|
/* fraction of possible squares */
|
|
factor[i].fraction = (double) mpz_popcount (factor[i].mask)
|
|
/ factor[i].divisor;
|
|
|
|
/* total fraction of possible squares */
|
|
total_fraction *= factor[i].fraction;
|
|
}
|
|
|
|
/* best tests first (ie. smallest fraction) */
|
|
qsort (factor, nfactor, sizeof (factor[0]), f_cmp_fraction);
|
|
}
|
|
|
|
void
|
|
print (int limb_bits, int nail_bits)
|
|
{
|
|
int i;
|
|
mpz_t mhi, mlo;
|
|
|
|
printf ("/* This file generated by gen-psqr.c - DO NOT EDIT. */\n");
|
|
printf ("\n");
|
|
|
|
printf ("#if GMP_LIMB_BITS != %d || GMP_NAIL_BITS != %d\n",
|
|
limb_bits, nail_bits);
|
|
printf ("Error, error, this data is for %d bit limb and %d bit nail\n",
|
|
limb_bits, nail_bits);
|
|
printf ("#endif\n");
|
|
printf ("\n");
|
|
|
|
printf ("/* Non-zero bit indicates a quadratic residue mod 0x100.\n");
|
|
printf (" This test identifies %.2f%% as non-squares (%d/256). */\n",
|
|
(1.0 - sq_res_0x100_fraction) * 100.0,
|
|
0x100 - sq_res_0x100_num);
|
|
printf ("static const mp_limb_t\n");
|
|
printf ("sq_res_0x100[%d] = {\n", nsq_res_0x100);
|
|
for (i = 0; i < nsq_res_0x100; i++)
|
|
{
|
|
printf (" CNST_LIMB(0x");
|
|
mpz_out_str (stdout, 16, sq_res_0x100[i]);
|
|
printf ("),\n");
|
|
}
|
|
printf ("};\n");
|
|
printf ("\n");
|
|
|
|
if (mpz_sgn (pp) != 0)
|
|
{
|
|
printf ("/* mpn_mod_34lsub1 not used due to %s */\n", mod34_excuse);
|
|
printf ("/* PERFSQR_PP = ");
|
|
}
|
|
else
|
|
printf ("/* 2^%d-1 = ", mod34_bits);
|
|
for (i = 0; i < nrawfactor; i++)
|
|
{
|
|
if (i != 0)
|
|
printf (" * ");
|
|
printf ("%d", rawfactor[i].divisor);
|
|
if (rawfactor[i].multiplicity != 1)
|
|
printf ("^%d", rawfactor[i].multiplicity);
|
|
}
|
|
printf (" %s*/\n", mpz_sgn (pp) == 0 ? "... " : "");
|
|
|
|
printf ("#define PERFSQR_MOD_BITS %d\n", mod_bits);
|
|
if (mpz_sgn (pp) != 0)
|
|
{
|
|
printf ("#define PERFSQR_PP CNST_LIMB(0x");
|
|
mpz_out_str (stdout, 16, pp);
|
|
printf (")\n");
|
|
printf ("#define PERFSQR_PP_NORM CNST_LIMB(0x");
|
|
mpz_out_str (stdout, 16, pp_norm);
|
|
printf (")\n");
|
|
printf ("#define PERFSQR_PP_INVERTED CNST_LIMB(0x");
|
|
mpz_out_str (stdout, 16, pp_inverted);
|
|
printf (")\n");
|
|
}
|
|
printf ("\n");
|
|
|
|
mpz_init (mhi);
|
|
mpz_init (mlo);
|
|
|
|
printf ("/* This test identifies %.2f%% as non-squares. */\n",
|
|
(1.0 - total_fraction) * 100.0);
|
|
printf ("#define PERFSQR_MOD_TEST(up, usize) \\\n");
|
|
printf (" do { \\\n");
|
|
printf (" mp_limb_t r; \\\n");
|
|
if (mpz_sgn (pp) != 0)
|
|
printf (" PERFSQR_MOD_PP (r, up, usize); \\\n");
|
|
else
|
|
printf (" PERFSQR_MOD_34 (r, up, usize); \\\n");
|
|
|
|
for (i = 0; i < nfactor; i++)
|
|
{
|
|
printf (" \\\n");
|
|
printf (" /* %5.2f%% */ \\\n",
|
|
(1.0 - factor[i].fraction) * 100.0);
|
|
|
|
printf (" PERFSQR_MOD_%d (r, CNST_LIMB(%2d), CNST_LIMB(0x",
|
|
factor[i].divisor <= limb_bits ? 1 : 2,
|
|
factor[i].divisor);
|
|
mpz_out_str (stdout, 16, factor[i].inverse);
|
|
printf ("), \\\n");
|
|
printf (" CNST_LIMB(0x");
|
|
|
|
if ( factor[i].divisor <= limb_bits)
|
|
{
|
|
mpz_out_str (stdout, 16, factor[i].mask);
|
|
}
|
|
else
|
|
{
|
|
mpz_tdiv_r_2exp (mlo, factor[i].mask, (unsigned long) limb_bits);
|
|
mpz_tdiv_q_2exp (mhi, factor[i].mask, (unsigned long) limb_bits);
|
|
mpz_out_str (stdout, 16, mhi);
|
|
printf ("), CNST_LIMB(0x");
|
|
mpz_out_str (stdout, 16, mlo);
|
|
}
|
|
printf (")); \\\n");
|
|
}
|
|
|
|
printf (" } while (0)\n");
|
|
printf ("\n");
|
|
|
|
printf ("/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, %.2f%% non-squares. */\n",
|
|
(1.0 - (total_fraction * 44.0/256.0)) * 100.0);
|
|
printf ("\n");
|
|
|
|
printf ("/* helper for tests/mpz/t-perfsqr.c */\n");
|
|
printf ("#define PERFSQR_DIVISORS { 256,");
|
|
for (i = 0; i < nfactor; i++)
|
|
printf (" %d,", factor[i].divisor);
|
|
printf (" }\n");
|
|
|
|
|
|
mpz_clear (mhi);
|
|
mpz_clear (mlo);
|
|
}
|
|
|
|
int
|
|
main (int argc, char *argv[])
|
|
{
|
|
int limb_bits, nail_bits;
|
|
|
|
if (argc != 3)
|
|
{
|
|
fprintf (stderr, "Usage: gen-psqr <limbbits> <nailbits>\n");
|
|
exit (1);
|
|
}
|
|
|
|
limb_bits = atoi (argv[1]);
|
|
nail_bits = atoi (argv[2]);
|
|
|
|
if (limb_bits <= 0
|
|
|| nail_bits < 0
|
|
|| nail_bits >= limb_bits)
|
|
{
|
|
fprintf (stderr, "Invalid limb/nail bits: %d %d\n",
|
|
limb_bits, nail_bits);
|
|
exit (1);
|
|
}
|
|
|
|
generate_sq_res_0x100 (limb_bits);
|
|
generate_mod (limb_bits, nail_bits);
|
|
|
|
print (limb_bits, nail_bits);
|
|
|
|
return 0;
|
|
}
|