Fix ARMv8 debug builds performing 128-bit math against integer constants incorrectly. The fix is to declare the integer constants as `const __int128_t` locals instead of using plain integer literals. Also add a `./configure --enable-curve25519=no128bit` option to force the field element (FE) code not to use `__int128_t` math.

2017-07-14 10:39:30 -07:00 · 2017-07-14 10:39:30 -07:00 · 8612d52844
parent 43260f02f4
commit 8612d52844
6 changed files with 73 additions and 56 deletions
--- a/configure.ac
+++ b/configure.ac
@ -1163,6 +1163,12 @@ then
    ENABLED_CURVE25519=yes
 fi

+if test "$ENABLED_CURVE25519" = "no128bit"
+then
+    AM_CFLAGS="$AM_CFLAGS -DNO_CURVED25519_128BIT"
+    ENABLED_CURVE25519=yes
+fi
+
 if test "$ENABLED_CURVE25519" = "yes"
 then
    AM_CFLAGS="$AM_CFLAGS -DHAVE_CURVE25519"
--- a/wolfcrypt/src/fe_operations.c
+++ b/wolfcrypt/src/fe_operations.c
@ -41,7 +41,7 @@
    #include <wolfcrypt/src/misc.c>
 #endif

-#ifdef HAVE___UINT128_T
+#ifdef CURVED25519_128BIT
 #include "fe_x25519_128.i"
 #else

--- a/wolfcrypt/src/fe_x25519_128.i
+++ b/wolfcrypt/src/fe_x25519_128.i
@ -1,4 +1,4 @@
-/* fp_mont_small.i
+/* fp_x25519_128.i
 *
 * Copyright (C) 2006-2017 wolfSSL Inc.
 *
@ -253,6 +253,7 @@ void fe_add(fe r, const fe a, const fe b)
 */
 void fe_mul(fe r, const fe a, const fe b)
 {
+    const __int128_t k19 = 19;
    __int128_t t0 = ((__int128_t)a[0]) * b[0];
    __int128_t t1 = ((__int128_t)a[0]) * b[1]
                  + ((__int128_t)a[1]) * b[0];
@ -280,19 +281,19 @@ void fe_mul(fe r, const fe a, const fe b)
    __int128_t t8 = ((__int128_t)a[4]) * b[4];

    /* Modulo reduce double long word. */
-    t0 += t5 * 19;
-    t1 += t6 * 19;
-    t2 += t7 * 19;
-    t3 += t8 * 19;
+    t0 += t5 * k19;
+    t1 += t6 * k19;
+    t2 += t7 * k19;
+    t3 += t8 * k19;

    /* Normalize to 51-bits of data per word. */
-    t0 += (t4 >> 51) * 19; t4 &= 0x7ffffffffffff;
+    t0 += (t4 >> 51) * k19; t4 &= 0x7ffffffffffff;

    t1 += t0 >> 51; r[0] = t0 & 0x7ffffffffffff;
    t2 += t1 >> 51; r[1] = t1 & 0x7ffffffffffff;
    t3 += t2 >> 51; r[2] = t2 & 0x7ffffffffffff;
    t4 += t3 >> 51; r[3] = t3 & 0x7ffffffffffff;
-    r[0] += (t4 >> 51) * 19;
+    r[0] += (t4 >> 51) * k19;
    r[4] = t4 & 0x7ffffffffffff;
 }

@ -304,36 +305,38 @@ void fe_mul(fe r, const fe a, const fe b)
 */
 void fe_sq(fe r, const fe a)
 {
+    const __int128_t k19 = 19;
+    const __int128_t k2 = 2;
    __int128_t t0 = ((__int128_t)a[0]) * a[0];
-    __int128_t t1 = ((__int128_t)a[0]) * a[1] * 2;
-    __int128_t t2 = ((__int128_t)a[0]) * a[2] * 2
+    __int128_t t1 = ((__int128_t)a[0]) * a[1] * k2;
+    __int128_t t2 = ((__int128_t)a[0]) * a[2] * k2
                  + ((__int128_t)a[1]) * a[1];
-    __int128_t t3 = ((__int128_t)a[0]) * a[3] * 2
-                  + ((__int128_t)a[1]) * a[2] * 2;
-    __int128_t t4 = ((__int128_t)a[0]) * a[4] * 2
-                  + ((__int128_t)a[1]) * a[3] * 2
+    __int128_t t3 = ((__int128_t)a[0]) * a[3] * k2
+                  + ((__int128_t)a[1]) * a[2] * k2;
+    __int128_t t4 = ((__int128_t)a[0]) * a[4] * k2
+                  + ((__int128_t)a[1]) * a[3] * k2
                  + ((__int128_t)a[2]) * a[2];
-    __int128_t t5 = ((__int128_t)a[1]) * a[4] * 2
-                  + ((__int128_t)a[2]) * a[3] * 2;
-    __int128_t t6 = ((__int128_t)a[2]) * a[4] * 2
+    __int128_t t5 = ((__int128_t)a[1]) * a[4] * k2
+                  + ((__int128_t)a[2]) * a[3] * k2;
+    __int128_t t6 = ((__int128_t)a[2]) * a[4] * k2
                  + ((__int128_t)a[3]) * a[3];
-    __int128_t t7 = ((__int128_t)a[3]) * a[4] * 2;
+    __int128_t t7 = ((__int128_t)a[3]) * a[4] * k2;
    __int128_t t8 = ((__int128_t)a[4]) * a[4];

    /* Modulo reduce double long word. */
-    t0 += t5 * 19;
-    t1 += t6 * 19;
-    t2 += t7 * 19;
-    t3 += t8 * 19;
+    t0 += t5 * k19;
+    t1 += t6 * k19;
+    t2 += t7 * k19;
+    t3 += t8 * k19;

    /* Normalize to 51-bits of data per word. */
-    t0 += (t4 >> 51) * 19; t4 &= 0x7ffffffffffff;
+    t0 += (t4 >> 51) * k19; t4 &= 0x7ffffffffffff;

    t1 += t0 >> 51; r[0] = t0 & 0x7ffffffffffff;
    t2 += t1 >> 51; r[1] = t1 & 0x7ffffffffffff;
    t3 += t2 >> 51; r[2] = t2 & 0x7ffffffffffff;
    t4 += t3 >> 51; r[3] = t3 & 0x7ffffffffffff;
-    r[0] += (t4 >> 51) * 19;
+    r[0] += (t4 >> 51) * k19;
    r[4] = t4 & 0x7ffffffffffff;
 }

@ -345,20 +348,22 @@ void fe_sq(fe r, const fe a)
 */
 void fe_mul121666(fe r, fe a)
 {
-    __int128_t t0 = ((__int128_t)a[0]) * (int64_t)121666;
-    __int128_t t1 = ((__int128_t)a[1]) * (int64_t)121666;
-    __int128_t t2 = ((__int128_t)a[2]) * (int64_t)121666;
-    __int128_t t3 = ((__int128_t)a[3]) * (int64_t)121666;
-    __int128_t t4 = ((__int128_t)a[4]) * (int64_t)121666;
+    const __int128_t k19 = 19;
+    const __int128_t k121666 = 121666;
+    __int128_t t0 = ((__int128_t)a[0]) * k121666;
+    __int128_t t1 = ((__int128_t)a[1]) * k121666;
+    __int128_t t2 = ((__int128_t)a[2]) * k121666;
+    __int128_t t3 = ((__int128_t)a[3]) * k121666;
+    __int128_t t4 = ((__int128_t)a[4]) * k121666;

    /* Normalize to 51-bits of data per word. */
-    t0 += (t4 >> 51) * 19; t4 &= 0x7ffffffffffff;
+    t0 += (t4 >> 51) * k19; t4 &= 0x7ffffffffffff;

    t1 += t0 >> 51; r[0] = t0 & 0x7ffffffffffff;
    t2 += t1 >> 51; r[1] = t1 & 0x7ffffffffffff;
    t3 += t2 >> 51; r[2] = t2 & 0x7ffffffffffff;
    t4 += t3 >> 51; r[3] = t3 & 0x7ffffffffffff;
-    r[0] += (t4 >> 51) * 19;
+    r[0] += (t4 >> 51) * k19;
    r[4] = t4 & 0x7ffffffffffff;
 }

@ -546,36 +551,38 @@ void fe_pow22523(fe r, const fe a)
 */
 void fe_sq2(fe r, const fe a)
 {
-    __int128_t t0 = 2 * (((__int128_t)a[0]) * a[0]);
-    __int128_t t1 = 2 * (((__int128_t)a[0]) * a[1] * 2);
-    __int128_t t2 = 2 * (((__int128_t)a[0]) * a[2] * 2
+    const __int128_t k2 = 2;
+    const __int128_t k19 = 19;
+    __int128_t t0 = k2 * (((__int128_t)a[0]) * a[0]);
+    __int128_t t1 = k2 * (((__int128_t)a[0]) * a[1] * k2);
+    __int128_t t2 = k2 * (((__int128_t)a[0]) * a[2] * k2
                  + ((__int128_t)a[1]) * a[1]);
-    __int128_t t3 = 2 * (((__int128_t)a[0]) * a[3] * 2
-                  + ((__int128_t)a[1]) * a[2] * 2);
-    __int128_t t4 = 2 * (((__int128_t)a[0]) * a[4] * 2
-                  + ((__int128_t)a[1]) * a[3] * 2
+    __int128_t t3 = k2 * (((__int128_t)a[0]) * a[3] * k2
+                  + ((__int128_t)a[1]) * a[2] * k2);
+    __int128_t t4 = k2 * (((__int128_t)a[0]) * a[4] * k2
+                  + ((__int128_t)a[1]) * a[3] * k2
                  + ((__int128_t)a[2]) * a[2]);
-    __int128_t t5 = 2 * (((__int128_t)a[1]) * a[4] * 2
-                  + ((__int128_t)a[2]) * a[3] * 2);
-    __int128_t t6 = 2 * (((__int128_t)a[2]) * a[4] * 2
+    __int128_t t5 = k2 * (((__int128_t)a[1]) * a[4] * k2
+                  + ((__int128_t)a[2]) * a[3] * k2);
+    __int128_t t6 = k2 * (((__int128_t)a[2]) * a[4] * k2
                  + ((__int128_t)a[3]) * a[3]);
-    __int128_t t7 = 2 * (((__int128_t)a[3]) * a[4] * 2);
-    __int128_t t8 = 2 * (((__int128_t)a[4]) * a[4]);
+    __int128_t t7 = k2 * (((__int128_t)a[3]) * a[4] * k2);
+    __int128_t t8 = k2 * (((__int128_t)a[4]) * a[4]);

    /* Modulo reduce double long word. */
-    t0 += t5 * 19;
-    t1 += t6 * 19;
-    t2 += t7 * 19;
-    t3 += t8 * 19;
+    t0 += t5 * k19;
+    t1 += t6 * k19;
+    t2 += t7 * k19;
+    t3 += t8 * k19;

    /* Normalize to 51-bits of data per word. */
-    t0 += (t4 >> 51) * 19; t4 &= 0x7ffffffffffff;
+    t0 += (t4 >> 51) * k19; t4 &= 0x7ffffffffffff;

    t1 += t0 >> 51; r[0] = t0 & 0x7ffffffffffff;
    t2 += t1 >> 51; r[1] = t1 & 0x7ffffffffffff;
    t3 += t2 >> 51; r[2] = t2 & 0x7ffffffffffff;
    t4 += t3 >> 51; r[3] = t3 & 0x7ffffffffffff;
-    r[0] += (t4 >> 51) * 19;
+    r[0] += (t4 >> 51) * k19;
    r[4] = t4 & 0x7ffffffffffff;
 }

--- a/wolfcrypt/src/ge_operations.c
+++ b/wolfcrypt/src/ge_operations.c
@ -765,7 +765,7 @@ static void cmov(ge_precomp *t,const ge_precomp *u,unsigned char b)
  fe_cmov(t->xy2d,u->xy2d,b);
 }

-#ifdef HAVE___UINT128_T
+#ifdef CURVED25519_128BIT
 static const ge_precomp base[32][8] = {
 {
    {
@ -3569,7 +3569,7 @@ static void slide(signed char *r,const unsigned char *a)
    }
 }

-#ifdef HAVE___UINT128_T
+#ifdef CURVED25519_128BIT
 static const ge_precomp Bi[8] = {
    {
        { 0x493c6f58c3b85, 0x0df7181c325f7, 0x0f50b0b3e4cb7, 0x5329385a44c32, 0x07cf9d3a33d4b },
@ -3719,7 +3719,7 @@ int ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a,
  return 0;
 }

-#ifdef HAVE___UINT128_T
+#ifdef CURVED25519_128BIT
 static const ge d = {
    0x34dca135978a3, 0x1a8283b156ebd, 0x5e7a26001c029, 0x739c663a03cbb,
    0x52036cee2b6ff
@ -3732,7 +3732,7 @@ static const ge d = {
 #endif


-#ifdef HAVE___UINT128_T
+#ifdef CURVED25519_128BIT
 static const ge sqrtm1 = {
    0x61b274a0ea0b0, 0x0d5a5fc8f189d, 0x7ef5e9cbd0c60, 0x78595a6804c9e,
    0x2b8324804fc1d
@ -3921,7 +3921,7 @@ void ge_p3_dbl(ge_p1p1 *r,const ge_p3 *p)
 r = p
 */

-#ifdef HAVE___UINT128_T
+#ifdef CURVED25519_128BIT
 static const ge d2 = {
    0x69b9426b2f159, 0x35050762add7a, 0x3cf44c0038052, 0x6738cc7407977,
    0x2406d9dc56dff
--- a/wolfssl/wolfcrypt/fe_operations.h
+++ b/wolfssl/wolfcrypt/fe_operations.h
@ -33,6 +33,10 @@

 #include <wolfssl/wolfcrypt/types.h>

+#if defined(HAVE___UINT128_T) && !defined(NO_CURVED25519_128BIT)
+    #define CURVED25519_128BIT
+#endif
+
 /*
 fe means field element.
 Here the field is \Z/(2^255-19).
@ -60,7 +64,7 @@ WOLFSSL_LOCAL int  curve25519(byte * q, byte * n, byte * p);
 /* default to be faster but take more memory */
 #if !defined(CURVE25519_SMALL) || !defined(ED25519_SMALL)

-#if defined(HAVE___UINT128_T)
+#if defined(CURVED25519_128BIT)
    typedef int64_t  fe[5];
 #else
    typedef int32_t  fe[10];
--- a/wolfssl/wolfcrypt/ge_operations.h
+++ b/wolfssl/wolfcrypt/ge_operations.h
@ -47,7 +47,7 @@ Representations:

 #ifdef ED25519_SMALL
  typedef byte     ge[F25519_SIZE];
-#elif defined(HAVE___UINT128_T)
+#elif defined(CURVED25519_128BIT)
  typedef int64_t  ge[5];
 #else
  typedef int32_t  ge[10];