Some small performance improvements on 32-bit architectures.

author Thomas Pornin <pornin@bolet.org>

Thu, 3 Jan 2019 18:50:54 +0000 (19:50 +0100)

committer Thomas Pornin <pornin@bolet.org>

Thu, 3 Jan 2019 18:50:54 +0000 (19:50 +0100)
author Thomas Pornin <pornin@bolet.org>
Thu, 3 Jan 2019 18:50:54 +0000 (19:50 +0100)
committer Thomas Pornin <pornin@bolet.org>
Thu, 3 Jan 2019 18:50:54 +0000 (19:50 +0100)
diff --git a/src/ec/ec_c25519_m31.c b/src/ec/ec_c25519_m31.c

index 95f1275..1dd6d51 100644 (file)
--- a/src/ec/ec_c25519_m31.c
+++ b/src/ec/ec_c25519_m31.c
@@ -372,8 +372,7 @@ reduce_final_f255(uint32_t *d)
  static void
  f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
  {
-       uint32_t t[18];
-       uint64_t cc, w;
+       uint32_t t[18], cc;
         int i;
  
         /*
@@ -389,21 +388,42 @@ f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
          * offset 9*30 = 270, word 9+k must be added to word k with
          * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
          * added that way.
+        *
+        * Keeping the carry on 32 bits helps with 32-bit architectures,
+        * and does not noticeably impact performance on 64-bit systems.
          */
-       cc = MUL31(t[8] >> 15, 19);
+       cc = MUL15(t[8] >> 15, 19);  /* at most 19*(2^15-1) = 622573 */
         t[8] &= 0x7FFF;
         for (i = 0; i < 9; i ++) {
-               w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
+               uint64_t w;
+
+               w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
                 t[i] = (uint32_t)w & 0x3FFFFFFF;
-               cc = w >> 30;
+               cc = (uint32_t)(w >> 30);  /* at most 622592 */
         }
-       cc = MUL31(w >> 15, 19);
+
+       /*
+        * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
+        * This was split into two parts (upper of 257 bits, lower of 255
+        * bits), and the upper was added to the lower with a factor 19,
+        * which means that the intermediate value is less than 77*2^255
+        * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
+        * less than 77, and the initial carry cc is at most 76*19 = 1444.
+        */
+       cc = MUL15(t[8] >> 15, 19);
         t[8] &= 0x7FFF;
         for (i = 0; i < 9; i ++) {
-               w = t[i] + cc;
-               d[i] = (uint32_t)w & 0x3FFFFFFF;
-               cc = w >> 30;
+               uint32_t z;
+
+               z = t[i] + cc;
+               d[i] = z & 0x3FFFFFFF;
+               cc = z >> 30;
         }
+
+       /*
+        * Final result is at most 2^255 + 1443. In particular, the last
+        * carry is necessarily 0, since t[8] was truncated to 15 bits.
+        */
  }
  
  /*
@@ -415,8 +435,7 @@ f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
  static void
  f255_square(uint32_t *d, const uint32_t *a)
  {
-       uint32_t t[18];
-       uint64_t cc, w;
+       uint32_t t[18], cc;
         int i;
  
         /*
@@ -428,24 +447,25 @@ f255_square(uint32_t *d, const uint32_t *a)
  
         /*
          * Modular reduction: each high word is added where necessary.
-        * Since the modulus is 2^255-19 and word 9 corresponds to
-        * offset 9*30 = 270, word 9+k must be added to word k with
-        * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
-        * added that way.
+        * See f255_mul() for details on the reduction and carry limits.
          */
-       cc = MUL31(t[8] >> 15, 19);
+       cc = MUL15(t[8] >> 15, 19);
         t[8] &= 0x7FFF;
         for (i = 0; i < 9; i ++) {
-               w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
+               uint64_t w;
+
+               w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
                 t[i] = (uint32_t)w & 0x3FFFFFFF;
-               cc = w >> 30;
+               cc = (uint32_t)(w >> 30);
         }
-       cc = MUL31(w >> 15, 19);
+       cc = MUL15(t[8] >> 15, 19);
         t[8] &= 0x7FFF;
         for (i = 0; i < 9; i ++) {
-               w = t[i] + cc;
-               d[i] = (uint32_t)w & 0x3FFFFFFF;
-               cc = w >> 30;
+               uint32_t z;
+
+               z = t[i] + cc;
+               d[i] = z & 0x3FFFFFFF;
+               cc = z >> 30;
         }
  }
  
@@ -515,20 +535,31 @@ static void
  f255_mul_a24(uint32_t *d, const uint32_t *a)
  {
         int i;
-       uint64_t cc, w;
+       uint64_t w;
+       uint32_t cc;
  
+       /*
+        * a[] is over 256 bits, thus a[8] has length at most 16 bits.
+        * We single out the processing of the last word: intermediate
+        * value w is up to 121665*2^16, yielding a carry for the next
+        * loop of at most 19*(121665*2^16/2^15) = 4623289.
+        */
         cc = 0;
-       for (i = 0; i < 9; i ++) {
-               w = MUL31(a[i], 121665) + cc;
+       for (i = 0; i < 8; i ++) {
+               w = MUL31(a[i], 121665) + (uint64_t)cc;
                 d[i] = (uint32_t)w & 0x3FFFFFFF;
-               cc = w >> 30;
+               cc = (uint32_t)(w >> 30);
         }
-       cc = MUL31((uint32_t)(w >> 15), 19);
-       d[8] &= 0x7FFF;
+       w = MUL31(a[8], 121665) + (uint64_t)cc;
+       d[8] = (uint32_t)w & 0x7FFF;
+       cc = MUL15((uint32_t)(w >> 15), 19);
+
         for (i = 0; i < 9; i ++) {
-               w = (uint64_t)d[i] + cc;
-               d[i] = w & 0x3FFFFFFF;
-               cc = w >> 30;
+               uint32_t z;
+
+               z = d[i] + cc;
+               d[i] = z & 0x3FFFFFFF;
+               cc = z >> 30;
         }
  }
  
diff --git a/src/ec/ec_p256_m15.c b/src/ec/ec_p256_m15.c

index 6ce57e0..8d68d1d 100644 (file)
--- a/src/ec/ec_p256_m15.c
+++ b/src/ec/ec_p256_m15.c
@@ -1739,7 +1739,7 @@ p256_decode(p256_jacobian *P, const void *src, size_t len)
         memcpy(P->y, ty, sizeof ty);
         memset(P->z, 0, sizeof P->z);
         P->z[0] = 1;
-       return NEQ(bad, 0) ^ 1;
+       return EQ(bad, 0);
  }
  
  /*
diff --git a/src/ec/ec_p256_m31.c b/src/ec/ec_p256_m31.c

index ec22c3e..d57ef7b 100644 (file)
--- a/src/ec/ec_p256_m31.c
+++ b/src/ec/ec_p256_m31.c
@@ -1089,7 +1089,7 @@ p256_decode(p256_jacobian *P, const void *src, size_t len)
         memcpy(P->y, ty, sizeof ty);
         memset(P->z, 0, sizeof P->z);
         P->z[0] = 1;
-       return NEQ(bad, 0) ^ 1;
+       return EQ(bad, 0);
  }
  
  /*
diff --git a/src/inner.h b/src/inner.h

index 1b52673..21d996e 100644 (file)
--- a/src/inner.h
+++ b/src/inner.h
@@ -114,6 +114,8 @@
  #define BR_64   1
  #elif defined(__x86_64__) || defined(_M_X64)
  #define BR_64   1
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define BR_64   1
  #endif
  #endif
  
diff --git a/src/int/i31_montmul.c b/src/int/i31_montmul.c

index 8066808..758f8f4 100644 (file)
--- a/src/int/i31_montmul.c
+++ b/src/int/i31_montmul.c
@@ -29,16 +29,45 @@ void
  br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
         const uint32_t *m, uint32_t m0i)
  {
+       /*
+        * Each outer loop iteration computes:
+        *   d <- (d + xu*y + f*m) / 2^31
+        * We have xu <= 2^31-1 and f <= 2^31-1.
+        * Thus, if d <= 2*m-1 on input, then:
+        *   2*m-1 + 2*(2^31-1)*m <= (2^32)*m-1
+        * and the new d value is less than 2*m.
+        *
+        * We represent d over 31-bit words, with an extra word 'dh'
+        * which can thus be only 0 or 1.
+        */
         size_t len, len4, u, v;
-       uint64_t dh;
+       uint32_t dh;
  
         len = (m[0] + 31) >> 5;
         len4 = len & ~(size_t)3;
         br_i31_zero(d, m[0]);
         dh = 0;
         for (u = 0; u < len; u ++) {
+               /*
+                * The carry for each operation fits on 32 bits:
+                *   d[v+1] <= 2^31-1
+                *   xu*y[v+1] <= (2^31-1)*(2^31-1)
+                *   f*m[v+1] <= (2^31-1)*(2^31-1)
+                *   r <= 2^32-1
+                *   (2^31-1) + 2*(2^31-1)*(2^31-1) + (2^32-1) = 2^63 - 2^31
+                * After division by 2^31, the new r is then at most 2^32-1
+                *
+                * Using a 32-bit carry has performance benefits on 32-bit
+                * systems; however, on 64-bit architectures, we prefer to
+                * keep the carry (r) in a 64-bit register, thus avoiding some
+                * "clear high bits" operations.
+                */
                 uint32_t f, xu;
-               uint64_t r, zh;
+#if BR_64
+               uint64_t r;
+#else
+               uint32_t r;
+#endif
  
                 xu = x[u + 1];
                 f = MUL31_lo((d[1] + MUL31_lo(x[u + 1], y[1])), m0i);
@@ -73,9 +102,14 @@ br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
                         d[v] = (uint32_t)z & 0x7FFFFFFF;
                 }
  
-               zh = dh + r;
-               d[len] = (uint32_t)zh & 0x7FFFFFFF;
-               dh = zh >> 31;
+               /*
+                * Since the new dh can only be 0 or 1, the addition of
+                * the old dh with the carry MUST fit on 32 bits, and
+                * thus can be done into dh itself.
+                */
+               dh += r;
+               d[len] = dh & 0x7FFFFFFF;
+               dh >>= 31;
         }
  
         /*
diff --git a/src/int/i31_mulacc.c b/src/int/i31_mulacc.c

index 024d095..7410e54 100644 (file)
--- a/src/int/i31_mulacc.c
+++ b/src/int/i31_mulacc.c
@@ -45,7 +45,20 @@ br_i31_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b)
         for (u = 0; u < blen; u ++) {
                 uint32_t f;
                 size_t v;
+
+               /*
+                * Carry always fits on 31 bits; we want to keep it in a
+                * 32-bit register on 32-bit architectures (on a 64-bit
+                * architecture, cast down from 64 to 32 bits means
+                * clearing the high bits, which is not free; on a 32-bit
+                * architecture, the same operation really means ignoring
+                * the top register, which has negative or zero cost).
+                */
+#if BR_64
                 uint64_t cc;
+#else
+               uint32_t cc;
+#endif
  
                 f = b[1 + u];
                 cc = 0;
diff --git a/src/int/i32_mulacc.c b/src/int/i32_mulacc.c

index f62c782..55da385 100644 (file)
--- a/src/int/i32_mulacc.c
+++ b/src/int/i32_mulacc.c
@@ -36,7 +36,11 @@ br_i32_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b)
         for (u = 0; u < blen; u ++) {
                 uint32_t f;
                 size_t v;
+#if BR_64
                 uint64_t cc;
+#else
+               uint32_t cc;
+#endif
  
                 f = b[1 + u];
                 cc = 0;
author	Thomas Pornin <pornin@bolet.org>
	Thu, 3 Jan 2019 18:50:54 +0000 (19:50 +0100)
committer	Thomas Pornin <pornin@bolet.org>
	Thu, 3 Jan 2019 18:50:54 +0000 (19:50 +0100)
src/ec/ec_c25519_m31.c		patch \| blob \| history
src/ec/ec_p256_m15.c		patch \| blob \| history
src/ec/ec_p256_m31.c		patch \| blob \| history
src/inner.h		patch \| blob \| history
src/int/i31_montmul.c		patch \| blob \| history
src/int/i31_mulacc.c		patch \| blob \| history
src/int/i32_mulacc.c		patch \| blob \| history