static void
f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
{
- uint32_t t[18];
- uint64_t cc, w;
+ uint32_t t[18], cc;
int i;
/*
* offset 9*30 = 270, word 9+k must be added to word k with
* a factor of 19*2^15 = 622592. The extra bits in word 8 are also
* added that way.
+ *
+ * Keeping the carry on 32 bits helps with 32-bit architectures,
+ * and does not noticeably impact performance on 64-bit systems.
*/
- cc = MUL31(t[8] >> 15, 19);
+ cc = MUL15(t[8] >> 15, 19); /* at most 19*(2^15-1) = 622573 */
t[8] &= 0x7FFF;
for (i = 0; i < 9; i ++) {
- w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
+ uint64_t w;
+
+ w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
t[i] = (uint32_t)w & 0x3FFFFFFF;
- cc = w >> 30;
+ cc = (uint32_t)(w >> 30); /* at most 622592 */
}
- cc = MUL31(w >> 15, 19);
+
+ /*
+ * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
+ * This was split into two parts (upper of 257 bits, lower of 255
+ * bits), and the upper was added to the lower with a factor 19,
+ * which means that the intermediate value is less than 77*2^255
+ * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
+ * less than 77, and the initial carry cc is at most 76*19 = 1444.
+ */
+ cc = MUL15(t[8] >> 15, 19);
t[8] &= 0x7FFF;
for (i = 0; i < 9; i ++) {
- w = t[i] + cc;
- d[i] = (uint32_t)w & 0x3FFFFFFF;
- cc = w >> 30;
+ uint32_t z;
+
+ z = t[i] + cc;
+ d[i] = z & 0x3FFFFFFF;
+ cc = z >> 30;
}
+
+ /*
+ * Final result is at most 2^255 + 1443. In particular, the last
+ * carry is necessarily 0, since t[8] was truncated to 15 bits.
+ */
}
/*
static void
f255_square(uint32_t *d, const uint32_t *a)
{
- uint32_t t[18];
- uint64_t cc, w;
+ uint32_t t[18], cc;
int i;
/*
/*
* Modular reduction: each high word is added where necessary.
- * Since the modulus is 2^255-19 and word 9 corresponds to
- * offset 9*30 = 270, word 9+k must be added to word k with
- * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
- * added that way.
+ * See f255_mul() for details on the reduction and carry limits.
*/
- cc = MUL31(t[8] >> 15, 19);
+ cc = MUL15(t[8] >> 15, 19);
t[8] &= 0x7FFF;
for (i = 0; i < 9; i ++) {
- w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
+ uint64_t w;
+
+ w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
t[i] = (uint32_t)w & 0x3FFFFFFF;
- cc = w >> 30;
+ cc = (uint32_t)(w >> 30);
}
- cc = MUL31(w >> 15, 19);
+ cc = MUL15(t[8] >> 15, 19);
t[8] &= 0x7FFF;
for (i = 0; i < 9; i ++) {
- w = t[i] + cc;
- d[i] = (uint32_t)w & 0x3FFFFFFF;
- cc = w >> 30;
+ uint32_t z;
+
+ z = t[i] + cc;
+ d[i] = z & 0x3FFFFFFF;
+ cc = z >> 30;
}
}
f255_mul_a24(uint32_t *d, const uint32_t *a)
{
int i;
- uint64_t cc, w;
+ uint64_t w;
+ uint32_t cc;
+ /*
+ * a[] is over 256 bits, thus a[8] has length at most 16 bits.
+ * We single out the processing of the last word: intermediate
+ * value w is up to 121665*2^16, yielding a carry for the next
+ * loop of at most 19*(121665*2^16/2^15) = 4623289.
+ */
cc = 0;
- for (i = 0; i < 9; i ++) {
- w = MUL31(a[i], 121665) + cc;
+ for (i = 0; i < 8; i ++) {
+ w = MUL31(a[i], 121665) + (uint64_t)cc;
d[i] = (uint32_t)w & 0x3FFFFFFF;
- cc = w >> 30;
+ cc = (uint32_t)(w >> 30);
}
- cc = MUL31((uint32_t)(w >> 15), 19);
- d[8] &= 0x7FFF;
+ w = MUL31(a[8], 121665) + (uint64_t)cc;
+ d[8] = (uint32_t)w & 0x7FFF;
+ cc = MUL15((uint32_t)(w >> 15), 19);
+
for (i = 0; i < 9; i ++) {
- w = (uint64_t)d[i] + cc;
- d[i] = w & 0x3FFFFFFF;
- cc = w >> 30;
+ uint32_t z;
+
+ z = d[i] + cc;
+ d[i] = z & 0x3FFFFFFF;
+ cc = z >> 30;
}
}
br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
const uint32_t *m, uint32_t m0i)
{
+ /*
+ * Each outer loop iteration computes:
+ * d <- (d + xu*y + f*m) / 2^31
+ * We have xu <= 2^31-1 and f <= 2^31-1.
+ * Thus, if d <= 2*m-1 on input, then:
+ * 2*m-1 + 2*(2^31-1)*m <= (2^32)*m-1
+ * and the new d value is less than 2*m.
+ *
+ * We represent d over 31-bit words, with an extra word 'dh'
+ * which can thus be only 0 or 1.
+ */
size_t len, len4, u, v;
- uint64_t dh;
+ uint32_t dh;
len = (m[0] + 31) >> 5;
len4 = len & ~(size_t)3;
br_i31_zero(d, m[0]);
dh = 0;
for (u = 0; u < len; u ++) {
+ /*
+ * The carry for each operation fits on 32 bits:
+ * d[v+1] <= 2^31-1
+ * xu*y[v+1] <= (2^31-1)*(2^31-1)
+ * f*m[v+1] <= (2^31-1)*(2^31-1)
+ * r <= 2^32-1
+ * (2^31-1) + 2*(2^31-1)*(2^31-1) + (2^32-1) = 2^63 - 2^31
+ * After division by 2^31, the new r is then at most 2^32-1
+ *
+ * Using a 32-bit carry has performance benefits on 32-bit
+ * systems; however, on 64-bit architectures, we prefer to
+ * keep the carry (r) in a 64-bit register, thus avoiding some
+ * "clear high bits" operations.
+ */
uint32_t f, xu;
- uint64_t r, zh;
+#if BR_64
+ uint64_t r;
+#else
+ uint32_t r;
+#endif
xu = x[u + 1];
f = MUL31_lo((d[1] + MUL31_lo(x[u + 1], y[1])), m0i);
d[v] = (uint32_t)z & 0x7FFFFFFF;
}
- zh = dh + r;
- d[len] = (uint32_t)zh & 0x7FFFFFFF;
- dh = zh >> 31;
+ /*
+ * Since the new dh can only be 0 or 1, the addition of
+ * the old dh with the carry MUST fit on 32 bits, and
+ * thus can be done into dh itself.
+ */
+ dh += r;
+ d[len] = dh & 0x7FFFFFFF;
+ dh >>= 31;
}
/*