uint32_t t[18];
uint64_t s[18];
uint64_t cc, x;
- uint32_t z;
+ uint32_t z, c;
int i;
mul9(t, a, b);
d[8] &= 0xFFFF;
/*
- * Subtract cc*p.
+ * One extra round of reduction, for cc*2^256, which means
+ * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
+ * value. If cc is negative, then it may happen (rarely, but
+ * not negligibly so) that the result would be negative. In
+ * order to avoid that, if cc is negative, then we add the
+ * modulus once. Note that if cc is negative, then propagating
+ * that carry must yield a value lower than the modulus, so
+ * adding the modulus once will keep the final result under
+ * twice the modulus.
*/
z = (uint32_t)cc;
d[3] -= z << 6;
d[7] -= ARSH(z, 18);
d[7] += (z << 14) & 0x3FFFFFFF;
d[8] += ARSH(z, 16);
+ c = z >> 31;
+ d[0] -= c;
+ d[3] += c << 6;
+ d[6] += c << 12;
+ d[7] -= c << 14;
+ d[8] += c << 16;
for (i = 0; i < 9; i ++) {
uint32_t w;
uint32_t t[18];
uint64_t s[18];
uint64_t cc, x;
- uint32_t z;
+ uint32_t z, c;
int i;
square9(t, a);
d[8] &= 0xFFFF;
/*
- * Subtract cc*p.
+ * One extra round of reduction, for cc*2^256, which means
+ * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
+ * value. If cc is negative, then it may happen (rarely, but
+ * not negligibly so) that the result would be negative. In
+ * order to avoid that, if cc is negative, then we add the
+ * modulus once. Note that if cc is negative, then propagating
+ * that carry must yield a value lower than the modulus, so
+ * adding the modulus once will keep the final result under
+ * twice the modulus.
*/
z = (uint32_t)cc;
d[3] -= z << 6;
d[7] -= ARSH(z, 18);
d[7] += (z << 14) & 0x3FFFFFFF;
d[8] += ARSH(z, 16);
+ c = z >> 31;
+ d[0] -= c;
+ d[3] += c << 6;
+ d[6] += c << 12;
+ d[7] -= c << 14;
+ d[8] += c << 16;
for (i = 0; i < 9; i ++) {
uint32_t w;