-static void
-f255_square(uint32_t *d, const uint32_t *a)
-{
- uint32_t t[40], cc, w;
- int i;
-
- /*
- * Compute raw multiplication. All result words fit in 13 bits
- * each; upper word (t[39]) must fit on 5 bits, since the product
- * of two 256-bit integers must fit on 512 bits.
- */
- square20(t, a);
-
- /*
- * Modular reduction: each high word is added where necessary.
- * Since the modulus is 2^255-19 and word 20 corresponds to
- * offset 20*13 = 260, word 20+k must be added to word k with
- * a factor of 19*2^5 = 608. The extra bits in word 19 are also
- * added that way.
- */
- cc = MUL15(t[19] >> 8, 19);
- t[19] &= 0xFF;
- for (i = 0; i < 20; i ++) {
- w = t[i] + cc + MUL15(t[i + 20], 608);
- t[i] = w & 0x1FFF;
- cc = w >> 13;
- }
- cc = MUL15(w >> 8, 19);
- t[19] &= 0xFF;
- for (i = 0; i < 20; i ++) {
- w = t[i] + cc;
- d[i] = w & 0x1FFF;
- cc = w >> 13;
- }
-}
+#define f255_mul(d, a, b)   f255_mulgen((d), (a), (b), 0)  /* forwards to f255_mulgen with last argument 0 (presumably "operands differ"; confirm in f255_mulgen) */
+#define f255_square(d, a)   f255_mulgen((d), (a), (a), 1)  /* passes a as both operands with last argument 1; a is expanded twice, so it must be free of side effects */