3cb98d51ef42a249a906d9930d3b2c24f3a7e6b1
[BearSSL] / src / ec / ec_c25519_m15.c
1 /*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "inner.h"
26
27 /* obsolete
28 #include <stdio.h>
29 #include <stdlib.h>
30 static void
31 print_int(const char *name, const uint32_t *x)
32 {
33 size_t u;
34 unsigned char tmp[36];
35
36 printf("%s = ", name);
37 for (u = 0; u < 20; u ++) {
38 if (x[u] > 0x1FFF) {
39 printf("INVALID:");
40 for (u = 0; u < 20; u ++) {
41 printf(" %04X", x[u]);
42 }
43 printf("\n");
44 return;
45 }
46 }
47 memset(tmp, 0, sizeof tmp);
48 for (u = 0; u < 20; u ++) {
49 uint32_t w;
50 int j, k;
51
52 w = x[u];
53 j = 13 * (int)u;
54 k = j & 7;
55 if (k != 0) {
56 w <<= k;
57 j -= k;
58 }
59 k = j >> 3;
60 tmp[35 - k] |= (unsigned char)w;
61 tmp[34 - k] |= (unsigned char)(w >> 8);
62 tmp[33 - k] |= (unsigned char)(w >> 16);
63 tmp[32 - k] |= (unsigned char)(w >> 24);
64 }
65 for (u = 4; u < 36; u ++) {
66 printf("%02X", tmp[u]);
67 }
68 printf("\n");
69 }
70 */
71
72 /*
73 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
74 * that right-shifting a signed negative integer copies the sign bit
75 * (arithmetic right-shift). This is "implementation-defined behaviour",
76 * i.e. it is not undefined, but it may differ between compilers. Each
77 * compiler is supposed to document its behaviour in that respect. GCC
78 * explicitly defines that an arithmetic right shift is used. We expect
79 * all other compilers to do the same, because underlying CPU offer an
80 * arithmetic right shift opcode that could not be used otherwise.
81 */
82 #if BR_NO_ARITH_SHIFT
83 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \
84 | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
85 #else
86 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
87 #endif
88
89 /*
90 * Convert an integer from unsigned little-endian encoding to a sequence of
91 * 13-bit words in little-endian order. The final "partial" word is
92 * returned.
93 */
94 static uint32_t
95 le8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
96 {
97 uint32_t acc;
98 int acc_len;
99
100 acc = 0;
101 acc_len = 0;
102 while (len -- > 0) {
103 acc |= (uint32_t)(*src ++) << acc_len;
104 acc_len += 8;
105 if (acc_len >= 13) {
106 *dst ++ = acc & 0x1FFF;
107 acc >>= 13;
108 acc_len -= 13;
109 }
110 }
111 return acc;
112 }
113
114 /*
115 * Convert an integer (13-bit words, little-endian) to unsigned
116 * little-endian encoding. The total encoding length is provided; all
117 * the destination bytes will be filled.
118 */
119 static void
120 le13_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
121 {
122 uint32_t acc;
123 int acc_len;
124
125 acc = 0;
126 acc_len = 0;
127 while (len -- > 0) {
128 if (acc_len < 8) {
129 acc |= (*src ++) << acc_len;
130 acc_len += 13;
131 }
132 *dst ++ = (unsigned char)acc;
133 acc >>= 8;
134 acc_len -= 8;
135 }
136 }
137
138 /*
139 * Normalise an array of words to a strict 13 bits per word. Returned
140 * value is the resulting carry. The source (w) and destination (d)
141 * arrays may be identical, but shall not overlap partially.
142 */
143 static inline uint32_t
144 norm13(uint32_t *d, const uint32_t *w, size_t len)
145 {
146 size_t u;
147 uint32_t cc;
148
149 cc = 0;
150 for (u = 0; u < len; u ++) {
151 int32_t z;
152
153 z = w[u] + cc;
154 d[u] = z & 0x1FFF;
155 cc = ARSH(z, 13);
156 }
157 return cc;
158 }
159
160 /*
161 * mul20() multiplies two 260-bit integers together. Each word must fit
162 * on 13 bits; source operands use 20 words, destination operand
163 * receives 40 words. All overlaps allowed.
164 *
165 * square20() computes the square of a 260-bit integer. Each word must
166 * fit on 13 bits; source operand uses 20 words, destination operand
167 * receives 40 words. All overlaps allowed.
168 */
169
170 #if BR_SLOW_MUL15
171
172 static void
173 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
174 {
175 /*
176 * Two-level Karatsuba: turns a 20x20 multiplication into
177 * nine 5x5 multiplications. We use 13-bit words but do not
178 * propagate carries immediately, so words may expand:
179 *
180 * - First Karatsuba decomposition turns the 20x20 mul on
181 * 13-bit words into three 10x10 muls, two on 13-bit words
182 * and one on 14-bit words.
183 *
184 * - Second Karatsuba decomposition further splits these into:
185 *
186 * * four 5x5 muls on 13-bit words
187 * * four 5x5 muls on 14-bit words
188 * * one 5x5 mul on 15-bit words
189 *
190 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
191 * or 15-bit words, respectively.
192 */
193 uint32_t u[45], v[45], w[90];
194 uint32_t cc;
195 int i;
196
197 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
198 (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
199 + (s2w)[5 * (s2_off) + 0]; \
200 (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
201 + (s2w)[5 * (s2_off) + 1]; \
202 (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
203 + (s2w)[5 * (s2_off) + 2]; \
204 (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
205 + (s2w)[5 * (s2_off) + 3]; \
206 (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
207 + (s2w)[5 * (s2_off) + 4]; \
208 } while (0)
209
210 #define ZADDT(dw, d_off, sw, s_off) do { \
211 (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
212 (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
213 (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
214 (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
215 (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
216 } while (0)
217
218 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
219 (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
220 + (s2w)[5 * (s2_off) + 0]; \
221 (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
222 + (s2w)[5 * (s2_off) + 1]; \
223 (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
224 + (s2w)[5 * (s2_off) + 2]; \
225 (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
226 + (s2w)[5 * (s2_off) + 3]; \
227 (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
228 + (s2w)[5 * (s2_off) + 4]; \
229 } while (0)
230
231 #define CPR1(w, cprcc) do { \
232 uint32_t cprz = (w) + cprcc; \
233 (w) = cprz & 0x1FFF; \
234 cprcc = cprz >> 13; \
235 } while (0)
236
237 #define CPR(dw, d_off) do { \
238 uint32_t cprcc; \
239 cprcc = 0; \
240 CPR1((dw)[(d_off) + 0], cprcc); \
241 CPR1((dw)[(d_off) + 1], cprcc); \
242 CPR1((dw)[(d_off) + 2], cprcc); \
243 CPR1((dw)[(d_off) + 3], cprcc); \
244 CPR1((dw)[(d_off) + 4], cprcc); \
245 CPR1((dw)[(d_off) + 5], cprcc); \
246 CPR1((dw)[(d_off) + 6], cprcc); \
247 CPR1((dw)[(d_off) + 7], cprcc); \
248 CPR1((dw)[(d_off) + 8], cprcc); \
249 (dw)[(d_off) + 9] = cprcc; \
250 } while (0)
251
252 memcpy(u, a, 20 * sizeof *a);
253 ZADD(u, 4, a, 0, a, 1);
254 ZADD(u, 5, a, 2, a, 3);
255 ZADD(u, 6, a, 0, a, 2);
256 ZADD(u, 7, a, 1, a, 3);
257 ZADD(u, 8, u, 6, u, 7);
258
259 memcpy(v, b, 20 * sizeof *b);
260 ZADD(v, 4, b, 0, b, 1);
261 ZADD(v, 5, b, 2, b, 3);
262 ZADD(v, 6, b, 0, b, 2);
263 ZADD(v, 7, b, 1, b, 3);
264 ZADD(v, 8, v, 6, v, 7);
265
266 /*
267 * Do the eight first 8x8 muls. Source words are at most 16382
268 * each, so we can add product results together "as is" in 32-bit
269 * words.
270 */
271 for (i = 0; i < 40; i += 5) {
272 w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
273 w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
274 + MUL15(u[i + 1], v[i + 0]);
275 w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
276 + MUL15(u[i + 1], v[i + 1])
277 + MUL15(u[i + 2], v[i + 0]);
278 w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
279 + MUL15(u[i + 1], v[i + 2])
280 + MUL15(u[i + 2], v[i + 1])
281 + MUL15(u[i + 3], v[i + 0]);
282 w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
283 + MUL15(u[i + 1], v[i + 3])
284 + MUL15(u[i + 2], v[i + 2])
285 + MUL15(u[i + 3], v[i + 1])
286 + MUL15(u[i + 4], v[i + 0]);
287 w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
288 + MUL15(u[i + 2], v[i + 3])
289 + MUL15(u[i + 3], v[i + 2])
290 + MUL15(u[i + 4], v[i + 1]);
291 w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
292 + MUL15(u[i + 3], v[i + 3])
293 + MUL15(u[i + 4], v[i + 2]);
294 w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
295 + MUL15(u[i + 4], v[i + 3]);
296 w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
297 w[(i << 1) + 9] = 0;
298 }
299
300 /*
301 * For the 9th multiplication, source words are up to 32764,
302 * so we must do some carry propagation. If we add up to
303 * 4 products and the carry is no more than 524224, then the
304 * result fits in 32 bits, and the next carry will be no more
305 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
306 *
307 * We thus just skip one of the products in the middle word,
308 * then do a carry propagation (this reduces words to 13 bits
309 * each, except possibly the last, which may use up to 17 bits
310 * or so), then add the missing product.
311 */
312 w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
313 w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
314 + MUL15(u[40 + 1], v[40 + 0]);
315 w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
316 + MUL15(u[40 + 1], v[40 + 1])
317 + MUL15(u[40 + 2], v[40 + 0]);
318 w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
319 + MUL15(u[40 + 1], v[40 + 2])
320 + MUL15(u[40 + 2], v[40 + 1])
321 + MUL15(u[40 + 3], v[40 + 0]);
322 w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
323 + MUL15(u[40 + 1], v[40 + 3])
324 + MUL15(u[40 + 2], v[40 + 2])
325 + MUL15(u[40 + 3], v[40 + 1]);
326 /* + MUL15(u[40 + 4], v[40 + 0]) */
327 w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
328 + MUL15(u[40 + 2], v[40 + 3])
329 + MUL15(u[40 + 3], v[40 + 2])
330 + MUL15(u[40 + 4], v[40 + 1]);
331 w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
332 + MUL15(u[40 + 3], v[40 + 3])
333 + MUL15(u[40 + 4], v[40 + 2]);
334 w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
335 + MUL15(u[40 + 4], v[40 + 3]);
336 w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
337
338 CPR(w, 80);
339
340 w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
341
342 /*
343 * The products on 14-bit words in slots 6 and 7 yield values
344 * up to 5*(16382^2) each, and we need to subtract two such
345 * values from the higher word. We need the subtraction to fit
346 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
347 * However, 10*(16382^2) does not fit. So we must perform a
348 * bit of reduction here.
349 */
350 CPR(w, 60);
351 CPR(w, 70);
352
353 /*
354 * Recompose results.
355 */
356
357 /* 0..1*0..1 into 0..3 */
358 ZSUB2F(w, 8, w, 0, w, 2);
359 ZSUB2F(w, 9, w, 1, w, 3);
360 ZADDT(w, 1, w, 8);
361 ZADDT(w, 2, w, 9);
362
363 /* 2..3*2..3 into 4..7 */
364 ZSUB2F(w, 10, w, 4, w, 6);
365 ZSUB2F(w, 11, w, 5, w, 7);
366 ZADDT(w, 5, w, 10);
367 ZADDT(w, 6, w, 11);
368
369 /* (0..1+2..3)*(0..1+2..3) into 12..15 */
370 ZSUB2F(w, 16, w, 12, w, 14);
371 ZSUB2F(w, 17, w, 13, w, 15);
372 ZADDT(w, 13, w, 16);
373 ZADDT(w, 14, w, 17);
374
375 /* first-level recomposition */
376 ZSUB2F(w, 12, w, 0, w, 4);
377 ZSUB2F(w, 13, w, 1, w, 5);
378 ZSUB2F(w, 14, w, 2, w, 6);
379 ZSUB2F(w, 15, w, 3, w, 7);
380 ZADDT(w, 2, w, 12);
381 ZADDT(w, 3, w, 13);
382 ZADDT(w, 4, w, 14);
383 ZADDT(w, 5, w, 15);
384
385 /*
386 * Perform carry propagation to bring all words down to 13 bits.
387 */
388 cc = norm13(d, w, 40);
389 d[39] += (cc << 13);
390
391 #undef ZADD
392 #undef ZADDT
393 #undef ZSUB2F
394 #undef CPR1
395 #undef CPR
396 }
397
398 static inline void
399 square20(uint32_t *d, const uint32_t *a)
400 {
401 mul20(d, a, a);
402 }
403
404 #else
405
406 static void
407 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
408 {
409 uint32_t t[39];
410
411 t[ 0] = MUL15(a[ 0], b[ 0]);
412 t[ 1] = MUL15(a[ 0], b[ 1])
413 + MUL15(a[ 1], b[ 0]);
414 t[ 2] = MUL15(a[ 0], b[ 2])
415 + MUL15(a[ 1], b[ 1])
416 + MUL15(a[ 2], b[ 0]);
417 t[ 3] = MUL15(a[ 0], b[ 3])
418 + MUL15(a[ 1], b[ 2])
419 + MUL15(a[ 2], b[ 1])
420 + MUL15(a[ 3], b[ 0]);
421 t[ 4] = MUL15(a[ 0], b[ 4])
422 + MUL15(a[ 1], b[ 3])
423 + MUL15(a[ 2], b[ 2])
424 + MUL15(a[ 3], b[ 1])
425 + MUL15(a[ 4], b[ 0]);
426 t[ 5] = MUL15(a[ 0], b[ 5])
427 + MUL15(a[ 1], b[ 4])
428 + MUL15(a[ 2], b[ 3])
429 + MUL15(a[ 3], b[ 2])
430 + MUL15(a[ 4], b[ 1])
431 + MUL15(a[ 5], b[ 0]);
432 t[ 6] = MUL15(a[ 0], b[ 6])
433 + MUL15(a[ 1], b[ 5])
434 + MUL15(a[ 2], b[ 4])
435 + MUL15(a[ 3], b[ 3])
436 + MUL15(a[ 4], b[ 2])
437 + MUL15(a[ 5], b[ 1])
438 + MUL15(a[ 6], b[ 0]);
439 t[ 7] = MUL15(a[ 0], b[ 7])
440 + MUL15(a[ 1], b[ 6])
441 + MUL15(a[ 2], b[ 5])
442 + MUL15(a[ 3], b[ 4])
443 + MUL15(a[ 4], b[ 3])
444 + MUL15(a[ 5], b[ 2])
445 + MUL15(a[ 6], b[ 1])
446 + MUL15(a[ 7], b[ 0]);
447 t[ 8] = MUL15(a[ 0], b[ 8])
448 + MUL15(a[ 1], b[ 7])
449 + MUL15(a[ 2], b[ 6])
450 + MUL15(a[ 3], b[ 5])
451 + MUL15(a[ 4], b[ 4])
452 + MUL15(a[ 5], b[ 3])
453 + MUL15(a[ 6], b[ 2])
454 + MUL15(a[ 7], b[ 1])
455 + MUL15(a[ 8], b[ 0]);
456 t[ 9] = MUL15(a[ 0], b[ 9])
457 + MUL15(a[ 1], b[ 8])
458 + MUL15(a[ 2], b[ 7])
459 + MUL15(a[ 3], b[ 6])
460 + MUL15(a[ 4], b[ 5])
461 + MUL15(a[ 5], b[ 4])
462 + MUL15(a[ 6], b[ 3])
463 + MUL15(a[ 7], b[ 2])
464 + MUL15(a[ 8], b[ 1])
465 + MUL15(a[ 9], b[ 0]);
466 t[10] = MUL15(a[ 0], b[10])
467 + MUL15(a[ 1], b[ 9])
468 + MUL15(a[ 2], b[ 8])
469 + MUL15(a[ 3], b[ 7])
470 + MUL15(a[ 4], b[ 6])
471 + MUL15(a[ 5], b[ 5])
472 + MUL15(a[ 6], b[ 4])
473 + MUL15(a[ 7], b[ 3])
474 + MUL15(a[ 8], b[ 2])
475 + MUL15(a[ 9], b[ 1])
476 + MUL15(a[10], b[ 0]);
477 t[11] = MUL15(a[ 0], b[11])
478 + MUL15(a[ 1], b[10])
479 + MUL15(a[ 2], b[ 9])
480 + MUL15(a[ 3], b[ 8])
481 + MUL15(a[ 4], b[ 7])
482 + MUL15(a[ 5], b[ 6])
483 + MUL15(a[ 6], b[ 5])
484 + MUL15(a[ 7], b[ 4])
485 + MUL15(a[ 8], b[ 3])
486 + MUL15(a[ 9], b[ 2])
487 + MUL15(a[10], b[ 1])
488 + MUL15(a[11], b[ 0]);
489 t[12] = MUL15(a[ 0], b[12])
490 + MUL15(a[ 1], b[11])
491 + MUL15(a[ 2], b[10])
492 + MUL15(a[ 3], b[ 9])
493 + MUL15(a[ 4], b[ 8])
494 + MUL15(a[ 5], b[ 7])
495 + MUL15(a[ 6], b[ 6])
496 + MUL15(a[ 7], b[ 5])
497 + MUL15(a[ 8], b[ 4])
498 + MUL15(a[ 9], b[ 3])
499 + MUL15(a[10], b[ 2])
500 + MUL15(a[11], b[ 1])
501 + MUL15(a[12], b[ 0]);
502 t[13] = MUL15(a[ 0], b[13])
503 + MUL15(a[ 1], b[12])
504 + MUL15(a[ 2], b[11])
505 + MUL15(a[ 3], b[10])
506 + MUL15(a[ 4], b[ 9])
507 + MUL15(a[ 5], b[ 8])
508 + MUL15(a[ 6], b[ 7])
509 + MUL15(a[ 7], b[ 6])
510 + MUL15(a[ 8], b[ 5])
511 + MUL15(a[ 9], b[ 4])
512 + MUL15(a[10], b[ 3])
513 + MUL15(a[11], b[ 2])
514 + MUL15(a[12], b[ 1])
515 + MUL15(a[13], b[ 0]);
516 t[14] = MUL15(a[ 0], b[14])
517 + MUL15(a[ 1], b[13])
518 + MUL15(a[ 2], b[12])
519 + MUL15(a[ 3], b[11])
520 + MUL15(a[ 4], b[10])
521 + MUL15(a[ 5], b[ 9])
522 + MUL15(a[ 6], b[ 8])
523 + MUL15(a[ 7], b[ 7])
524 + MUL15(a[ 8], b[ 6])
525 + MUL15(a[ 9], b[ 5])
526 + MUL15(a[10], b[ 4])
527 + MUL15(a[11], b[ 3])
528 + MUL15(a[12], b[ 2])
529 + MUL15(a[13], b[ 1])
530 + MUL15(a[14], b[ 0]);
531 t[15] = MUL15(a[ 0], b[15])
532 + MUL15(a[ 1], b[14])
533 + MUL15(a[ 2], b[13])
534 + MUL15(a[ 3], b[12])
535 + MUL15(a[ 4], b[11])
536 + MUL15(a[ 5], b[10])
537 + MUL15(a[ 6], b[ 9])
538 + MUL15(a[ 7], b[ 8])
539 + MUL15(a[ 8], b[ 7])
540 + MUL15(a[ 9], b[ 6])
541 + MUL15(a[10], b[ 5])
542 + MUL15(a[11], b[ 4])
543 + MUL15(a[12], b[ 3])
544 + MUL15(a[13], b[ 2])
545 + MUL15(a[14], b[ 1])
546 + MUL15(a[15], b[ 0]);
547 t[16] = MUL15(a[ 0], b[16])
548 + MUL15(a[ 1], b[15])
549 + MUL15(a[ 2], b[14])
550 + MUL15(a[ 3], b[13])
551 + MUL15(a[ 4], b[12])
552 + MUL15(a[ 5], b[11])
553 + MUL15(a[ 6], b[10])
554 + MUL15(a[ 7], b[ 9])
555 + MUL15(a[ 8], b[ 8])
556 + MUL15(a[ 9], b[ 7])
557 + MUL15(a[10], b[ 6])
558 + MUL15(a[11], b[ 5])
559 + MUL15(a[12], b[ 4])
560 + MUL15(a[13], b[ 3])
561 + MUL15(a[14], b[ 2])
562 + MUL15(a[15], b[ 1])
563 + MUL15(a[16], b[ 0]);
564 t[17] = MUL15(a[ 0], b[17])
565 + MUL15(a[ 1], b[16])
566 + MUL15(a[ 2], b[15])
567 + MUL15(a[ 3], b[14])
568 + MUL15(a[ 4], b[13])
569 + MUL15(a[ 5], b[12])
570 + MUL15(a[ 6], b[11])
571 + MUL15(a[ 7], b[10])
572 + MUL15(a[ 8], b[ 9])
573 + MUL15(a[ 9], b[ 8])
574 + MUL15(a[10], b[ 7])
575 + MUL15(a[11], b[ 6])
576 + MUL15(a[12], b[ 5])
577 + MUL15(a[13], b[ 4])
578 + MUL15(a[14], b[ 3])
579 + MUL15(a[15], b[ 2])
580 + MUL15(a[16], b[ 1])
581 + MUL15(a[17], b[ 0]);
582 t[18] = MUL15(a[ 0], b[18])
583 + MUL15(a[ 1], b[17])
584 + MUL15(a[ 2], b[16])
585 + MUL15(a[ 3], b[15])
586 + MUL15(a[ 4], b[14])
587 + MUL15(a[ 5], b[13])
588 + MUL15(a[ 6], b[12])
589 + MUL15(a[ 7], b[11])
590 + MUL15(a[ 8], b[10])
591 + MUL15(a[ 9], b[ 9])
592 + MUL15(a[10], b[ 8])
593 + MUL15(a[11], b[ 7])
594 + MUL15(a[12], b[ 6])
595 + MUL15(a[13], b[ 5])
596 + MUL15(a[14], b[ 4])
597 + MUL15(a[15], b[ 3])
598 + MUL15(a[16], b[ 2])
599 + MUL15(a[17], b[ 1])
600 + MUL15(a[18], b[ 0]);
601 t[19] = MUL15(a[ 0], b[19])
602 + MUL15(a[ 1], b[18])
603 + MUL15(a[ 2], b[17])
604 + MUL15(a[ 3], b[16])
605 + MUL15(a[ 4], b[15])
606 + MUL15(a[ 5], b[14])
607 + MUL15(a[ 6], b[13])
608 + MUL15(a[ 7], b[12])
609 + MUL15(a[ 8], b[11])
610 + MUL15(a[ 9], b[10])
611 + MUL15(a[10], b[ 9])
612 + MUL15(a[11], b[ 8])
613 + MUL15(a[12], b[ 7])
614 + MUL15(a[13], b[ 6])
615 + MUL15(a[14], b[ 5])
616 + MUL15(a[15], b[ 4])
617 + MUL15(a[16], b[ 3])
618 + MUL15(a[17], b[ 2])
619 + MUL15(a[18], b[ 1])
620 + MUL15(a[19], b[ 0]);
621 t[20] = MUL15(a[ 1], b[19])
622 + MUL15(a[ 2], b[18])
623 + MUL15(a[ 3], b[17])
624 + MUL15(a[ 4], b[16])
625 + MUL15(a[ 5], b[15])
626 + MUL15(a[ 6], b[14])
627 + MUL15(a[ 7], b[13])
628 + MUL15(a[ 8], b[12])
629 + MUL15(a[ 9], b[11])
630 + MUL15(a[10], b[10])
631 + MUL15(a[11], b[ 9])
632 + MUL15(a[12], b[ 8])
633 + MUL15(a[13], b[ 7])
634 + MUL15(a[14], b[ 6])
635 + MUL15(a[15], b[ 5])
636 + MUL15(a[16], b[ 4])
637 + MUL15(a[17], b[ 3])
638 + MUL15(a[18], b[ 2])
639 + MUL15(a[19], b[ 1]);
640 t[21] = MUL15(a[ 2], b[19])
641 + MUL15(a[ 3], b[18])
642 + MUL15(a[ 4], b[17])
643 + MUL15(a[ 5], b[16])
644 + MUL15(a[ 6], b[15])
645 + MUL15(a[ 7], b[14])
646 + MUL15(a[ 8], b[13])
647 + MUL15(a[ 9], b[12])
648 + MUL15(a[10], b[11])
649 + MUL15(a[11], b[10])
650 + MUL15(a[12], b[ 9])
651 + MUL15(a[13], b[ 8])
652 + MUL15(a[14], b[ 7])
653 + MUL15(a[15], b[ 6])
654 + MUL15(a[16], b[ 5])
655 + MUL15(a[17], b[ 4])
656 + MUL15(a[18], b[ 3])
657 + MUL15(a[19], b[ 2]);
658 t[22] = MUL15(a[ 3], b[19])
659 + MUL15(a[ 4], b[18])
660 + MUL15(a[ 5], b[17])
661 + MUL15(a[ 6], b[16])
662 + MUL15(a[ 7], b[15])
663 + MUL15(a[ 8], b[14])
664 + MUL15(a[ 9], b[13])
665 + MUL15(a[10], b[12])
666 + MUL15(a[11], b[11])
667 + MUL15(a[12], b[10])
668 + MUL15(a[13], b[ 9])
669 + MUL15(a[14], b[ 8])
670 + MUL15(a[15], b[ 7])
671 + MUL15(a[16], b[ 6])
672 + MUL15(a[17], b[ 5])
673 + MUL15(a[18], b[ 4])
674 + MUL15(a[19], b[ 3]);
675 t[23] = MUL15(a[ 4], b[19])
676 + MUL15(a[ 5], b[18])
677 + MUL15(a[ 6], b[17])
678 + MUL15(a[ 7], b[16])
679 + MUL15(a[ 8], b[15])
680 + MUL15(a[ 9], b[14])
681 + MUL15(a[10], b[13])
682 + MUL15(a[11], b[12])
683 + MUL15(a[12], b[11])
684 + MUL15(a[13], b[10])
685 + MUL15(a[14], b[ 9])
686 + MUL15(a[15], b[ 8])
687 + MUL15(a[16], b[ 7])
688 + MUL15(a[17], b[ 6])
689 + MUL15(a[18], b[ 5])
690 + MUL15(a[19], b[ 4]);
691 t[24] = MUL15(a[ 5], b[19])
692 + MUL15(a[ 6], b[18])
693 + MUL15(a[ 7], b[17])
694 + MUL15(a[ 8], b[16])
695 + MUL15(a[ 9], b[15])
696 + MUL15(a[10], b[14])
697 + MUL15(a[11], b[13])
698 + MUL15(a[12], b[12])
699 + MUL15(a[13], b[11])
700 + MUL15(a[14], b[10])
701 + MUL15(a[15], b[ 9])
702 + MUL15(a[16], b[ 8])
703 + MUL15(a[17], b[ 7])
704 + MUL15(a[18], b[ 6])
705 + MUL15(a[19], b[ 5]);
706 t[25] = MUL15(a[ 6], b[19])
707 + MUL15(a[ 7], b[18])
708 + MUL15(a[ 8], b[17])
709 + MUL15(a[ 9], b[16])
710 + MUL15(a[10], b[15])
711 + MUL15(a[11], b[14])
712 + MUL15(a[12], b[13])
713 + MUL15(a[13], b[12])
714 + MUL15(a[14], b[11])
715 + MUL15(a[15], b[10])
716 + MUL15(a[16], b[ 9])
717 + MUL15(a[17], b[ 8])
718 + MUL15(a[18], b[ 7])
719 + MUL15(a[19], b[ 6]);
720 t[26] = MUL15(a[ 7], b[19])
721 + MUL15(a[ 8], b[18])
722 + MUL15(a[ 9], b[17])
723 + MUL15(a[10], b[16])
724 + MUL15(a[11], b[15])
725 + MUL15(a[12], b[14])
726 + MUL15(a[13], b[13])
727 + MUL15(a[14], b[12])
728 + MUL15(a[15], b[11])
729 + MUL15(a[16], b[10])
730 + MUL15(a[17], b[ 9])
731 + MUL15(a[18], b[ 8])
732 + MUL15(a[19], b[ 7]);
733 t[27] = MUL15(a[ 8], b[19])
734 + MUL15(a[ 9], b[18])
735 + MUL15(a[10], b[17])
736 + MUL15(a[11], b[16])
737 + MUL15(a[12], b[15])
738 + MUL15(a[13], b[14])
739 + MUL15(a[14], b[13])
740 + MUL15(a[15], b[12])
741 + MUL15(a[16], b[11])
742 + MUL15(a[17], b[10])
743 + MUL15(a[18], b[ 9])
744 + MUL15(a[19], b[ 8]);
745 t[28] = MUL15(a[ 9], b[19])
746 + MUL15(a[10], b[18])
747 + MUL15(a[11], b[17])
748 + MUL15(a[12], b[16])
749 + MUL15(a[13], b[15])
750 + MUL15(a[14], b[14])
751 + MUL15(a[15], b[13])
752 + MUL15(a[16], b[12])
753 + MUL15(a[17], b[11])
754 + MUL15(a[18], b[10])
755 + MUL15(a[19], b[ 9]);
756 t[29] = MUL15(a[10], b[19])
757 + MUL15(a[11], b[18])
758 + MUL15(a[12], b[17])
759 + MUL15(a[13], b[16])
760 + MUL15(a[14], b[15])
761 + MUL15(a[15], b[14])
762 + MUL15(a[16], b[13])
763 + MUL15(a[17], b[12])
764 + MUL15(a[18], b[11])
765 + MUL15(a[19], b[10]);
766 t[30] = MUL15(a[11], b[19])
767 + MUL15(a[12], b[18])
768 + MUL15(a[13], b[17])
769 + MUL15(a[14], b[16])
770 + MUL15(a[15], b[15])
771 + MUL15(a[16], b[14])
772 + MUL15(a[17], b[13])
773 + MUL15(a[18], b[12])
774 + MUL15(a[19], b[11]);
775 t[31] = MUL15(a[12], b[19])
776 + MUL15(a[13], b[18])
777 + MUL15(a[14], b[17])
778 + MUL15(a[15], b[16])
779 + MUL15(a[16], b[15])
780 + MUL15(a[17], b[14])
781 + MUL15(a[18], b[13])
782 + MUL15(a[19], b[12]);
783 t[32] = MUL15(a[13], b[19])
784 + MUL15(a[14], b[18])
785 + MUL15(a[15], b[17])
786 + MUL15(a[16], b[16])
787 + MUL15(a[17], b[15])
788 + MUL15(a[18], b[14])
789 + MUL15(a[19], b[13]);
790 t[33] = MUL15(a[14], b[19])
791 + MUL15(a[15], b[18])
792 + MUL15(a[16], b[17])
793 + MUL15(a[17], b[16])
794 + MUL15(a[18], b[15])
795 + MUL15(a[19], b[14]);
796 t[34] = MUL15(a[15], b[19])
797 + MUL15(a[16], b[18])
798 + MUL15(a[17], b[17])
799 + MUL15(a[18], b[16])
800 + MUL15(a[19], b[15]);
801 t[35] = MUL15(a[16], b[19])
802 + MUL15(a[17], b[18])
803 + MUL15(a[18], b[17])
804 + MUL15(a[19], b[16]);
805 t[36] = MUL15(a[17], b[19])
806 + MUL15(a[18], b[18])
807 + MUL15(a[19], b[17]);
808 t[37] = MUL15(a[18], b[19])
809 + MUL15(a[19], b[18]);
810 t[38] = MUL15(a[19], b[19]);
811 d[39] = norm13(d, t, 39);
812 }
813
814 static void
815 square20(uint32_t *d, const uint32_t *a)
816 {
817 uint32_t t[39];
818
819 t[ 0] = MUL15(a[ 0], a[ 0]);
820 t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
821 t[ 2] = MUL15(a[ 1], a[ 1])
822 + ((MUL15(a[ 0], a[ 2])) << 1);
823 t[ 3] = ((MUL15(a[ 0], a[ 3])
824 + MUL15(a[ 1], a[ 2])) << 1);
825 t[ 4] = MUL15(a[ 2], a[ 2])
826 + ((MUL15(a[ 0], a[ 4])
827 + MUL15(a[ 1], a[ 3])) << 1);
828 t[ 5] = ((MUL15(a[ 0], a[ 5])
829 + MUL15(a[ 1], a[ 4])
830 + MUL15(a[ 2], a[ 3])) << 1);
831 t[ 6] = MUL15(a[ 3], a[ 3])
832 + ((MUL15(a[ 0], a[ 6])
833 + MUL15(a[ 1], a[ 5])
834 + MUL15(a[ 2], a[ 4])) << 1);
835 t[ 7] = ((MUL15(a[ 0], a[ 7])
836 + MUL15(a[ 1], a[ 6])
837 + MUL15(a[ 2], a[ 5])
838 + MUL15(a[ 3], a[ 4])) << 1);
839 t[ 8] = MUL15(a[ 4], a[ 4])
840 + ((MUL15(a[ 0], a[ 8])
841 + MUL15(a[ 1], a[ 7])
842 + MUL15(a[ 2], a[ 6])
843 + MUL15(a[ 3], a[ 5])) << 1);
844 t[ 9] = ((MUL15(a[ 0], a[ 9])
845 + MUL15(a[ 1], a[ 8])
846 + MUL15(a[ 2], a[ 7])
847 + MUL15(a[ 3], a[ 6])
848 + MUL15(a[ 4], a[ 5])) << 1);
849 t[10] = MUL15(a[ 5], a[ 5])
850 + ((MUL15(a[ 0], a[10])
851 + MUL15(a[ 1], a[ 9])
852 + MUL15(a[ 2], a[ 8])
853 + MUL15(a[ 3], a[ 7])
854 + MUL15(a[ 4], a[ 6])) << 1);
855 t[11] = ((MUL15(a[ 0], a[11])
856 + MUL15(a[ 1], a[10])
857 + MUL15(a[ 2], a[ 9])
858 + MUL15(a[ 3], a[ 8])
859 + MUL15(a[ 4], a[ 7])
860 + MUL15(a[ 5], a[ 6])) << 1);
861 t[12] = MUL15(a[ 6], a[ 6])
862 + ((MUL15(a[ 0], a[12])
863 + MUL15(a[ 1], a[11])
864 + MUL15(a[ 2], a[10])
865 + MUL15(a[ 3], a[ 9])
866 + MUL15(a[ 4], a[ 8])
867 + MUL15(a[ 5], a[ 7])) << 1);
868 t[13] = ((MUL15(a[ 0], a[13])
869 + MUL15(a[ 1], a[12])
870 + MUL15(a[ 2], a[11])
871 + MUL15(a[ 3], a[10])
872 + MUL15(a[ 4], a[ 9])
873 + MUL15(a[ 5], a[ 8])
874 + MUL15(a[ 6], a[ 7])) << 1);
875 t[14] = MUL15(a[ 7], a[ 7])
876 + ((MUL15(a[ 0], a[14])
877 + MUL15(a[ 1], a[13])
878 + MUL15(a[ 2], a[12])
879 + MUL15(a[ 3], a[11])
880 + MUL15(a[ 4], a[10])
881 + MUL15(a[ 5], a[ 9])
882 + MUL15(a[ 6], a[ 8])) << 1);
883 t[15] = ((MUL15(a[ 0], a[15])
884 + MUL15(a[ 1], a[14])
885 + MUL15(a[ 2], a[13])
886 + MUL15(a[ 3], a[12])
887 + MUL15(a[ 4], a[11])
888 + MUL15(a[ 5], a[10])
889 + MUL15(a[ 6], a[ 9])
890 + MUL15(a[ 7], a[ 8])) << 1);
891 t[16] = MUL15(a[ 8], a[ 8])
892 + ((MUL15(a[ 0], a[16])
893 + MUL15(a[ 1], a[15])
894 + MUL15(a[ 2], a[14])
895 + MUL15(a[ 3], a[13])
896 + MUL15(a[ 4], a[12])
897 + MUL15(a[ 5], a[11])
898 + MUL15(a[ 6], a[10])
899 + MUL15(a[ 7], a[ 9])) << 1);
900 t[17] = ((MUL15(a[ 0], a[17])
901 + MUL15(a[ 1], a[16])
902 + MUL15(a[ 2], a[15])
903 + MUL15(a[ 3], a[14])
904 + MUL15(a[ 4], a[13])
905 + MUL15(a[ 5], a[12])
906 + MUL15(a[ 6], a[11])
907 + MUL15(a[ 7], a[10])
908 + MUL15(a[ 8], a[ 9])) << 1);
909 t[18] = MUL15(a[ 9], a[ 9])
910 + ((MUL15(a[ 0], a[18])
911 + MUL15(a[ 1], a[17])
912 + MUL15(a[ 2], a[16])
913 + MUL15(a[ 3], a[15])
914 + MUL15(a[ 4], a[14])
915 + MUL15(a[ 5], a[13])
916 + MUL15(a[ 6], a[12])
917 + MUL15(a[ 7], a[11])
918 + MUL15(a[ 8], a[10])) << 1);
919 t[19] = ((MUL15(a[ 0], a[19])
920 + MUL15(a[ 1], a[18])
921 + MUL15(a[ 2], a[17])
922 + MUL15(a[ 3], a[16])
923 + MUL15(a[ 4], a[15])
924 + MUL15(a[ 5], a[14])
925 + MUL15(a[ 6], a[13])
926 + MUL15(a[ 7], a[12])
927 + MUL15(a[ 8], a[11])
928 + MUL15(a[ 9], a[10])) << 1);
929 t[20] = MUL15(a[10], a[10])
930 + ((MUL15(a[ 1], a[19])
931 + MUL15(a[ 2], a[18])
932 + MUL15(a[ 3], a[17])
933 + MUL15(a[ 4], a[16])
934 + MUL15(a[ 5], a[15])
935 + MUL15(a[ 6], a[14])
936 + MUL15(a[ 7], a[13])
937 + MUL15(a[ 8], a[12])
938 + MUL15(a[ 9], a[11])) << 1);
939 t[21] = ((MUL15(a[ 2], a[19])
940 + MUL15(a[ 3], a[18])
941 + MUL15(a[ 4], a[17])
942 + MUL15(a[ 5], a[16])
943 + MUL15(a[ 6], a[15])
944 + MUL15(a[ 7], a[14])
945 + MUL15(a[ 8], a[13])
946 + MUL15(a[ 9], a[12])
947 + MUL15(a[10], a[11])) << 1);
948 t[22] = MUL15(a[11], a[11])
949 + ((MUL15(a[ 3], a[19])
950 + MUL15(a[ 4], a[18])
951 + MUL15(a[ 5], a[17])
952 + MUL15(a[ 6], a[16])
953 + MUL15(a[ 7], a[15])
954 + MUL15(a[ 8], a[14])
955 + MUL15(a[ 9], a[13])
956 + MUL15(a[10], a[12])) << 1);
957 t[23] = ((MUL15(a[ 4], a[19])
958 + MUL15(a[ 5], a[18])
959 + MUL15(a[ 6], a[17])
960 + MUL15(a[ 7], a[16])
961 + MUL15(a[ 8], a[15])
962 + MUL15(a[ 9], a[14])
963 + MUL15(a[10], a[13])
964 + MUL15(a[11], a[12])) << 1);
965 t[24] = MUL15(a[12], a[12])
966 + ((MUL15(a[ 5], a[19])
967 + MUL15(a[ 6], a[18])
968 + MUL15(a[ 7], a[17])
969 + MUL15(a[ 8], a[16])
970 + MUL15(a[ 9], a[15])
971 + MUL15(a[10], a[14])
972 + MUL15(a[11], a[13])) << 1);
973 t[25] = ((MUL15(a[ 6], a[19])
974 + MUL15(a[ 7], a[18])
975 + MUL15(a[ 8], a[17])
976 + MUL15(a[ 9], a[16])
977 + MUL15(a[10], a[15])
978 + MUL15(a[11], a[14])
979 + MUL15(a[12], a[13])) << 1);
980 t[26] = MUL15(a[13], a[13])
981 + ((MUL15(a[ 7], a[19])
982 + MUL15(a[ 8], a[18])
983 + MUL15(a[ 9], a[17])
984 + MUL15(a[10], a[16])
985 + MUL15(a[11], a[15])
986 + MUL15(a[12], a[14])) << 1);
987 t[27] = ((MUL15(a[ 8], a[19])
988 + MUL15(a[ 9], a[18])
989 + MUL15(a[10], a[17])
990 + MUL15(a[11], a[16])
991 + MUL15(a[12], a[15])
992 + MUL15(a[13], a[14])) << 1);
993 t[28] = MUL15(a[14], a[14])
994 + ((MUL15(a[ 9], a[19])
995 + MUL15(a[10], a[18])
996 + MUL15(a[11], a[17])
997 + MUL15(a[12], a[16])
998 + MUL15(a[13], a[15])) << 1);
999 t[29] = ((MUL15(a[10], a[19])
1000 + MUL15(a[11], a[18])
1001 + MUL15(a[12], a[17])
1002 + MUL15(a[13], a[16])
1003 + MUL15(a[14], a[15])) << 1);
1004 t[30] = MUL15(a[15], a[15])
1005 + ((MUL15(a[11], a[19])
1006 + MUL15(a[12], a[18])
1007 + MUL15(a[13], a[17])
1008 + MUL15(a[14], a[16])) << 1);
1009 t[31] = ((MUL15(a[12], a[19])
1010 + MUL15(a[13], a[18])
1011 + MUL15(a[14], a[17])
1012 + MUL15(a[15], a[16])) << 1);
1013 t[32] = MUL15(a[16], a[16])
1014 + ((MUL15(a[13], a[19])
1015 + MUL15(a[14], a[18])
1016 + MUL15(a[15], a[17])) << 1);
1017 t[33] = ((MUL15(a[14], a[19])
1018 + MUL15(a[15], a[18])
1019 + MUL15(a[16], a[17])) << 1);
1020 t[34] = MUL15(a[17], a[17])
1021 + ((MUL15(a[15], a[19])
1022 + MUL15(a[16], a[18])) << 1);
1023 t[35] = ((MUL15(a[16], a[19])
1024 + MUL15(a[17], a[18])) << 1);
1025 t[36] = MUL15(a[18], a[18])
1026 + ((MUL15(a[17], a[19])) << 1);
1027 t[37] = ((MUL15(a[18], a[19])) << 1);
1028 t[38] = MUL15(a[19], a[19]);
1029 d[39] = norm13(d, t, 39);
1030 }
1031
1032 #endif
1033
1034 /*
1035 * Perform a "final reduction" in field F255 (field for Curve25519)
1036 * The source value must be less than twice the modulus. If the value
1037 * is not lower than the modulus, then the modulus is subtracted and
1038 * this function returns 1; otherwise, it leaves it untouched and it
1039 * returns 0.
1040 */
1041 static uint32_t
1042 reduce_final_f255(uint32_t *d)
1043 {
1044 uint32_t t[20];
1045 uint32_t cc;
1046 int i;
1047
1048 memcpy(t, d, sizeof t);
1049 cc = 19;
1050 for (i = 0; i < 20; i ++) {
1051 uint32_t w;
1052
1053 w = t[i] + cc;
1054 cc = w >> 13;
1055 t[i] = w & 0x1FFF;
1056 }
1057 cc = t[19] >> 8;
1058 t[19] &= 0xFF;
1059 CCOPY(cc, d, t, sizeof t);
1060 return cc;
1061 }
1062
1063 /*
1064 * Perform a multiplication of two integers modulo 2^255-19.
1065 * Operands are arrays of 20 words, each containing 13 bits of data, in
1066 * little-endian order. Input value may be up to 2^256-1; on output, value
1067 * fits on 256 bits and is lower than twice the modulus.
1068 */
1069 static void
1070 f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
1071 {
1072 uint32_t t[40], cc, w;
1073 int i;
1074
1075 /*
1076 * Compute raw multiplication. All result words fit in 13 bits
1077 * each; upper word (t[39]) must fit on 5 bits, since the product
1078 * of two 256-bit integers must fit on 512 bits.
1079 */
1080 mul20(t, a, b);
1081
1082 /*
1083 * Modular reduction: each high word is added where necessary.
1084 * Since the modulus is 2^255-19 and word 20 corresponds to
1085 * offset 20*13 = 260, word 20+k must be added to word k with
1086 * a factor of 19*2^5 = 608. The extra bits in word 19 are also
1087 * added that way.
1088 */
1089 cc = MUL15(t[19] >> 8, 19);
1090 t[19] &= 0xFF;
1091 for (i = 0; i < 20; i ++) {
1092 w = t[i] + cc + MUL15(t[i + 20], 608);
1093 t[i] = w & 0x1FFF;
1094 cc = w >> 13;
1095 }
1096 cc = MUL15(w >> 8, 19);
1097 t[19] &= 0xFF;
1098 for (i = 0; i < 20; i ++) {
1099 w = t[i] + cc;
1100 d[i] = w & 0x1FFF;
1101 cc = w >> 13;
1102 }
1103 }
1104
1105 /*
1106 * Square an integer modulo 2^255-19.
1107 * Operand is an array of 20 words, each containing 13 bits of data, in
1108 * little-endian order. Input value may be up to 2^256-1; on output, value
1109 * fits on 256 bits and is lower than twice the modulus.
1110 */
1111 static void
1112 f255_square(uint32_t *d, const uint32_t *a)
1113 {
1114 uint32_t t[40], cc, w;
1115 int i;
1116
1117 /*
1118 * Compute raw multiplication. All result words fit in 13 bits
1119 * each; upper word (t[39]) must fit on 5 bits, since the product
1120 * of two 256-bit integers must fit on 512 bits.
1121 */
1122 square20(t, a);
1123
1124 /*
1125 * Modular reduction: each high word is added where necessary.
1126 * Since the modulus is 2^255-19 and word 20 corresponds to
1127 * offset 20*13 = 260, word 20+k must be added to word k with
1128 * a factor of 19*2^5 = 608. The extra bits in word 19 are also
1129 * added that way.
1130 */
1131 cc = MUL15(t[19] >> 8, 19);
1132 t[19] &= 0xFF;
1133 for (i = 0; i < 20; i ++) {
1134 w = t[i] + cc + MUL15(t[i + 20], 608);
1135 t[i] = w & 0x1FFF;
1136 cc = w >> 13;
1137 }
1138 cc = MUL15(w >> 8, 19);
1139 t[19] &= 0xFF;
1140 for (i = 0; i < 20; i ++) {
1141 w = t[i] + cc;
1142 d[i] = w & 0x1FFF;
1143 cc = w >> 13;
1144 }
1145 }
1146
1147 /*
1148 * Add two values in F255. Partial reduction is performed (down to less
1149 * than twice the modulus).
1150 */
1151 static void
1152 f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
1153 {
1154 int i;
1155 uint32_t cc, w;
1156
1157 cc = 0;
1158 for (i = 0; i < 20; i ++) {
1159 w = a[i] + b[i] + cc;
1160 d[i] = w & 0x1FFF;
1161 cc = w >> 13;
1162 }
1163 cc = MUL15(w >> 8, 19);
1164 d[19] &= 0xFF;
1165 for (i = 0; i < 20; i ++) {
1166 w = d[i] + cc;
1167 d[i] = w & 0x1FFF;
1168 cc = w >> 13;
1169 }
1170 }
1171
1172 /*
1173 * Subtract one value from another in F255. Partial reduction is
1174 * performed (down to less than twice the modulus).
1175 */
1176 static void
1177 f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
1178 {
1179 /*
1180 * We actually compute a - b + 2*p, so that the final value is
1181 * necessarily positive.
1182 */
1183 int i;
1184 uint32_t cc, w;
1185
1186 cc = (uint32_t)-38;
1187 for (i = 0; i < 20; i ++) {
1188 w = a[i] - b[i] + cc;
1189 d[i] = w & 0x1FFF;
1190 cc = ARSH(w, 13);
1191 }
1192 cc = MUL15((w + 0x200) >> 8, 19);
1193 d[19] &= 0xFF;
1194 for (i = 0; i < 20; i ++) {
1195 w = d[i] + cc;
1196 d[i] = w & 0x1FFF;
1197 cc = w >> 13;
1198 }
1199 }
1200
1201 /*
1202 * Multiply an integer by the 'A24' constant (121665). Partial reduction
1203 * is performed (down to less than twice the modulus).
1204 */
1205 static void
1206 f255_mul_a24(uint32_t *d, const uint32_t *a)
1207 {
1208 int i;
1209 uint32_t cc, w;
1210
1211 cc = 0;
1212 for (i = 0; i < 20; i ++) {
1213 w = MUL15(a[i], 121665) + cc;
1214 d[i] = w & 0x1FFF;
1215 cc = w >> 13;
1216 }
1217 cc = MUL15(w >> 8, 19);
1218 d[19] &= 0xFF;
1219 for (i = 0; i < 20; i ++) {
1220 w = d[i] + cc;
1221 d[i] = w & 0x1FFF;
1222 cc = w >> 13;
1223 }
1224 }
1225
1226 static const unsigned char GEN[] = {
1227 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1228 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1229 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1230 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1231 };
1232
1233 static const unsigned char ORDER[] = {
1234 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1235 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1236 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1237 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
1238 };
1239
1240 static const unsigned char *
1241 api_generator(int curve, size_t *len)
1242 {
1243 (void)curve;
1244 *len = 32;
1245 return GEN;
1246 }
1247
1248 static const unsigned char *
1249 api_order(int curve, size_t *len)
1250 {
1251 (void)curve;
1252 *len = 32;
1253 return ORDER;
1254 }
1255
1256 static size_t
1257 api_xoff(int curve, size_t *len)
1258 {
1259 (void)curve;
1260 *len = 32;
1261 return 0;
1262 }
1263
1264 static void
1265 cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
1266 {
1267 int i;
1268
1269 ctl = -ctl;
1270 for (i = 0; i < 20; i ++) {
1271 uint32_t aw, bw, tw;
1272
1273 aw = a[i];
1274 bw = b[i];
1275 tw = ctl & (aw ^ bw);
1276 a[i] = aw ^ tw;
1277 b[i] = bw ^ tw;
1278 }
1279 }
1280
1281 static uint32_t
1282 api_mul(unsigned char *G, size_t Glen,
1283 const unsigned char *kb, size_t kblen, int curve)
1284 {
1285 uint32_t x1[20], x2[20], x3[20], z2[20], z3[20];
1286 uint32_t a[20], aa[20], b[20], bb[20];
1287 uint32_t c[20], d[20], e[20], da[20], cb[20];
1288 unsigned char k[32];
1289 uint32_t swap;
1290 int i;
1291
1292 (void)curve;
1293
1294 /*
1295 * Points are encoded over exactly 32 bytes. Multipliers must fit
1296 * in 32 bytes as well.
1297 * RFC 7748 mandates that the high bit of the last point byte must
1298 * be ignored/cleared.
1299 */
1300 if (Glen != 32 || kblen > 32) {
1301 return 0;
1302 }
1303 G[31] &= 0x7F;
1304
1305 /*
1306 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
1307 * into Montgomery representation.
1308 */
1309 x1[19] = le8_to_le13(x1, G, 32);
1310 memcpy(x3, x1, sizeof x1);
1311 memset(z2, 0, sizeof z2);
1312 memset(x2, 0, sizeof x2);
1313 x2[0] = 1;
1314 memset(z3, 0, sizeof z3);
1315 z3[0] = 1;
1316
1317 memcpy(k, kb, kblen);
1318 memset(k + kblen, 0, (sizeof k) - kblen);
1319 k[0] &= 0xF8;
1320 k[31] &= 0x7F;
1321 k[31] |= 0x40;
1322
1323 /* obsolete
1324 print_int("x1", x1);
1325 */
1326
1327 swap = 0;
1328 for (i = 254; i >= 0; i --) {
1329 uint32_t kt;
1330
1331 kt = (k[i >> 3] >> (i & 7)) & 1;
1332 swap ^= kt;
1333 cswap(x2, x3, swap);
1334 cswap(z2, z3, swap);
1335 swap = kt;
1336
1337 /* obsolete
1338 print_int("x2", x2);
1339 print_int("z2", z2);
1340 print_int("x3", x3);
1341 print_int("z3", z3);
1342 */
1343
1344 f255_add(a, x2, z2);
1345 f255_square(aa, a);
1346 f255_sub(b, x2, z2);
1347 f255_square(bb, b);
1348 f255_sub(e, aa, bb);
1349 f255_add(c, x3, z3);
1350 f255_sub(d, x3, z3);
1351 f255_mul(da, d, a);
1352 f255_mul(cb, c, b);
1353
1354 /* obsolete
1355 print_int("a ", a);
1356 print_int("aa", aa);
1357 print_int("b ", b);
1358 print_int("bb", bb);
1359 print_int("e ", e);
1360 print_int("c ", c);
1361 print_int("d ", d);
1362 print_int("da", da);
1363 print_int("cb", cb);
1364 */
1365
1366 f255_add(x3, da, cb);
1367 f255_square(x3, x3);
1368 f255_sub(z3, da, cb);
1369 f255_square(z3, z3);
1370 f255_mul(z3, z3, x1);
1371 f255_mul(x2, aa, bb);
1372 f255_mul_a24(z2, e);
1373 f255_add(z2, z2, aa);
1374 f255_mul(z2, e, z2);
1375
1376 /* obsolete
1377 print_int("x2", x2);
1378 print_int("z2", z2);
1379 print_int("x3", x3);
1380 print_int("z3", z3);
1381 */
1382 }
1383 cswap(x2, x3, swap);
1384 cswap(z2, z3, swap);
1385
1386 /*
1387 * Inverse z2 with a modular exponentiation. This is a simple
1388 * square-and-multiply algorithm; we mutualise most non-squarings
1389 * since the exponent contains almost only ones.
1390 */
1391 memcpy(a, z2, sizeof z2);
1392 for (i = 0; i < 15; i ++) {
1393 f255_square(a, a);
1394 f255_mul(a, a, z2);
1395 }
1396 memcpy(b, a, sizeof a);
1397 for (i = 0; i < 14; i ++) {
1398 int j;
1399
1400 for (j = 0; j < 16; j ++) {
1401 f255_square(b, b);
1402 }
1403 f255_mul(b, b, a);
1404 }
1405 for (i = 14; i >= 0; i --) {
1406 f255_square(b, b);
1407 if ((0xFFEB >> i) & 1) {
1408 f255_mul(b, z2, b);
1409 }
1410 }
1411 f255_mul(x2, x2, b);
1412 reduce_final_f255(x2);
1413 le13_to_le8(G, 32, x2);
1414 return 1;
1415 }
1416
1417 static size_t
1418 api_mulgen(unsigned char *R,
1419 const unsigned char *x, size_t xlen, int curve)
1420 {
1421 const unsigned char *G;
1422 size_t Glen;
1423
1424 G = api_generator(curve, &Glen);
1425 memcpy(R, G, Glen);
1426 api_mul(R, Glen, x, xlen, curve);
1427 return Glen;
1428 }
1429
1430 static uint32_t
1431 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1432 const unsigned char *x, size_t xlen,
1433 const unsigned char *y, size_t ylen, int curve)
1434 {
1435 /*
1436 * We don't implement this method, since it is used for ECDSA
1437 * only, and there is no ECDSA over Curve25519 (which instead
1438 * uses EdDSA).
1439 */
1440 (void)A;
1441 (void)B;
1442 (void)len;
1443 (void)x;
1444 (void)xlen;
1445 (void)y;
1446 (void)ylen;
1447 (void)curve;
1448 return 0;
1449 }
1450
1451 /* see bearssl_ec.h */
1452 const br_ec_impl br_ec_c25519_m15 = {
1453 (uint32_t)0x20000000,
1454 &api_generator,
1455 &api_order,
1456 &api_xoff,
1457 &api_mul,
1458 &api_mulgen,
1459 &api_muladd
1460 };