Added support code for RSA and EC key encoding (including reconstruction of all publi...
[BearSSL] / src / ec / ec_c25519_m31.c
1 /*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "inner.h"
26
27 /* obsolete
28 #include <stdio.h>
29 #include <stdlib.h>
30 static void
31 print_int(const char *name, const uint32_t *x)
32 {
33 size_t u;
34 unsigned char tmp[40];
35
36 printf("%s = ", name);
37 for (u = 0; u < 9; u ++) {
38 if (x[u] > 0x3FFFFFFF) {
39 printf("INVALID:");
40 for (u = 0; u < 9; u ++) {
41 printf(" %08X", x[u]);
42 }
43 printf("\n");
44 return;
45 }
46 }
47 memset(tmp, 0, sizeof tmp);
48 for (u = 0; u < 9; u ++) {
49 uint64_t w;
50 int j, k;
51
52 w = x[u];
53 j = 30 * (int)u;
54 k = j & 7;
55 if (k != 0) {
56 w <<= k;
57 j -= k;
58 }
59 k = j >> 3;
60 for (j = 0; j < 8; j ++) {
61 tmp[39 - k - j] |= (unsigned char)w;
62 w >>= 8;
63 }
64 }
65 for (u = 8; u < 40; u ++) {
66 printf("%02X", tmp[u]);
67 }
68 printf("\n");
69 }
70 */
71
72 /*
73 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
74 * that right-shifting a signed negative integer copies the sign bit
75 * (arithmetic right-shift). This is "implementation-defined behaviour",
76 * i.e. it is not undefined, but it may differ between compilers. Each
77 * compiler is supposed to document its behaviour in that respect. GCC
78 * explicitly defines that an arithmetic right shift is used. We expect
79 * all other compilers to do the same, because underlying CPU offer an
80 * arithmetic right shift opcode that could not be used otherwise.
81 */
82 #if BR_NO_ARITH_SHIFT
83 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \
84 | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
85 #else
86 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
87 #endif
88
89 /*
90 * Convert an integer from unsigned little-endian encoding to a sequence of
91 * 30-bit words in little-endian order. The final "partial" word is
92 * returned.
93 */
94 static uint32_t
95 le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
96 {
97 uint32_t acc;
98 int acc_len;
99
100 acc = 0;
101 acc_len = 0;
102 while (len -- > 0) {
103 uint32_t b;
104
105 b = *src ++;
106 if (acc_len < 22) {
107 acc |= b << acc_len;
108 acc_len += 8;
109 } else {
110 *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
111 acc = b >> (30 - acc_len);
112 acc_len -= 22;
113 }
114 }
115 return acc;
116 }
117
118 /*
119 * Convert an integer (30-bit words, little-endian) to unsigned
120 * little-endian encoding. The total encoding length is provided; all
121 * the destination bytes will be filled.
122 */
123 static void
124 le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
125 {
126 uint32_t acc;
127 int acc_len;
128
129 acc = 0;
130 acc_len = 0;
131 while (len -- > 0) {
132 if (acc_len < 8) {
133 uint32_t w;
134
135 w = *src ++;
136 *dst ++ = (unsigned char)(acc | (w << acc_len));
137 acc = w >> (8 - acc_len);
138 acc_len += 22;
139 } else {
140 *dst ++ = (unsigned char)acc;
141 acc >>= 8;
142 acc_len -= 8;
143 }
144 }
145 }
146
147 /*
148 * Multiply two integers. Source integers are represented as arrays of
149 * nine 30-bit words, for values up to 2^270-1. Result is encoded over
150 * 18 words of 30 bits each.
151 */
152 static void
153 mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
154 {
155 /*
156 * Maximum intermediate result is no more than
157 * 10376293531797946367, which fits in 64 bits. Reason:
158 *
159 * 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
160 * 10376293531797946367 < 9663676407 * 2^30
161 *
162 * Thus, adding together 9 products of 30-bit integers, with
163 * a carry of at most 9663676406, yields an integer that fits
164 * on 64 bits and generates a carry of at most 9663676406.
165 */
166 uint64_t t[17];
167 uint64_t cc;
168 int i;
169
170 t[ 0] = MUL31(a[0], b[0]);
171 t[ 1] = MUL31(a[0], b[1])
172 + MUL31(a[1], b[0]);
173 t[ 2] = MUL31(a[0], b[2])
174 + MUL31(a[1], b[1])
175 + MUL31(a[2], b[0]);
176 t[ 3] = MUL31(a[0], b[3])
177 + MUL31(a[1], b[2])
178 + MUL31(a[2], b[1])
179 + MUL31(a[3], b[0]);
180 t[ 4] = MUL31(a[0], b[4])
181 + MUL31(a[1], b[3])
182 + MUL31(a[2], b[2])
183 + MUL31(a[3], b[1])
184 + MUL31(a[4], b[0]);
185 t[ 5] = MUL31(a[0], b[5])
186 + MUL31(a[1], b[4])
187 + MUL31(a[2], b[3])
188 + MUL31(a[3], b[2])
189 + MUL31(a[4], b[1])
190 + MUL31(a[5], b[0]);
191 t[ 6] = MUL31(a[0], b[6])
192 + MUL31(a[1], b[5])
193 + MUL31(a[2], b[4])
194 + MUL31(a[3], b[3])
195 + MUL31(a[4], b[2])
196 + MUL31(a[5], b[1])
197 + MUL31(a[6], b[0]);
198 t[ 7] = MUL31(a[0], b[7])
199 + MUL31(a[1], b[6])
200 + MUL31(a[2], b[5])
201 + MUL31(a[3], b[4])
202 + MUL31(a[4], b[3])
203 + MUL31(a[5], b[2])
204 + MUL31(a[6], b[1])
205 + MUL31(a[7], b[0]);
206 t[ 8] = MUL31(a[0], b[8])
207 + MUL31(a[1], b[7])
208 + MUL31(a[2], b[6])
209 + MUL31(a[3], b[5])
210 + MUL31(a[4], b[4])
211 + MUL31(a[5], b[3])
212 + MUL31(a[6], b[2])
213 + MUL31(a[7], b[1])
214 + MUL31(a[8], b[0]);
215 t[ 9] = MUL31(a[1], b[8])
216 + MUL31(a[2], b[7])
217 + MUL31(a[3], b[6])
218 + MUL31(a[4], b[5])
219 + MUL31(a[5], b[4])
220 + MUL31(a[6], b[3])
221 + MUL31(a[7], b[2])
222 + MUL31(a[8], b[1]);
223 t[10] = MUL31(a[2], b[8])
224 + MUL31(a[3], b[7])
225 + MUL31(a[4], b[6])
226 + MUL31(a[5], b[5])
227 + MUL31(a[6], b[4])
228 + MUL31(a[7], b[3])
229 + MUL31(a[8], b[2]);
230 t[11] = MUL31(a[3], b[8])
231 + MUL31(a[4], b[7])
232 + MUL31(a[5], b[6])
233 + MUL31(a[6], b[5])
234 + MUL31(a[7], b[4])
235 + MUL31(a[8], b[3]);
236 t[12] = MUL31(a[4], b[8])
237 + MUL31(a[5], b[7])
238 + MUL31(a[6], b[6])
239 + MUL31(a[7], b[5])
240 + MUL31(a[8], b[4]);
241 t[13] = MUL31(a[5], b[8])
242 + MUL31(a[6], b[7])
243 + MUL31(a[7], b[6])
244 + MUL31(a[8], b[5]);
245 t[14] = MUL31(a[6], b[8])
246 + MUL31(a[7], b[7])
247 + MUL31(a[8], b[6]);
248 t[15] = MUL31(a[7], b[8])
249 + MUL31(a[8], b[7]);
250 t[16] = MUL31(a[8], b[8]);
251
252 /*
253 * Propagate carries.
254 */
255 cc = 0;
256 for (i = 0; i < 17; i ++) {
257 uint64_t w;
258
259 w = t[i] + cc;
260 d[i] = (uint32_t)w & 0x3FFFFFFF;
261 cc = w >> 30;
262 }
263 d[17] = (uint32_t)cc;
264 }
265
266 /*
267 * Square a 270-bit integer, represented as an array of nine 30-bit words.
268 * Result uses 18 words of 30 bits each.
269 */
270 static void
271 square9(uint32_t *d, const uint32_t *a)
272 {
273 uint64_t t[17];
274 uint64_t cc;
275 int i;
276
277 t[ 0] = MUL31(a[0], a[0]);
278 t[ 1] = ((MUL31(a[0], a[1])) << 1);
279 t[ 2] = MUL31(a[1], a[1])
280 + ((MUL31(a[0], a[2])) << 1);
281 t[ 3] = ((MUL31(a[0], a[3])
282 + MUL31(a[1], a[2])) << 1);
283 t[ 4] = MUL31(a[2], a[2])
284 + ((MUL31(a[0], a[4])
285 + MUL31(a[1], a[3])) << 1);
286 t[ 5] = ((MUL31(a[0], a[5])
287 + MUL31(a[1], a[4])
288 + MUL31(a[2], a[3])) << 1);
289 t[ 6] = MUL31(a[3], a[3])
290 + ((MUL31(a[0], a[6])
291 + MUL31(a[1], a[5])
292 + MUL31(a[2], a[4])) << 1);
293 t[ 7] = ((MUL31(a[0], a[7])
294 + MUL31(a[1], a[6])
295 + MUL31(a[2], a[5])
296 + MUL31(a[3], a[4])) << 1);
297 t[ 8] = MUL31(a[4], a[4])
298 + ((MUL31(a[0], a[8])
299 + MUL31(a[1], a[7])
300 + MUL31(a[2], a[6])
301 + MUL31(a[3], a[5])) << 1);
302 t[ 9] = ((MUL31(a[1], a[8])
303 + MUL31(a[2], a[7])
304 + MUL31(a[3], a[6])
305 + MUL31(a[4], a[5])) << 1);
306 t[10] = MUL31(a[5], a[5])
307 + ((MUL31(a[2], a[8])
308 + MUL31(a[3], a[7])
309 + MUL31(a[4], a[6])) << 1);
310 t[11] = ((MUL31(a[3], a[8])
311 + MUL31(a[4], a[7])
312 + MUL31(a[5], a[6])) << 1);
313 t[12] = MUL31(a[6], a[6])
314 + ((MUL31(a[4], a[8])
315 + MUL31(a[5], a[7])) << 1);
316 t[13] = ((MUL31(a[5], a[8])
317 + MUL31(a[6], a[7])) << 1);
318 t[14] = MUL31(a[7], a[7])
319 + ((MUL31(a[6], a[8])) << 1);
320 t[15] = ((MUL31(a[7], a[8])) << 1);
321 t[16] = MUL31(a[8], a[8]);
322
323 /*
324 * Propagate carries.
325 */
326 cc = 0;
327 for (i = 0; i < 17; i ++) {
328 uint64_t w;
329
330 w = t[i] + cc;
331 d[i] = (uint32_t)w & 0x3FFFFFFF;
332 cc = w >> 30;
333 }
334 d[17] = (uint32_t)cc;
335 }
336
337 /*
338 * Perform a "final reduction" in field F255 (field for Curve25519)
339 * The source value must be less than twice the modulus. If the value
340 * is not lower than the modulus, then the modulus is subtracted and
341 * this function returns 1; otherwise, it leaves it untouched and it
342 * returns 0.
343 */
344 static uint32_t
345 reduce_final_f255(uint32_t *d)
346 {
347 uint32_t t[9];
348 uint32_t cc;
349 int i;
350
351 memcpy(t, d, sizeof t);
352 cc = 19;
353 for (i = 0; i < 9; i ++) {
354 uint32_t w;
355
356 w = t[i] + cc;
357 cc = w >> 30;
358 t[i] = w & 0x3FFFFFFF;
359 }
360 cc = t[8] >> 15;
361 t[8] &= 0x7FFF;
362 CCOPY(cc, d, t, sizeof t);
363 return cc;
364 }
365
366 /*
367 * Perform a multiplication of two integers modulo 2^255-19.
368 * Operands are arrays of 9 words, each containing 30 bits of data, in
369 * little-endian order. Input value may be up to 2^256-1; on output, value
370 * fits on 256 bits and is lower than twice the modulus.
371 */
372 static void
373 f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
374 {
375 uint32_t t[18];
376 uint64_t cc, w;
377 int i;
378
379 /*
380 * Compute raw multiplication. All result words fit in 30 bits
381 * each; upper word (t[17]) must fit on 2 bits, since the product
382 * of two 256-bit integers must fit on 512 bits.
383 */
384 mul9(t, a, b);
385
386 /*
387 * Modular reduction: each high word is added where necessary.
388 * Since the modulus is 2^255-19 and word 9 corresponds to
389 * offset 9*30 = 270, word 9+k must be added to word k with
390 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
391 * added that way.
392 */
393 cc = MUL31(t[8] >> 15, 19);
394 t[8] &= 0x7FFF;
395 for (i = 0; i < 9; i ++) {
396 w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
397 t[i] = (uint32_t)w & 0x3FFFFFFF;
398 cc = w >> 30;
399 }
400 cc = MUL31(w >> 15, 19);
401 t[8] &= 0x7FFF;
402 for (i = 0; i < 9; i ++) {
403 w = t[i] + cc;
404 d[i] = (uint32_t)w & 0x3FFFFFFF;
405 cc = w >> 30;
406 }
407 }
408
409 /*
410 * Perform a squaring of an integer modulo 2^255-19.
411 * Operands are arrays of 9 words, each containing 30 bits of data, in
412 * little-endian order. Input value may be up to 2^256-1; on output, value
413 * fits on 256 bits and is lower than twice the modulus.
414 */
415 static void
416 f255_square(uint32_t *d, const uint32_t *a)
417 {
418 uint32_t t[18];
419 uint64_t cc, w;
420 int i;
421
422 /*
423 * Compute raw squaring. All result words fit in 30 bits
424 * each; upper word (t[17]) must fit on 2 bits, since the square
425 * of a 256-bit integers must fit on 512 bits.
426 */
427 square9(t, a);
428
429 /*
430 * Modular reduction: each high word is added where necessary.
431 * Since the modulus is 2^255-19 and word 9 corresponds to
432 * offset 9*30 = 270, word 9+k must be added to word k with
433 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
434 * added that way.
435 */
436 cc = MUL31(t[8] >> 15, 19);
437 t[8] &= 0x7FFF;
438 for (i = 0; i < 9; i ++) {
439 w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
440 t[i] = (uint32_t)w & 0x3FFFFFFF;
441 cc = w >> 30;
442 }
443 cc = MUL31(w >> 15, 19);
444 t[8] &= 0x7FFF;
445 for (i = 0; i < 9; i ++) {
446 w = t[i] + cc;
447 d[i] = (uint32_t)w & 0x3FFFFFFF;
448 cc = w >> 30;
449 }
450 }
451
452 /*
453 * Add two values in F255. Partial reduction is performed (down to less
454 * than twice the modulus).
455 */
456 static void
457 f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
458 {
459 /*
460 * Since operand words fit on 30 bits, we can use 32-bit
461 * variables throughout.
462 */
463 int i;
464 uint32_t cc, w;
465
466 cc = 0;
467 for (i = 0; i < 9; i ++) {
468 w = a[i] + b[i] + cc;
469 d[i] = w & 0x3FFFFFFF;
470 cc = w >> 30;
471 }
472 cc = MUL15(w >> 15, 19);
473 d[8] &= 0x7FFF;
474 for (i = 0; i < 9; i ++) {
475 w = d[i] + cc;
476 d[i] = w & 0x3FFFFFFF;
477 cc = w >> 30;
478 }
479 }
480
481 /*
482 * Subtract one value from another in F255. Partial reduction is
483 * performed (down to less than twice the modulus).
484 */
485 static void
486 f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
487 {
488 /*
489 * We actually compute a - b + 2*p, so that the final value is
490 * necessarily positive.
491 */
492 int i;
493 uint32_t cc, w;
494
495 cc = (uint32_t)-38;
496 for (i = 0; i < 9; i ++) {
497 w = a[i] - b[i] + cc;
498 d[i] = w & 0x3FFFFFFF;
499 cc = ARSH(w, 30);
500 }
501 cc = MUL15((w + 0x10000) >> 15, 19);
502 d[8] &= 0x7FFF;
503 for (i = 0; i < 9; i ++) {
504 w = d[i] + cc;
505 d[i] = w & 0x3FFFFFFF;
506 cc = w >> 30;
507 }
508 }
509
510 /*
511 * Multiply an integer by the 'A24' constant (121665). Partial reduction
512 * is performed (down to less than twice the modulus).
513 */
514 static void
515 f255_mul_a24(uint32_t *d, const uint32_t *a)
516 {
517 int i;
518 uint64_t cc, w;
519
520 cc = 0;
521 for (i = 0; i < 9; i ++) {
522 w = MUL31(a[i], 121665) + cc;
523 d[i] = (uint32_t)w & 0x3FFFFFFF;
524 cc = w >> 30;
525 }
526 cc = MUL31((uint32_t)(w >> 15), 19);
527 d[8] &= 0x7FFF;
528 for (i = 0; i < 9; i ++) {
529 w = (uint64_t)d[i] + cc;
530 d[i] = w & 0x3FFFFFFF;
531 cc = w >> 30;
532 }
533 }
534
535 static const unsigned char GEN[] = {
536 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
537 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
538 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
539 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
540 };
541
542 static const unsigned char ORDER[] = {
543 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
544 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
545 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
546 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
547 };
548
549 static const unsigned char *
550 api_generator(int curve, size_t *len)
551 {
552 (void)curve;
553 *len = 32;
554 return GEN;
555 }
556
557 static const unsigned char *
558 api_order(int curve, size_t *len)
559 {
560 (void)curve;
561 *len = 32;
562 return ORDER;
563 }
564
565 static size_t
566 api_xoff(int curve, size_t *len)
567 {
568 (void)curve;
569 *len = 32;
570 return 0;
571 }
572
573 static void
574 cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
575 {
576 int i;
577
578 ctl = -ctl;
579 for (i = 0; i < 9; i ++) {
580 uint32_t aw, bw, tw;
581
582 aw = a[i];
583 bw = b[i];
584 tw = ctl & (aw ^ bw);
585 a[i] = aw ^ tw;
586 b[i] = bw ^ tw;
587 }
588 }
589
590 static uint32_t
591 api_mul(unsigned char *G, size_t Glen,
592 const unsigned char *kb, size_t kblen, int curve)
593 {
594 uint32_t x1[9], x2[9], x3[9], z2[9], z3[9];
595 uint32_t a[9], aa[9], b[9], bb[9];
596 uint32_t c[9], d[9], e[9], da[9], cb[9];
597 unsigned char k[32];
598 uint32_t swap;
599 int i;
600
601 (void)curve;
602
603 /*
604 * Points are encoded over exactly 32 bytes. Multipliers must fit
605 * in 32 bytes as well.
606 * RFC 7748 mandates that the high bit of the last point byte must
607 * be ignored/cleared.
608 */
609 if (Glen != 32 || kblen > 32) {
610 return 0;
611 }
612 G[31] &= 0x7F;
613
614 /*
615 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
616 * into Montgomery representation.
617 */
618 x1[8] = le8_to_le30(x1, G, 32);
619 memcpy(x3, x1, sizeof x1);
620 memset(z2, 0, sizeof z2);
621 memset(x2, 0, sizeof x2);
622 x2[0] = 1;
623 memset(z3, 0, sizeof z3);
624 z3[0] = 1;
625
626 memcpy(k, kb, kblen);
627 memset(k + kblen, 0, (sizeof k) - kblen);
628 k[0] &= 0xF8;
629 k[31] &= 0x7F;
630 k[31] |= 0x40;
631
632 /* obsolete
633 print_int("x1", x1);
634 */
635
636 swap = 0;
637 for (i = 254; i >= 0; i --) {
638 uint32_t kt;
639
640 kt = (k[i >> 3] >> (i & 7)) & 1;
641 swap ^= kt;
642 cswap(x2, x3, swap);
643 cswap(z2, z3, swap);
644 swap = kt;
645
646 /* obsolete
647 print_int("x2", x2);
648 print_int("z2", z2);
649 print_int("x3", x3);
650 print_int("z3", z3);
651 */
652
653 f255_add(a, x2, z2);
654 f255_square(aa, a);
655 f255_sub(b, x2, z2);
656 f255_square(bb, b);
657 f255_sub(e, aa, bb);
658 f255_add(c, x3, z3);
659 f255_sub(d, x3, z3);
660 f255_mul(da, d, a);
661 f255_mul(cb, c, b);
662
663 /* obsolete
664 print_int("a ", a);
665 print_int("aa", aa);
666 print_int("b ", b);
667 print_int("bb", bb);
668 print_int("e ", e);
669 print_int("c ", c);
670 print_int("d ", d);
671 print_int("da", da);
672 print_int("cb", cb);
673 */
674
675 f255_add(x3, da, cb);
676 f255_square(x3, x3);
677 f255_sub(z3, da, cb);
678 f255_square(z3, z3);
679 f255_mul(z3, z3, x1);
680 f255_mul(x2, aa, bb);
681 f255_mul_a24(z2, e);
682 f255_add(z2, z2, aa);
683 f255_mul(z2, e, z2);
684
685 /* obsolete
686 print_int("x2", x2);
687 print_int("z2", z2);
688 print_int("x3", x3);
689 print_int("z3", z3);
690 */
691 }
692 cswap(x2, x3, swap);
693 cswap(z2, z3, swap);
694
695 /*
696 * Inverse z2 with a modular exponentiation. This is a simple
697 * square-and-multiply algorithm; we mutualise most non-squarings
698 * since the exponent contains almost only ones.
699 */
700 memcpy(a, z2, sizeof z2);
701 for (i = 0; i < 15; i ++) {
702 f255_square(a, a);
703 f255_mul(a, a, z2);
704 }
705 memcpy(b, a, sizeof a);
706 for (i = 0; i < 14; i ++) {
707 int j;
708
709 for (j = 0; j < 16; j ++) {
710 f255_square(b, b);
711 }
712 f255_mul(b, b, a);
713 }
714 for (i = 14; i >= 0; i --) {
715 f255_square(b, b);
716 if ((0xFFEB >> i) & 1) {
717 f255_mul(b, z2, b);
718 }
719 }
720 f255_mul(x2, x2, b);
721 reduce_final_f255(x2);
722 le30_to_le8(G, 32, x2);
723 return 1;
724 }
725
726 static size_t
727 api_mulgen(unsigned char *R,
728 const unsigned char *x, size_t xlen, int curve)
729 {
730 const unsigned char *G;
731 size_t Glen;
732
733 G = api_generator(curve, &Glen);
734 memcpy(R, G, Glen);
735 api_mul(R, Glen, x, xlen, curve);
736 return Glen;
737 }
738
739 static uint32_t
740 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
741 const unsigned char *x, size_t xlen,
742 const unsigned char *y, size_t ylen, int curve)
743 {
744 /*
745 * We don't implement this method, since it is used for ECDSA
746 * only, and there is no ECDSA over Curve25519 (which instead
747 * uses EdDSA).
748 */
749 (void)A;
750 (void)B;
751 (void)len;
752 (void)x;
753 (void)xlen;
754 (void)y;
755 (void)ylen;
756 (void)curve;
757 return 0;
758 }
759
760 /* see bearssl_ec.h */
761 const br_ec_impl br_ec_c25519_m31 = {
762 (uint32_t)0x20000000,
763 &api_generator,
764 &api_order,
765 &api_xoff,
766 &api_mul,
767 &api_mulgen,
768 &api_muladd
769 };