Fixed mishandling of tree structure in the cache for session parameters.
[BearSSL] / src / ec / ec_p256_m31.c
1 /*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "inner.h"
26
27 /*
28 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29 * that right-shifting a signed negative integer copies the sign bit
30 * (arithmetic right-shift). This is "implementation-defined behaviour",
31 * i.e. it is not undefined, but it may differ between compilers. Each
32 * compiler is supposed to document its behaviour in that respect. GCC
33 * explicitly defines that an arithmetic right shift is used. We expect
34 * all other compilers to do the same, because underlying CPU offer an
35 * arithmetic right shift opcode that could not be used otherwise.
36 */
37 #if BR_NO_ARITH_SHIFT
38 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \
39 | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40 #define ARSHW(x, n) (((uint64_t)(x) >> (n)) \
41 | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
42 #else
43 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
44 #define ARSHW(x, n) ((*(int64_t *)&(x)) >> (n))
45 #endif
46
47 /*
48 * Convert an integer from unsigned big-endian encoding to a sequence of
49 * 30-bit words in little-endian order. The final "partial" word is
50 * returned.
51 */
52 static uint32_t
53 be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
54 {
55 uint32_t acc;
56 int acc_len;
57
58 acc = 0;
59 acc_len = 0;
60 while (len -- > 0) {
61 uint32_t b;
62
63 b = src[len];
64 if (acc_len < 22) {
65 acc |= b << acc_len;
66 acc_len += 8;
67 } else {
68 *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
69 acc = b >> (30 - acc_len);
70 acc_len -= 22;
71 }
72 }
73 return acc;
74 }
75
76 /*
77 * Convert an integer (30-bit words, little-endian) to unsigned
78 * big-endian encoding. The total encoding length is provided; all
79 * the destination bytes will be filled.
80 */
81 static void
82 le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
83 {
84 uint32_t acc;
85 int acc_len;
86
87 acc = 0;
88 acc_len = 0;
89 while (len -- > 0) {
90 if (acc_len < 8) {
91 uint32_t w;
92
93 w = *src ++;
94 dst[len] = (unsigned char)(acc | (w << acc_len));
95 acc = w >> (8 - acc_len);
96 acc_len += 22;
97 } else {
98 dst[len] = (unsigned char)acc;
99 acc >>= 8;
100 acc_len -= 8;
101 }
102 }
103 }
104
105 /*
106 * Multiply two integers. Source integers are represented as arrays of
107 * nine 30-bit words, for values up to 2^270-1. Result is encoded over
108 * 18 words of 30 bits each.
109 */
110 static void
111 mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
112 {
113 /*
114 * Maximum intermediate result is no more than
115 * 10376293531797946367, which fits in 64 bits. Reason:
116 *
117 * 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
118 * 10376293531797946367 < 9663676407 * 2^30
119 *
120 * Thus, adding together 9 products of 30-bit integers, with
121 * a carry of at most 9663676406, yields an integer that fits
122 * on 64 bits and generates a carry of at most 9663676406.
123 */
124 uint64_t t[17];
125 uint64_t cc;
126 int i;
127
128 t[ 0] = MUL31(a[0], b[0]);
129 t[ 1] = MUL31(a[0], b[1])
130 + MUL31(a[1], b[0]);
131 t[ 2] = MUL31(a[0], b[2])
132 + MUL31(a[1], b[1])
133 + MUL31(a[2], b[0]);
134 t[ 3] = MUL31(a[0], b[3])
135 + MUL31(a[1], b[2])
136 + MUL31(a[2], b[1])
137 + MUL31(a[3], b[0]);
138 t[ 4] = MUL31(a[0], b[4])
139 + MUL31(a[1], b[3])
140 + MUL31(a[2], b[2])
141 + MUL31(a[3], b[1])
142 + MUL31(a[4], b[0]);
143 t[ 5] = MUL31(a[0], b[5])
144 + MUL31(a[1], b[4])
145 + MUL31(a[2], b[3])
146 + MUL31(a[3], b[2])
147 + MUL31(a[4], b[1])
148 + MUL31(a[5], b[0]);
149 t[ 6] = MUL31(a[0], b[6])
150 + MUL31(a[1], b[5])
151 + MUL31(a[2], b[4])
152 + MUL31(a[3], b[3])
153 + MUL31(a[4], b[2])
154 + MUL31(a[5], b[1])
155 + MUL31(a[6], b[0]);
156 t[ 7] = MUL31(a[0], b[7])
157 + MUL31(a[1], b[6])
158 + MUL31(a[2], b[5])
159 + MUL31(a[3], b[4])
160 + MUL31(a[4], b[3])
161 + MUL31(a[5], b[2])
162 + MUL31(a[6], b[1])
163 + MUL31(a[7], b[0]);
164 t[ 8] = MUL31(a[0], b[8])
165 + MUL31(a[1], b[7])
166 + MUL31(a[2], b[6])
167 + MUL31(a[3], b[5])
168 + MUL31(a[4], b[4])
169 + MUL31(a[5], b[3])
170 + MUL31(a[6], b[2])
171 + MUL31(a[7], b[1])
172 + MUL31(a[8], b[0]);
173 t[ 9] = MUL31(a[1], b[8])
174 + MUL31(a[2], b[7])
175 + MUL31(a[3], b[6])
176 + MUL31(a[4], b[5])
177 + MUL31(a[5], b[4])
178 + MUL31(a[6], b[3])
179 + MUL31(a[7], b[2])
180 + MUL31(a[8], b[1]);
181 t[10] = MUL31(a[2], b[8])
182 + MUL31(a[3], b[7])
183 + MUL31(a[4], b[6])
184 + MUL31(a[5], b[5])
185 + MUL31(a[6], b[4])
186 + MUL31(a[7], b[3])
187 + MUL31(a[8], b[2]);
188 t[11] = MUL31(a[3], b[8])
189 + MUL31(a[4], b[7])
190 + MUL31(a[5], b[6])
191 + MUL31(a[6], b[5])
192 + MUL31(a[7], b[4])
193 + MUL31(a[8], b[3]);
194 t[12] = MUL31(a[4], b[8])
195 + MUL31(a[5], b[7])
196 + MUL31(a[6], b[6])
197 + MUL31(a[7], b[5])
198 + MUL31(a[8], b[4]);
199 t[13] = MUL31(a[5], b[8])
200 + MUL31(a[6], b[7])
201 + MUL31(a[7], b[6])
202 + MUL31(a[8], b[5]);
203 t[14] = MUL31(a[6], b[8])
204 + MUL31(a[7], b[7])
205 + MUL31(a[8], b[6]);
206 t[15] = MUL31(a[7], b[8])
207 + MUL31(a[8], b[7]);
208 t[16] = MUL31(a[8], b[8]);
209
210 /*
211 * Propagate carries.
212 */
213 cc = 0;
214 for (i = 0; i < 17; i ++) {
215 uint64_t w;
216
217 w = t[i] + cc;
218 d[i] = (uint32_t)w & 0x3FFFFFFF;
219 cc = w >> 30;
220 }
221 d[17] = (uint32_t)cc;
222 }
223
224 /*
225 * Square a 270-bit integer, represented as an array of nine 30-bit words.
226 * Result uses 18 words of 30 bits each.
227 */
228 static void
229 square9(uint32_t *d, const uint32_t *a)
230 {
231 uint64_t t[17];
232 uint64_t cc;
233 int i;
234
235 t[ 0] = MUL31(a[0], a[0]);
236 t[ 1] = ((MUL31(a[0], a[1])) << 1);
237 t[ 2] = MUL31(a[1], a[1])
238 + ((MUL31(a[0], a[2])) << 1);
239 t[ 3] = ((MUL31(a[0], a[3])
240 + MUL31(a[1], a[2])) << 1);
241 t[ 4] = MUL31(a[2], a[2])
242 + ((MUL31(a[0], a[4])
243 + MUL31(a[1], a[3])) << 1);
244 t[ 5] = ((MUL31(a[0], a[5])
245 + MUL31(a[1], a[4])
246 + MUL31(a[2], a[3])) << 1);
247 t[ 6] = MUL31(a[3], a[3])
248 + ((MUL31(a[0], a[6])
249 + MUL31(a[1], a[5])
250 + MUL31(a[2], a[4])) << 1);
251 t[ 7] = ((MUL31(a[0], a[7])
252 + MUL31(a[1], a[6])
253 + MUL31(a[2], a[5])
254 + MUL31(a[3], a[4])) << 1);
255 t[ 8] = MUL31(a[4], a[4])
256 + ((MUL31(a[0], a[8])
257 + MUL31(a[1], a[7])
258 + MUL31(a[2], a[6])
259 + MUL31(a[3], a[5])) << 1);
260 t[ 9] = ((MUL31(a[1], a[8])
261 + MUL31(a[2], a[7])
262 + MUL31(a[3], a[6])
263 + MUL31(a[4], a[5])) << 1);
264 t[10] = MUL31(a[5], a[5])
265 + ((MUL31(a[2], a[8])
266 + MUL31(a[3], a[7])
267 + MUL31(a[4], a[6])) << 1);
268 t[11] = ((MUL31(a[3], a[8])
269 + MUL31(a[4], a[7])
270 + MUL31(a[5], a[6])) << 1);
271 t[12] = MUL31(a[6], a[6])
272 + ((MUL31(a[4], a[8])
273 + MUL31(a[5], a[7])) << 1);
274 t[13] = ((MUL31(a[5], a[8])
275 + MUL31(a[6], a[7])) << 1);
276 t[14] = MUL31(a[7], a[7])
277 + ((MUL31(a[6], a[8])) << 1);
278 t[15] = ((MUL31(a[7], a[8])) << 1);
279 t[16] = MUL31(a[8], a[8]);
280
281 /*
282 * Propagate carries.
283 */
284 cc = 0;
285 for (i = 0; i < 17; i ++) {
286 uint64_t w;
287
288 w = t[i] + cc;
289 d[i] = (uint32_t)w & 0x3FFFFFFF;
290 cc = w >> 30;
291 }
292 d[17] = (uint32_t)cc;
293 }
294
295 /*
296 * Base field modulus for P-256.
297 */
298 static const uint32_t F256[] = {
299
300 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
301 0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
302 };
303
304 /*
305 * The 'b' curve equation coefficient for P-256.
306 */
307 static const uint32_t P256_B[] = {
308
309 0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
310 0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
311 };
312
313 /*
314 * Addition in the field. Source operands shall fit on 257 bits; output
315 * will be lower than twice the modulus.
316 */
317 static void
318 add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
319 {
320 uint32_t w, cc;
321 int i;
322
323 cc = 0;
324 for (i = 0; i < 9; i ++) {
325 w = a[i] + b[i] + cc;
326 d[i] = w & 0x3FFFFFFF;
327 cc = w >> 30;
328 }
329 w >>= 16;
330 d[8] &= 0xFFFF;
331 d[3] -= w << 6;
332 d[6] -= w << 12;
333 d[7] += w << 14;
334 cc = w;
335 for (i = 0; i < 9; i ++) {
336 w = d[i] + cc;
337 d[i] = w & 0x3FFFFFFF;
338 cc = ARSH(w, 30);
339 }
340 }
341
342 /*
343 * Subtraction in the field. Source operands shall be smaller than twice
344 * the modulus; the result will fulfil the same property.
345 */
346 static void
347 sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
348 {
349 uint32_t w, cc;
350 int i;
351
352 /*
353 * We really compute a - b + 2*p to make sure that the result is
354 * positive.
355 */
356 w = a[0] - b[0] - 0x00002;
357 d[0] = w & 0x3FFFFFFF;
358 w = a[1] - b[1] + ARSH(w, 30);
359 d[1] = w & 0x3FFFFFFF;
360 w = a[2] - b[2] + ARSH(w, 30);
361 d[2] = w & 0x3FFFFFFF;
362 w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
363 d[3] = w & 0x3FFFFFFF;
364 w = a[4] - b[4] + ARSH(w, 30);
365 d[4] = w & 0x3FFFFFFF;
366 w = a[5] - b[5] + ARSH(w, 30);
367 d[5] = w & 0x3FFFFFFF;
368 w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
369 d[6] = w & 0x3FFFFFFF;
370 w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
371 d[7] = w & 0x3FFFFFFF;
372 w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
373 d[8] = w & 0xFFFF;
374 w >>= 16;
375 d[8] &= 0xFFFF;
376 d[3] -= w << 6;
377 d[6] -= w << 12;
378 d[7] += w << 14;
379 cc = w;
380 for (i = 0; i < 9; i ++) {
381 w = d[i] + cc;
382 d[i] = w & 0x3FFFFFFF;
383 cc = ARSH(w, 30);
384 }
385 }
386
387 /*
388 * Compute a multiplication in F256. Source operands shall be less than
389 * twice the modulus.
390 */
391 static void
392 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
393 {
394 uint32_t t[18];
395 uint64_t s[18];
396 uint64_t cc, x;
397 uint32_t z;
398 int i;
399
400 mul9(t, a, b);
401
402 /*
403 * Modular reduction: each high word in added/subtracted where
404 * necessary.
405 *
406 * The modulus is:
407 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
408 * Therefore:
409 * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
410 *
411 * For a word x at bit offset n (n >= 256), we have:
412 * x*2^n = x*2^(n-32) - x*2^(n-64)
413 * - x*2^(n - 160) + x*2^(n-256) mod p
414 *
415 * Thus, we can nullify the high word if we reinject it at some
416 * proper emplacements.
417 *
418 * We use 64-bit intermediate words to allow for carries to
419 * accumulate easily, before performing the final propagation.
420 */
421 for (i = 0; i < 18; i ++) {
422 s[i] = t[i];
423 }
424
425 for (i = 17; i >= 9; i --) {
426 uint64_t x;
427
428 x = s[i];
429 s[i - 1] += ARSHW(x, 2);
430 s[i - 2] += (x << 28) & 0x3FFFFFFF;
431 s[i - 2] -= ARSHW(x, 4);
432 s[i - 3] -= (x << 26) & 0x3FFFFFFF;
433 s[i - 5] -= ARSHW(x, 10);
434 s[i - 6] -= (x << 20) & 0x3FFFFFFF;
435 s[i - 8] += ARSHW(x, 16);
436 s[i - 9] += (x << 14) & 0x3FFFFFFF;
437 }
438
439 /*
440 * Carry propagation must be signed. Moreover, we may have overdone
441 * it a bit, and obtain a negative result.
442 *
443 * The loop above ran 9 times; each time, each word was augmented
444 * by at most one extra word (in absolute value). Thus, the top
445 * word must in fine fit in 39 bits, so the carry below will fit
446 * on 9 bits.
447 */
448 cc = 0;
449 for (i = 0; i < 9; i ++) {
450 x = s[i] + cc;
451 d[i] = (uint32_t)x & 0x3FFFFFFF;
452 cc = ARSHW(x, 30);
453 }
454
455 /*
456 * All nine words fit on 30 bits, but there may be an extra
457 * carry for a few bits (at most 9), and that carry may be
458 * negative. Moreover, we want the result to fit on 257 bits.
459 * The two lines below ensure that the word in d[] has length
460 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
461 * significant length of cc is less than 24 bits, so we will be
462 * able to switch to 32-bit operations.
463 */
464 cc = ARSHW(x, 16);
465 d[8] &= 0xFFFF;
466
467 /*
468 * Subtract cc*p.
469 */
470 z = (uint32_t)cc;
471 d[3] -= z << 6;
472 d[6] -= (z << 12) & 0x3FFFFFFF;
473 d[7] -= ARSH(z, 18);
474 d[7] += (z << 14) & 0x3FFFFFFF;
475 d[8] += ARSH(z, 16);
476 for (i = 0; i < 9; i ++) {
477 uint32_t w;
478
479 w = d[i] + z;
480 d[i] = w & 0x3FFFFFFF;
481 z = ARSH(w, 30);
482 }
483 }
484
485 /*
486 * Compute a square in F256. Source operand shall be less than
487 * twice the modulus.
488 */
489 static void
490 square_f256(uint32_t *d, const uint32_t *a)
491 {
492 uint32_t t[18];
493 uint64_t s[18];
494 uint64_t cc, x;
495 uint32_t z;
496 int i;
497
498 square9(t, a);
499
500 /*
501 * Modular reduction: each high word in added/subtracted where
502 * necessary.
503 *
504 * The modulus is:
505 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
506 * Therefore:
507 * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
508 *
509 * For a word x at bit offset n (n >= 256), we have:
510 * x*2^n = x*2^(n-32) - x*2^(n-64)
511 * - x*2^(n - 160) + x*2^(n-256) mod p
512 *
513 * Thus, we can nullify the high word if we reinject it at some
514 * proper emplacements.
515 *
516 * We use 64-bit intermediate words to allow for carries to
517 * accumulate easily, before performing the final propagation.
518 */
519 for (i = 0; i < 18; i ++) {
520 s[i] = t[i];
521 }
522
523 for (i = 17; i >= 9; i --) {
524 uint64_t x;
525
526 x = s[i];
527 s[i - 1] += ARSHW(x, 2);
528 s[i - 2] += (x << 28) & 0x3FFFFFFF;
529 s[i - 2] -= ARSHW(x, 4);
530 s[i - 3] -= (x << 26) & 0x3FFFFFFF;
531 s[i - 5] -= ARSHW(x, 10);
532 s[i - 6] -= (x << 20) & 0x3FFFFFFF;
533 s[i - 8] += ARSHW(x, 16);
534 s[i - 9] += (x << 14) & 0x3FFFFFFF;
535 }
536
537 /*
538 * Carry propagation must be signed. Moreover, we may have overdone
539 * it a bit, and obtain a negative result.
540 *
541 * The loop above ran 9 times; each time, each word was augmented
542 * by at most one extra word (in absolute value). Thus, the top
543 * word must in fine fit in 39 bits, so the carry below will fit
544 * on 9 bits.
545 */
546 cc = 0;
547 for (i = 0; i < 9; i ++) {
548 x = s[i] + cc;
549 d[i] = (uint32_t)x & 0x3FFFFFFF;
550 cc = ARSHW(x, 30);
551 }
552
553 /*
554 * All nine words fit on 30 bits, but there may be an extra
555 * carry for a few bits (at most 9), and that carry may be
556 * negative. Moreover, we want the result to fit on 257 bits.
557 * The two lines below ensure that the word in d[] has length
558 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
559 * significant length of cc is less than 24 bits, so we will be
560 * able to switch to 32-bit operations.
561 */
562 cc = ARSHW(x, 16);
563 d[8] &= 0xFFFF;
564
565 /*
566 * Subtract cc*p.
567 */
568 z = (uint32_t)cc;
569 d[3] -= z << 6;
570 d[6] -= (z << 12) & 0x3FFFFFFF;
571 d[7] -= ARSH(z, 18);
572 d[7] += (z << 14) & 0x3FFFFFFF;
573 d[8] += ARSH(z, 16);
574 for (i = 0; i < 9; i ++) {
575 uint32_t w;
576
577 w = d[i] + z;
578 d[i] = w & 0x3FFFFFFF;
579 z = ARSH(w, 30);
580 }
581 }
582
583 /*
584 * Perform a "final reduction" in field F256 (field for curve P-256).
585 * The source value must be less than twice the modulus. If the value
586 * is not lower than the modulus, then the modulus is subtracted and
587 * this function returns 1; otherwise, it leaves it untouched and it
588 * returns 0.
589 */
590 static uint32_t
591 reduce_final_f256(uint32_t *d)
592 {
593 uint32_t t[9];
594 uint32_t cc;
595 int i;
596
597 cc = 0;
598 for (i = 0; i < 9; i ++) {
599 uint32_t w;
600
601 w = d[i] - F256[i] - cc;
602 cc = w >> 31;
603 t[i] = w & 0x3FFFFFFF;
604 }
605 cc ^= 1;
606 CCOPY(cc, d, t, sizeof t);
607 return cc;
608 }
609
610 /*
611 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
612 * are such that:
613 * X = x / z^2
614 * Y = y / z^3
615 * For the point at infinity, z = 0.
616 * Each point thus admits many possible representations.
617 *
618 * Coordinates are represented in arrays of 32-bit integers, each holding
619 * 30 bits of data. Values may also be slightly greater than the modulus,
620 * but they will always be lower than twice the modulus.
621 */
622 typedef struct {
623 uint32_t x[9];
624 uint32_t y[9];
625 uint32_t z[9];
626 } p256_jacobian;
627
628 /*
629 * Convert a point to affine coordinates:
630 * - If the point is the point at infinity, then all three coordinates
631 * are set to 0.
632 * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
633 * coordinates are the 'X' and 'Y' affine coordinates.
634 * The coordinates are guaranteed to be lower than the modulus.
635 */
636 static void
637 p256_to_affine(p256_jacobian *P)
638 {
639 uint32_t t1[9], t2[9];
640 int i;
641
642 /*
643 * Invert z with a modular exponentiation: the modulus is
644 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
645 * p-2. Exponent bit pattern (from high to low) is:
646 * - 32 bits of value 1
647 * - 31 bits of value 0
648 * - 1 bit of value 1
649 * - 96 bits of value 0
650 * - 94 bits of value 1
651 * - 1 bit of value 0
652 * - 1 bit of value 1
653 * Thus, we precompute z^(2^31-1) to speed things up.
654 *
655 * If z = 0 (point at infinity) then the modular exponentiation
656 * will yield 0, which leads to the expected result (all three
657 * coordinates set to 0).
658 */
659
660 /*
661 * A simple square-and-multiply for z^(2^31-1). We could save about
662 * two dozen multiplications here with an addition chain, but
663 * this would require a bit more code, and extra stack buffers.
664 */
665 memcpy(t1, P->z, sizeof P->z);
666 for (i = 0; i < 30; i ++) {
667 square_f256(t1, t1);
668 mul_f256(t1, t1, P->z);
669 }
670
671 /*
672 * Square-and-multiply. Apart from the squarings, we have a few
673 * multiplications to set bits to 1; we multiply by the original z
674 * for setting 1 bit, and by t1 for setting 31 bits.
675 */
676 memcpy(t2, P->z, sizeof P->z);
677 for (i = 1; i < 256; i ++) {
678 square_f256(t2, t2);
679 switch (i) {
680 case 31:
681 case 190:
682 case 221:
683 case 252:
684 mul_f256(t2, t2, t1);
685 break;
686 case 63:
687 case 253:
688 case 255:
689 mul_f256(t2, t2, P->z);
690 break;
691 }
692 }
693
694 /*
695 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
696 */
697 mul_f256(t1, t2, t2);
698 mul_f256(P->x, t1, P->x);
699 mul_f256(t1, t1, t2);
700 mul_f256(P->y, t1, P->y);
701 reduce_final_f256(P->x);
702 reduce_final_f256(P->y);
703
704 /*
705 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
706 * this will set z to 1.
707 */
708 mul_f256(P->z, P->z, t2);
709 reduce_final_f256(P->z);
710 }
711
712 /*
713 * Double a point in P-256. This function works for all valid points,
714 * including the point at infinity.
715 */
716 static void
717 p256_double(p256_jacobian *Q)
718 {
719 /*
720 * Doubling formulas are:
721 *
722 * s = 4*x*y^2
723 * m = 3*(x + z^2)*(x - z^2)
724 * x' = m^2 - 2*s
725 * y' = m*(s - x') - 8*y^4
726 * z' = 2*y*z
727 *
728 * These formulas work for all points, including points of order 2
729 * and points at infinity:
730 * - If y = 0 then z' = 0. But there is no such point in P-256
731 * anyway.
732 * - If z = 0 then z' = 0.
733 */
734 uint32_t t1[9], t2[9], t3[9], t4[9];
735
736 /*
737 * Compute z^2 in t1.
738 */
739 square_f256(t1, Q->z);
740
741 /*
742 * Compute x-z^2 in t2 and x+z^2 in t1.
743 */
744 add_f256(t2, Q->x, t1);
745 sub_f256(t1, Q->x, t1);
746
747 /*
748 * Compute 3*(x+z^2)*(x-z^2) in t1.
749 */
750 mul_f256(t3, t1, t2);
751 add_f256(t1, t3, t3);
752 add_f256(t1, t3, t1);
753
754 /*
755 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
756 */
757 square_f256(t3, Q->y);
758 add_f256(t3, t3, t3);
759 mul_f256(t2, Q->x, t3);
760 add_f256(t2, t2, t2);
761
762 /*
763 * Compute x' = m^2 - 2*s.
764 */
765 square_f256(Q->x, t1);
766 sub_f256(Q->x, Q->x, t2);
767 sub_f256(Q->x, Q->x, t2);
768
769 /*
770 * Compute z' = 2*y*z.
771 */
772 mul_f256(t4, Q->y, Q->z);
773 add_f256(Q->z, t4, t4);
774
775 /*
776 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
777 * 2*y^2 in t3.
778 */
779 sub_f256(t2, t2, Q->x);
780 mul_f256(Q->y, t1, t2);
781 square_f256(t4, t3);
782 add_f256(t4, t4, t4);
783 sub_f256(Q->y, Q->y, t4);
784 }
785
786 /*
787 * Add point P2 to point P1.
788 *
789 * This function computes the wrong result in the following cases:
790 *
791 * - If P1 == 0 but P2 != 0
792 * - If P1 != 0 but P2 == 0
793 * - If P1 == P2
794 *
795 * In all three cases, P1 is set to the point at infinity.
796 *
797 * Returned value is 0 if one of the following occurs:
798 *
799 * - P1 and P2 have the same Y coordinate
800 * - P1 == 0 and P2 == 0
801 * - The Y coordinate of one of the points is 0 and the other point is
802 * the point at infinity.
803 *
804 * The third case cannot actually happen with valid points, since a point
805 * with Y == 0 is a point of order 2, and there is no point of order 2 on
806 * curve P-256.
807 *
808 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
809 * can apply the following:
810 *
811 * - If the result is not the point at infinity, then it is correct.
812 * - Otherwise, if the returned value is 1, then this is a case of
813 * P1+P2 == 0, so the result is indeed the point at infinity.
814 * - Otherwise, P1 == P2, so a "double" operation should have been
815 * performed.
816 */
817 static uint32_t
818 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
819 {
820 /*
821 * Addtions formulas are:
822 *
823 * u1 = x1 * z2^2
824 * u2 = x2 * z1^2
825 * s1 = y1 * z2^3
826 * s2 = y2 * z1^3
827 * h = u2 - u1
828 * r = s2 - s1
829 * x3 = r^2 - h^3 - 2 * u1 * h^2
830 * y3 = r * (u1 * h^2 - x3) - s1 * h^3
831 * z3 = h * z1 * z2
832 */
833 uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
834 uint32_t ret;
835 int i;
836
837 /*
838 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
839 */
840 square_f256(t3, P2->z);
841 mul_f256(t1, P1->x, t3);
842 mul_f256(t4, P2->z, t3);
843 mul_f256(t3, P1->y, t4);
844
845 /*
846 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
847 */
848 square_f256(t4, P1->z);
849 mul_f256(t2, P2->x, t4);
850 mul_f256(t5, P1->z, t4);
851 mul_f256(t4, P2->y, t5);
852
853 /*
854 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
855 * We need to test whether r is zero, so we will do some extra
856 * reduce.
857 */
858 sub_f256(t2, t2, t1);
859 sub_f256(t4, t4, t3);
860 reduce_final_f256(t4);
861 ret = 0;
862 for (i = 0; i < 9; i ++) {
863 ret |= t4[i];
864 }
865 ret = (ret | -ret) >> 31;
866
867 /*
868 * Compute u1*h^2 (in t6) and h^3 (in t5);
869 */
870 square_f256(t7, t2);
871 mul_f256(t6, t1, t7);
872 mul_f256(t5, t7, t2);
873
874 /*
875 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
876 */
877 square_f256(P1->x, t4);
878 sub_f256(P1->x, P1->x, t5);
879 sub_f256(P1->x, P1->x, t6);
880 sub_f256(P1->x, P1->x, t6);
881
882 /*
883 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
884 */
885 sub_f256(t6, t6, P1->x);
886 mul_f256(P1->y, t4, t6);
887 mul_f256(t1, t5, t3);
888 sub_f256(P1->y, P1->y, t1);
889
890 /*
891 * Compute z3 = h*z1*z2.
892 */
893 mul_f256(t1, P1->z, P2->z);
894 mul_f256(P1->z, t1, t2);
895
896 return ret;
897 }
898
899 /*
900 * Add point P2 to point P1. This is a specialised function for the
901 * case when P2 is a non-zero point in affine coordinate.
902 *
903 * This function computes the wrong result in the following cases:
904 *
905 * - If P1 == 0
906 * - If P1 == P2
907 *
908 * In both cases, P1 is set to the point at infinity.
909 *
910 * Returned value is 0 if one of the following occurs:
911 *
912 * - P1 and P2 have the same Y coordinate
913 * - The Y coordinate of P2 is 0 and P1 is the point at infinity.
914 *
915 * The second case cannot actually happen with valid points, since a point
916 * with Y == 0 is a point of order 2, and there is no point of order 2 on
917 * curve P-256.
918 *
919 * Therefore, assuming that P1 != 0 on input, then the caller
920 * can apply the following:
921 *
922 * - If the result is not the point at infinity, then it is correct.
923 * - Otherwise, if the returned value is 1, then this is a case of
924 * P1+P2 == 0, so the result is indeed the point at infinity.
925 * - Otherwise, P1 == P2, so a "double" operation should have been
926 * performed.
927 */
928 static uint32_t
929 p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
930 {
931 /*
932 * Addtions formulas are:
933 *
934 * u1 = x1
935 * u2 = x2 * z1^2
936 * s1 = y1
937 * s2 = y2 * z1^3
938 * h = u2 - u1
939 * r = s2 - s1
940 * x3 = r^2 - h^3 - 2 * u1 * h^2
941 * y3 = r * (u1 * h^2 - x3) - s1 * h^3
942 * z3 = h * z1
943 */
944 uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
945 uint32_t ret;
946 int i;
947
948 /*
949 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
950 */
951 memcpy(t1, P1->x, sizeof t1);
952 memcpy(t3, P1->y, sizeof t3);
953
954 /*
955 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
956 */
957 square_f256(t4, P1->z);
958 mul_f256(t2, P2->x, t4);
959 mul_f256(t5, P1->z, t4);
960 mul_f256(t4, P2->y, t5);
961
962 /*
963 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
964 * We need to test whether r is zero, so we will do some extra
965 * reduce.
966 */
967 sub_f256(t2, t2, t1);
968 sub_f256(t4, t4, t3);
969 reduce_final_f256(t4);
970 ret = 0;
971 for (i = 0; i < 9; i ++) {
972 ret |= t4[i];
973 }
974 ret = (ret | -ret) >> 31;
975
976 /*
977 * Compute u1*h^2 (in t6) and h^3 (in t5);
978 */
979 square_f256(t7, t2);
980 mul_f256(t6, t1, t7);
981 mul_f256(t5, t7, t2);
982
983 /*
984 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
985 */
986 square_f256(P1->x, t4);
987 sub_f256(P1->x, P1->x, t5);
988 sub_f256(P1->x, P1->x, t6);
989 sub_f256(P1->x, P1->x, t6);
990
991 /*
992 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
993 */
994 sub_f256(t6, t6, P1->x);
995 mul_f256(P1->y, t4, t6);
996 mul_f256(t1, t5, t3);
997 sub_f256(P1->y, P1->y, t1);
998
999 /*
1000 * Compute z3 = h*z1*z2.
1001 */
1002 mul_f256(P1->z, P1->z, t2);
1003
1004 return ret;
1005 }
1006
1007 /*
1008 * Decode a P-256 point. This function does not support the point at
1009 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1010 */
1011 static uint32_t
1012 p256_decode(p256_jacobian *P, const void *src, size_t len)
1013 {
1014 const unsigned char *buf;
1015 uint32_t tx[9], ty[9], t1[9], t2[9];
1016 uint32_t bad;
1017 int i;
1018
1019 if (len != 65) {
1020 return 0;
1021 }
1022 buf = src;
1023
1024 /*
1025 * First byte must be 0x04 (uncompressed format). We could support
1026 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1027 * least significant bit of the Y coordinate), but it is explicitly
1028 * forbidden by RFC 5480 (section 2.2).
1029 */
1030 bad = NEQ(buf[0], 0x04);
1031
1032 /*
1033 * Decode the coordinates, and check that they are both lower
1034 * than the modulus.
1035 */
1036 tx[8] = be8_to_le30(tx, buf + 1, 32);
1037 ty[8] = be8_to_le30(ty, buf + 33, 32);
1038 bad |= reduce_final_f256(tx);
1039 bad |= reduce_final_f256(ty);
1040
1041 /*
1042 * Check curve equation.
1043 */
1044 square_f256(t1, tx);
1045 mul_f256(t1, tx, t1);
1046 square_f256(t2, ty);
1047 sub_f256(t1, t1, tx);
1048 sub_f256(t1, t1, tx);
1049 sub_f256(t1, t1, tx);
1050 add_f256(t1, t1, P256_B);
1051 sub_f256(t1, t1, t2);
1052 reduce_final_f256(t1);
1053 for (i = 0; i < 9; i ++) {
1054 bad |= t1[i];
1055 }
1056
1057 /*
1058 * Copy coordinates to the point structure.
1059 */
1060 memcpy(P->x, tx, sizeof tx);
1061 memcpy(P->y, ty, sizeof ty);
1062 memset(P->z, 0, sizeof P->z);
1063 P->z[0] = 1;
1064 return NEQ(bad, 0) ^ 1;
1065 }
1066
1067 /*
1068 * Encode a point into a buffer. This function assumes that the point is
1069 * valid, in affine coordinates, and not the point at infinity.
1070 */
1071 static void
1072 p256_encode(void *dst, const p256_jacobian *P)
1073 {
1074 unsigned char *buf;
1075
1076 buf = dst;
1077 buf[0] = 0x04;
1078 le30_to_be8(buf + 1, 32, P->x);
1079 le30_to_be8(buf + 33, 32, P->y);
1080 }
1081
1082 /*
1083 * Multiply a curve point by an integer. The integer is assumed to be
1084 * lower than the curve order, and the base point must not be the point
1085 * at infinity.
1086 */
1087 static void
1088 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1089 {
1090 /*
1091 * qz is a flag that is initially 1, and remains equal to 1
1092 * as long as the point is the point at infinity.
1093 *
1094 * We use a 2-bit window to handle multiplier bits by pairs.
1095 * The precomputed window really is the points P2 and P3.
1096 */
1097 uint32_t qz;
1098 p256_jacobian P2, P3, Q, T, U;
1099
1100 /*
1101 * Compute window values.
1102 */
1103 P2 = *P;
1104 p256_double(&P2);
1105 P3 = *P;
1106 p256_add(&P3, &P2);
1107
1108 /*
1109 * We start with Q = 0. We process multiplier bits 2 by 2.
1110 */
1111 memset(&Q, 0, sizeof Q);
1112 qz = 1;
1113 while (xlen -- > 0) {
1114 int k;
1115
1116 for (k = 6; k >= 0; k -= 2) {
1117 uint32_t bits;
1118 uint32_t bnz;
1119
1120 p256_double(&Q);
1121 p256_double(&Q);
1122 T = *P;
1123 U = Q;
1124 bits = (*x >> k) & (uint32_t)3;
1125 bnz = NEQ(bits, 0);
1126 CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1127 CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1128 p256_add(&U, &T);
1129 CCOPY(bnz & qz, &Q, &T, sizeof Q);
1130 CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1131 qz &= ~bnz;
1132 }
1133 x ++;
1134 }
1135 *P = Q;
1136 }
1137
1138 /*
1139 * Precomputed window: k*G points, where G is the curve generator, and k
1140 * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1141 * the point are encoded as 9 words of 30 bits each (little-endian
1142 * order).
1143 */
1144 static const uint32_t Gwin[15][18] = {
1145
1146 { 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
1147 0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
1148 0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
1149 0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
1150 0x10B8BF86, 0x00004FE3 },
1151
1152 { 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
1153 0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
1154 0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
1155 0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
1156 0x154436E3, 0x00000777 },
1157
1158 { 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
1159 0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
1160 0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
1161 0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
1162 0x19031266, 0x00008734 },
1163
1164 { 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
1165 0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
1166 0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
1167 0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
1168 0x15D69318, 0x0000E0F1 },
1169
1170 { 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
1171 0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
1172 0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
1173 0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
1174 0x1F6A2412, 0x0000E0C1 },
1175
1176 { 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
1177 0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
1178 0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
1179 0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
1180 0x041D0C8D, 0x0000E85C },
1181
1182 { 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
1183 0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
1184 0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
1185 0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
1186 0x076F780C, 0x000073EB },
1187
1188 { 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
1189 0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
1190 0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
1191 0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
1192 0x332F647A, 0x0000AD5A },
1193
1194 { 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
1195 0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
1196 0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
1197 0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
1198 0x11325CB2, 0x00002A27 },
1199
1200 { 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
1201 0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
1202 0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
1203 0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
1204 0x18A88A6A, 0x00008786 },
1205
1206 { 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
1207 0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
1208 0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
1209 0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
1210 0x0826B331, 0x00009099 },
1211
1212 { 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
1213 0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
1214 0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
1215 0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
1216 0x2D1AA70E, 0x00000770 },
1217
1218 { 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
1219 0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
1220 0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
1221 0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
1222 0x163353AF, 0x000063BB },
1223
1224 { 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
1225 0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
1226 0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
1227 0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
1228 0x3C6ECA7D, 0x0000F599 },
1229
1230 { 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
1231 0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
1232 0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
1233 0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
1234 0x0FB8D64B, 0x0000B5B9 }
1235 };
1236
1237 /*
1238 * Lookup one of the Gwin[] values, by index. This is constant-time.
1239 */
1240 static void
1241 lookup_Gwin(p256_jacobian *T, uint32_t idx)
1242 {
1243 uint32_t xy[18];
1244 uint32_t k;
1245 size_t u;
1246
1247 memset(xy, 0, sizeof xy);
1248 for (k = 0; k < 15; k ++) {
1249 uint32_t m;
1250
1251 m = -EQ(idx, k + 1);
1252 for (u = 0; u < 18; u ++) {
1253 xy[u] |= m & Gwin[k][u];
1254 }
1255 }
1256 memcpy(T->x, &xy[0], sizeof T->x);
1257 memcpy(T->y, &xy[9], sizeof T->y);
1258 memset(T->z, 0, sizeof T->z);
1259 T->z[0] = 1;
1260 }
1261
1262 /*
1263 * Multiply the generator by an integer. The integer is assumed non-zero
1264 * and lower than the curve order.
1265 */
1266 static void
1267 p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1268 {
1269 /*
1270 * qz is a flag that is initially 1, and remains equal to 1
1271 * as long as the point is the point at infinity.
1272 *
1273 * We use a 4-bit window to handle multiplier bits by groups
1274 * of 4. The precomputed window is constant static data, with
1275 * points in affine coordinates; we use a constant-time lookup.
1276 */
1277 p256_jacobian Q;
1278 uint32_t qz;
1279
1280 memset(&Q, 0, sizeof Q);
1281 qz = 1;
1282 while (xlen -- > 0) {
1283 int k;
1284 unsigned bx;
1285
1286 bx = *x ++;
1287 for (k = 0; k < 2; k ++) {
1288 uint32_t bits;
1289 uint32_t bnz;
1290 p256_jacobian T, U;
1291
1292 p256_double(&Q);
1293 p256_double(&Q);
1294 p256_double(&Q);
1295 p256_double(&Q);
1296 bits = (bx >> 4) & 0x0F;
1297 bnz = NEQ(bits, 0);
1298 lookup_Gwin(&T, bits);
1299 U = Q;
1300 p256_add_mixed(&U, &T);
1301 CCOPY(bnz & qz, &Q, &T, sizeof Q);
1302 CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1303 qz &= ~bnz;
1304 bx <<= 4;
1305 }
1306 }
1307 *P = Q;
1308 }
1309
1310 static const unsigned char P256_G[] = {
1311 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1312 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1313 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1314 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1315 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1316 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1317 0x68, 0x37, 0xBF, 0x51, 0xF5
1318 };
1319
1320 static const unsigned char P256_N[] = {
1321 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1322 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1323 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1324 0x25, 0x51
1325 };
1326
1327 static const unsigned char *
1328 api_generator(int curve, size_t *len)
1329 {
1330 (void)curve;
1331 *len = sizeof P256_G;
1332 return P256_G;
1333 }
1334
1335 static const unsigned char *
1336 api_order(int curve, size_t *len)
1337 {
1338 (void)curve;
1339 *len = sizeof P256_N;
1340 return P256_N;
1341 }
1342
1343 static size_t
1344 api_xoff(int curve, size_t *len)
1345 {
1346 (void)curve;
1347 *len = 32;
1348 return 1;
1349 }
1350
1351 static uint32_t
1352 api_mul(unsigned char *G, size_t Glen,
1353 const unsigned char *x, size_t xlen, int curve)
1354 {
1355 uint32_t r;
1356 p256_jacobian P;
1357
1358 (void)curve;
1359 r = p256_decode(&P, G, Glen);
1360 p256_mul(&P, x, xlen);
1361 if (Glen >= 65) {
1362 p256_to_affine(&P);
1363 p256_encode(G, &P);
1364 }
1365 return r;
1366 }
1367
1368 static size_t
1369 api_mulgen(unsigned char *R,
1370 const unsigned char *x, size_t xlen, int curve)
1371 {
1372 p256_jacobian P;
1373
1374 (void)curve;
1375 p256_mulgen(&P, x, xlen);
1376 p256_to_affine(&P);
1377 p256_encode(R, &P);
1378 return 65;
1379
1380 /*
1381 const unsigned char *G;
1382 size_t Glen;
1383
1384 G = api_generator(curve, &Glen);
1385 memcpy(R, G, Glen);
1386 api_mul(R, Glen, x, xlen, curve);
1387 return Glen;
1388 */
1389 }
1390
1391 static uint32_t
1392 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1393 const unsigned char *x, size_t xlen,
1394 const unsigned char *y, size_t ylen, int curve)
1395 {
1396 p256_jacobian P, Q;
1397 uint32_t r, t, z;
1398 int i;
1399
1400 (void)curve;
1401 r = p256_decode(&P, A, len);
1402 p256_mul(&P, x, xlen);
1403 if (B == NULL) {
1404 p256_mulgen(&Q, y, ylen);
1405 } else {
1406 r &= p256_decode(&Q, B, len);
1407 p256_mul(&Q, y, ylen);
1408 }
1409
1410 /*
1411 * The final addition may fail in case both points are equal.
1412 */
1413 t = p256_add(&P, &Q);
1414 reduce_final_f256(P.z);
1415 z = 0;
1416 for (i = 0; i < 9; i ++) {
1417 z |= P.z[i];
1418 }
1419 z = EQ(z, 0);
1420 p256_double(&Q);
1421
1422 /*
1423 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1424 * have the following:
1425 *
1426 * z = 0, t = 0 return P (normal addition)
1427 * z = 0, t = 1 return P (normal addition)
1428 * z = 1, t = 0 return Q (a 'double' case)
1429 * z = 1, t = 1 report an error (P+Q = 0)
1430 */
1431 CCOPY(z & ~t, &P, &Q, sizeof Q);
1432 p256_to_affine(&P);
1433 p256_encode(A, &P);
1434 r &= ~(z & t);
1435 return r;
1436 }
1437
1438 /* see bearssl_ec.h */
1439 const br_ec_impl br_ec_p256_m31 = {
1440 (uint32_t)0x00800000,
1441 &api_generator,
1442 &api_order,
1443 &api_xoff,
1444 &api_mul,
1445 &api_mulgen,
1446 &api_muladd
1447 };