Fixed handling of incoming application data after sending a close_notify (data shall...
[BearSSL] / src / ec / ec_p256_m15.c
1 /*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "inner.h"
26
27 /*
28 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29 * that right-shifting a signed negative integer copies the sign bit
30 * (arithmetic right-shift). This is "implementation-defined behaviour",
31 * i.e. it is not undefined, but it may differ between compilers. Each
32 * compiler is supposed to document its behaviour in that respect. GCC
33 * explicitly defines that an arithmetic right shift is used. We expect
34 * all other compilers to do the same, because underlying CPU offer an
35 * arithmetic right shift opcode that could not be used otherwise.
36 */
37 #if BR_NO_ARITH_SHIFT
38 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \
39 | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40 #else
41 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
42 #endif
43
44 /*
45 * Convert an integer from unsigned big-endian encoding to a sequence of
46 * 13-bit words in little-endian order. The final "partial" word is
47 * returned.
48 */
49 static uint32_t
50 be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
51 {
52 uint32_t acc;
53 int acc_len;
54
55 acc = 0;
56 acc_len = 0;
57 while (len -- > 0) {
58 acc |= (uint32_t)src[len] << acc_len;
59 acc_len += 8;
60 if (acc_len >= 13) {
61 *dst ++ = acc & 0x1FFF;
62 acc >>= 13;
63 acc_len -= 13;
64 }
65 }
66 return acc;
67 }
68
69 /*
70 * Convert an integer (13-bit words, little-endian) to unsigned
71 * big-endian encoding. The total encoding length is provided; all
72 * the destination bytes will be filled.
73 */
74 static void
75 le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
76 {
77 uint32_t acc;
78 int acc_len;
79
80 acc = 0;
81 acc_len = 0;
82 while (len -- > 0) {
83 if (acc_len < 8) {
84 acc |= (*src ++) << acc_len;
85 acc_len += 13;
86 }
87 dst[len] = (unsigned char)acc;
88 acc >>= 8;
89 acc_len -= 8;
90 }
91 }
92
93 /*
94 * Normalise an array of words to a strict 13 bits per word. Returned
95 * value is the resulting carry. The source (w) and destination (d)
96 * arrays may be identical, but shall not overlap partially.
97 */
98 static inline uint32_t
99 norm13(uint32_t *d, const uint32_t *w, size_t len)
100 {
101 size_t u;
102 uint32_t cc;
103
104 cc = 0;
105 for (u = 0; u < len; u ++) {
106 int32_t z;
107
108 z = w[u] + cc;
109 d[u] = z & 0x1FFF;
110 cc = ARSH(z, 13);
111 }
112 return cc;
113 }
114
115 /*
116 * mul20() multiplies two 260-bit integers together. Each word must fit
117 * on 13 bits; source operands use 20 words, destination operand
118 * receives 40 words. All overlaps allowed.
119 *
120 * square20() computes the square of a 260-bit integer. Each word must
121 * fit on 13 bits; source operand uses 20 words, destination operand
122 * receives 40 words. All overlaps allowed.
123 */
124
125 #if BR_SLOW_MUL15
126
127 static void
128 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
129 {
130 /*
131 * Two-level Karatsuba: turns a 20x20 multiplication into
132 * nine 5x5 multiplications. We use 13-bit words but do not
133 * propagate carries immediately, so words may expand:
134 *
135 * - First Karatsuba decomposition turns the 20x20 mul on
136 * 13-bit words into three 10x10 muls, two on 13-bit words
137 * and one on 14-bit words.
138 *
139 * - Second Karatsuba decomposition further splits these into:
140 *
141 * * four 5x5 muls on 13-bit words
142 * * four 5x5 muls on 14-bit words
143 * * one 5x5 mul on 15-bit words
144 *
145 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
146 * or 15-bit words, respectively.
147 */
148 uint32_t u[45], v[45], w[90];
149 uint32_t cc;
150 int i;
151
152 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
153 (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
154 + (s2w)[5 * (s2_off) + 0]; \
155 (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
156 + (s2w)[5 * (s2_off) + 1]; \
157 (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
158 + (s2w)[5 * (s2_off) + 2]; \
159 (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
160 + (s2w)[5 * (s2_off) + 3]; \
161 (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
162 + (s2w)[5 * (s2_off) + 4]; \
163 } while (0)
164
165 #define ZADDT(dw, d_off, sw, s_off) do { \
166 (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
167 (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
168 (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
169 (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
170 (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
171 } while (0)
172
173 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
174 (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
175 + (s2w)[5 * (s2_off) + 0]; \
176 (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
177 + (s2w)[5 * (s2_off) + 1]; \
178 (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
179 + (s2w)[5 * (s2_off) + 2]; \
180 (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
181 + (s2w)[5 * (s2_off) + 3]; \
182 (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
183 + (s2w)[5 * (s2_off) + 4]; \
184 } while (0)
185
186 #define CPR1(w, cprcc) do { \
187 uint32_t cprz = (w) + cprcc; \
188 (w) = cprz & 0x1FFF; \
189 cprcc = cprz >> 13; \
190 } while (0)
191
192 #define CPR(dw, d_off) do { \
193 uint32_t cprcc; \
194 cprcc = 0; \
195 CPR1((dw)[(d_off) + 0], cprcc); \
196 CPR1((dw)[(d_off) + 1], cprcc); \
197 CPR1((dw)[(d_off) + 2], cprcc); \
198 CPR1((dw)[(d_off) + 3], cprcc); \
199 CPR1((dw)[(d_off) + 4], cprcc); \
200 CPR1((dw)[(d_off) + 5], cprcc); \
201 CPR1((dw)[(d_off) + 6], cprcc); \
202 CPR1((dw)[(d_off) + 7], cprcc); \
203 CPR1((dw)[(d_off) + 8], cprcc); \
204 (dw)[(d_off) + 9] = cprcc; \
205 } while (0)
206
207 memcpy(u, a, 20 * sizeof *a);
208 ZADD(u, 4, a, 0, a, 1);
209 ZADD(u, 5, a, 2, a, 3);
210 ZADD(u, 6, a, 0, a, 2);
211 ZADD(u, 7, a, 1, a, 3);
212 ZADD(u, 8, u, 6, u, 7);
213
214 memcpy(v, b, 20 * sizeof *b);
215 ZADD(v, 4, b, 0, b, 1);
216 ZADD(v, 5, b, 2, b, 3);
217 ZADD(v, 6, b, 0, b, 2);
218 ZADD(v, 7, b, 1, b, 3);
219 ZADD(v, 8, v, 6, v, 7);
220
221 /*
222 * Do the eight first 8x8 muls. Source words are at most 16382
223 * each, so we can add product results together "as is" in 32-bit
224 * words.
225 */
226 for (i = 0; i < 40; i += 5) {
227 w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
228 w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
229 + MUL15(u[i + 1], v[i + 0]);
230 w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
231 + MUL15(u[i + 1], v[i + 1])
232 + MUL15(u[i + 2], v[i + 0]);
233 w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
234 + MUL15(u[i + 1], v[i + 2])
235 + MUL15(u[i + 2], v[i + 1])
236 + MUL15(u[i + 3], v[i + 0]);
237 w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
238 + MUL15(u[i + 1], v[i + 3])
239 + MUL15(u[i + 2], v[i + 2])
240 + MUL15(u[i + 3], v[i + 1])
241 + MUL15(u[i + 4], v[i + 0]);
242 w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
243 + MUL15(u[i + 2], v[i + 3])
244 + MUL15(u[i + 3], v[i + 2])
245 + MUL15(u[i + 4], v[i + 1]);
246 w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
247 + MUL15(u[i + 3], v[i + 3])
248 + MUL15(u[i + 4], v[i + 2]);
249 w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
250 + MUL15(u[i + 4], v[i + 3]);
251 w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
252 w[(i << 1) + 9] = 0;
253 }
254
255 /*
256 * For the 9th multiplication, source words are up to 32764,
257 * so we must do some carry propagation. If we add up to
258 * 4 products and the carry is no more than 524224, then the
259 * result fits in 32 bits, and the next carry will be no more
260 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
261 *
262 * We thus just skip one of the products in the middle word,
263 * then do a carry propagation (this reduces words to 13 bits
264 * each, except possibly the last, which may use up to 17 bits
265 * or so), then add the missing product.
266 */
267 w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
268 w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
269 + MUL15(u[40 + 1], v[40 + 0]);
270 w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
271 + MUL15(u[40 + 1], v[40 + 1])
272 + MUL15(u[40 + 2], v[40 + 0]);
273 w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
274 + MUL15(u[40 + 1], v[40 + 2])
275 + MUL15(u[40 + 2], v[40 + 1])
276 + MUL15(u[40 + 3], v[40 + 0]);
277 w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
278 + MUL15(u[40 + 1], v[40 + 3])
279 + MUL15(u[40 + 2], v[40 + 2])
280 + MUL15(u[40 + 3], v[40 + 1]);
281 /* + MUL15(u[40 + 4], v[40 + 0]) */
282 w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
283 + MUL15(u[40 + 2], v[40 + 3])
284 + MUL15(u[40 + 3], v[40 + 2])
285 + MUL15(u[40 + 4], v[40 + 1]);
286 w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
287 + MUL15(u[40 + 3], v[40 + 3])
288 + MUL15(u[40 + 4], v[40 + 2]);
289 w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
290 + MUL15(u[40 + 4], v[40 + 3]);
291 w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
292
293 CPR(w, 80);
294
295 w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
296
297 /*
298 * The products on 14-bit words in slots 6 and 7 yield values
299 * up to 5*(16382^2) each, and we need to subtract two such
300 * values from the higher word. We need the subtraction to fit
301 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
302 * However, 10*(16382^2) does not fit. So we must perform a
303 * bit of reduction here.
304 */
305 CPR(w, 60);
306 CPR(w, 70);
307
308 /*
309 * Recompose results.
310 */
311
312 /* 0..1*0..1 into 0..3 */
313 ZSUB2F(w, 8, w, 0, w, 2);
314 ZSUB2F(w, 9, w, 1, w, 3);
315 ZADDT(w, 1, w, 8);
316 ZADDT(w, 2, w, 9);
317
318 /* 2..3*2..3 into 4..7 */
319 ZSUB2F(w, 10, w, 4, w, 6);
320 ZSUB2F(w, 11, w, 5, w, 7);
321 ZADDT(w, 5, w, 10);
322 ZADDT(w, 6, w, 11);
323
324 /* (0..1+2..3)*(0..1+2..3) into 12..15 */
325 ZSUB2F(w, 16, w, 12, w, 14);
326 ZSUB2F(w, 17, w, 13, w, 15);
327 ZADDT(w, 13, w, 16);
328 ZADDT(w, 14, w, 17);
329
330 /* first-level recomposition */
331 ZSUB2F(w, 12, w, 0, w, 4);
332 ZSUB2F(w, 13, w, 1, w, 5);
333 ZSUB2F(w, 14, w, 2, w, 6);
334 ZSUB2F(w, 15, w, 3, w, 7);
335 ZADDT(w, 2, w, 12);
336 ZADDT(w, 3, w, 13);
337 ZADDT(w, 4, w, 14);
338 ZADDT(w, 5, w, 15);
339
340 /*
341 * Perform carry propagation to bring all words down to 13 bits.
342 */
343 cc = norm13(d, w, 40);
344 d[39] += (cc << 13);
345
346 #undef ZADD
347 #undef ZADDT
348 #undef ZSUB2F
349 #undef CPR1
350 #undef CPR
351 }
352
353 static inline void
354 square20(uint32_t *d, const uint32_t *a)
355 {
356 mul20(d, a, a);
357 }
358
359 #else
360
361 static void
362 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
363 {
364 uint32_t t[39];
365
366 t[ 0] = MUL15(a[ 0], b[ 0]);
367 t[ 1] = MUL15(a[ 0], b[ 1])
368 + MUL15(a[ 1], b[ 0]);
369 t[ 2] = MUL15(a[ 0], b[ 2])
370 + MUL15(a[ 1], b[ 1])
371 + MUL15(a[ 2], b[ 0]);
372 t[ 3] = MUL15(a[ 0], b[ 3])
373 + MUL15(a[ 1], b[ 2])
374 + MUL15(a[ 2], b[ 1])
375 + MUL15(a[ 3], b[ 0]);
376 t[ 4] = MUL15(a[ 0], b[ 4])
377 + MUL15(a[ 1], b[ 3])
378 + MUL15(a[ 2], b[ 2])
379 + MUL15(a[ 3], b[ 1])
380 + MUL15(a[ 4], b[ 0]);
381 t[ 5] = MUL15(a[ 0], b[ 5])
382 + MUL15(a[ 1], b[ 4])
383 + MUL15(a[ 2], b[ 3])
384 + MUL15(a[ 3], b[ 2])
385 + MUL15(a[ 4], b[ 1])
386 + MUL15(a[ 5], b[ 0]);
387 t[ 6] = MUL15(a[ 0], b[ 6])
388 + MUL15(a[ 1], b[ 5])
389 + MUL15(a[ 2], b[ 4])
390 + MUL15(a[ 3], b[ 3])
391 + MUL15(a[ 4], b[ 2])
392 + MUL15(a[ 5], b[ 1])
393 + MUL15(a[ 6], b[ 0]);
394 t[ 7] = MUL15(a[ 0], b[ 7])
395 + MUL15(a[ 1], b[ 6])
396 + MUL15(a[ 2], b[ 5])
397 + MUL15(a[ 3], b[ 4])
398 + MUL15(a[ 4], b[ 3])
399 + MUL15(a[ 5], b[ 2])
400 + MUL15(a[ 6], b[ 1])
401 + MUL15(a[ 7], b[ 0]);
402 t[ 8] = MUL15(a[ 0], b[ 8])
403 + MUL15(a[ 1], b[ 7])
404 + MUL15(a[ 2], b[ 6])
405 + MUL15(a[ 3], b[ 5])
406 + MUL15(a[ 4], b[ 4])
407 + MUL15(a[ 5], b[ 3])
408 + MUL15(a[ 6], b[ 2])
409 + MUL15(a[ 7], b[ 1])
410 + MUL15(a[ 8], b[ 0]);
411 t[ 9] = MUL15(a[ 0], b[ 9])
412 + MUL15(a[ 1], b[ 8])
413 + MUL15(a[ 2], b[ 7])
414 + MUL15(a[ 3], b[ 6])
415 + MUL15(a[ 4], b[ 5])
416 + MUL15(a[ 5], b[ 4])
417 + MUL15(a[ 6], b[ 3])
418 + MUL15(a[ 7], b[ 2])
419 + MUL15(a[ 8], b[ 1])
420 + MUL15(a[ 9], b[ 0]);
421 t[10] = MUL15(a[ 0], b[10])
422 + MUL15(a[ 1], b[ 9])
423 + MUL15(a[ 2], b[ 8])
424 + MUL15(a[ 3], b[ 7])
425 + MUL15(a[ 4], b[ 6])
426 + MUL15(a[ 5], b[ 5])
427 + MUL15(a[ 6], b[ 4])
428 + MUL15(a[ 7], b[ 3])
429 + MUL15(a[ 8], b[ 2])
430 + MUL15(a[ 9], b[ 1])
431 + MUL15(a[10], b[ 0]);
432 t[11] = MUL15(a[ 0], b[11])
433 + MUL15(a[ 1], b[10])
434 + MUL15(a[ 2], b[ 9])
435 + MUL15(a[ 3], b[ 8])
436 + MUL15(a[ 4], b[ 7])
437 + MUL15(a[ 5], b[ 6])
438 + MUL15(a[ 6], b[ 5])
439 + MUL15(a[ 7], b[ 4])
440 + MUL15(a[ 8], b[ 3])
441 + MUL15(a[ 9], b[ 2])
442 + MUL15(a[10], b[ 1])
443 + MUL15(a[11], b[ 0]);
444 t[12] = MUL15(a[ 0], b[12])
445 + MUL15(a[ 1], b[11])
446 + MUL15(a[ 2], b[10])
447 + MUL15(a[ 3], b[ 9])
448 + MUL15(a[ 4], b[ 8])
449 + MUL15(a[ 5], b[ 7])
450 + MUL15(a[ 6], b[ 6])
451 + MUL15(a[ 7], b[ 5])
452 + MUL15(a[ 8], b[ 4])
453 + MUL15(a[ 9], b[ 3])
454 + MUL15(a[10], b[ 2])
455 + MUL15(a[11], b[ 1])
456 + MUL15(a[12], b[ 0]);
457 t[13] = MUL15(a[ 0], b[13])
458 + MUL15(a[ 1], b[12])
459 + MUL15(a[ 2], b[11])
460 + MUL15(a[ 3], b[10])
461 + MUL15(a[ 4], b[ 9])
462 + MUL15(a[ 5], b[ 8])
463 + MUL15(a[ 6], b[ 7])
464 + MUL15(a[ 7], b[ 6])
465 + MUL15(a[ 8], b[ 5])
466 + MUL15(a[ 9], b[ 4])
467 + MUL15(a[10], b[ 3])
468 + MUL15(a[11], b[ 2])
469 + MUL15(a[12], b[ 1])
470 + MUL15(a[13], b[ 0]);
471 t[14] = MUL15(a[ 0], b[14])
472 + MUL15(a[ 1], b[13])
473 + MUL15(a[ 2], b[12])
474 + MUL15(a[ 3], b[11])
475 + MUL15(a[ 4], b[10])
476 + MUL15(a[ 5], b[ 9])
477 + MUL15(a[ 6], b[ 8])
478 + MUL15(a[ 7], b[ 7])
479 + MUL15(a[ 8], b[ 6])
480 + MUL15(a[ 9], b[ 5])
481 + MUL15(a[10], b[ 4])
482 + MUL15(a[11], b[ 3])
483 + MUL15(a[12], b[ 2])
484 + MUL15(a[13], b[ 1])
485 + MUL15(a[14], b[ 0]);
486 t[15] = MUL15(a[ 0], b[15])
487 + MUL15(a[ 1], b[14])
488 + MUL15(a[ 2], b[13])
489 + MUL15(a[ 3], b[12])
490 + MUL15(a[ 4], b[11])
491 + MUL15(a[ 5], b[10])
492 + MUL15(a[ 6], b[ 9])
493 + MUL15(a[ 7], b[ 8])
494 + MUL15(a[ 8], b[ 7])
495 + MUL15(a[ 9], b[ 6])
496 + MUL15(a[10], b[ 5])
497 + MUL15(a[11], b[ 4])
498 + MUL15(a[12], b[ 3])
499 + MUL15(a[13], b[ 2])
500 + MUL15(a[14], b[ 1])
501 + MUL15(a[15], b[ 0]);
502 t[16] = MUL15(a[ 0], b[16])
503 + MUL15(a[ 1], b[15])
504 + MUL15(a[ 2], b[14])
505 + MUL15(a[ 3], b[13])
506 + MUL15(a[ 4], b[12])
507 + MUL15(a[ 5], b[11])
508 + MUL15(a[ 6], b[10])
509 + MUL15(a[ 7], b[ 9])
510 + MUL15(a[ 8], b[ 8])
511 + MUL15(a[ 9], b[ 7])
512 + MUL15(a[10], b[ 6])
513 + MUL15(a[11], b[ 5])
514 + MUL15(a[12], b[ 4])
515 + MUL15(a[13], b[ 3])
516 + MUL15(a[14], b[ 2])
517 + MUL15(a[15], b[ 1])
518 + MUL15(a[16], b[ 0]);
519 t[17] = MUL15(a[ 0], b[17])
520 + MUL15(a[ 1], b[16])
521 + MUL15(a[ 2], b[15])
522 + MUL15(a[ 3], b[14])
523 + MUL15(a[ 4], b[13])
524 + MUL15(a[ 5], b[12])
525 + MUL15(a[ 6], b[11])
526 + MUL15(a[ 7], b[10])
527 + MUL15(a[ 8], b[ 9])
528 + MUL15(a[ 9], b[ 8])
529 + MUL15(a[10], b[ 7])
530 + MUL15(a[11], b[ 6])
531 + MUL15(a[12], b[ 5])
532 + MUL15(a[13], b[ 4])
533 + MUL15(a[14], b[ 3])
534 + MUL15(a[15], b[ 2])
535 + MUL15(a[16], b[ 1])
536 + MUL15(a[17], b[ 0]);
537 t[18] = MUL15(a[ 0], b[18])
538 + MUL15(a[ 1], b[17])
539 + MUL15(a[ 2], b[16])
540 + MUL15(a[ 3], b[15])
541 + MUL15(a[ 4], b[14])
542 + MUL15(a[ 5], b[13])
543 + MUL15(a[ 6], b[12])
544 + MUL15(a[ 7], b[11])
545 + MUL15(a[ 8], b[10])
546 + MUL15(a[ 9], b[ 9])
547 + MUL15(a[10], b[ 8])
548 + MUL15(a[11], b[ 7])
549 + MUL15(a[12], b[ 6])
550 + MUL15(a[13], b[ 5])
551 + MUL15(a[14], b[ 4])
552 + MUL15(a[15], b[ 3])
553 + MUL15(a[16], b[ 2])
554 + MUL15(a[17], b[ 1])
555 + MUL15(a[18], b[ 0]);
556 t[19] = MUL15(a[ 0], b[19])
557 + MUL15(a[ 1], b[18])
558 + MUL15(a[ 2], b[17])
559 + MUL15(a[ 3], b[16])
560 + MUL15(a[ 4], b[15])
561 + MUL15(a[ 5], b[14])
562 + MUL15(a[ 6], b[13])
563 + MUL15(a[ 7], b[12])
564 + MUL15(a[ 8], b[11])
565 + MUL15(a[ 9], b[10])
566 + MUL15(a[10], b[ 9])
567 + MUL15(a[11], b[ 8])
568 + MUL15(a[12], b[ 7])
569 + MUL15(a[13], b[ 6])
570 + MUL15(a[14], b[ 5])
571 + MUL15(a[15], b[ 4])
572 + MUL15(a[16], b[ 3])
573 + MUL15(a[17], b[ 2])
574 + MUL15(a[18], b[ 1])
575 + MUL15(a[19], b[ 0]);
576 t[20] = MUL15(a[ 1], b[19])
577 + MUL15(a[ 2], b[18])
578 + MUL15(a[ 3], b[17])
579 + MUL15(a[ 4], b[16])
580 + MUL15(a[ 5], b[15])
581 + MUL15(a[ 6], b[14])
582 + MUL15(a[ 7], b[13])
583 + MUL15(a[ 8], b[12])
584 + MUL15(a[ 9], b[11])
585 + MUL15(a[10], b[10])
586 + MUL15(a[11], b[ 9])
587 + MUL15(a[12], b[ 8])
588 + MUL15(a[13], b[ 7])
589 + MUL15(a[14], b[ 6])
590 + MUL15(a[15], b[ 5])
591 + MUL15(a[16], b[ 4])
592 + MUL15(a[17], b[ 3])
593 + MUL15(a[18], b[ 2])
594 + MUL15(a[19], b[ 1]);
595 t[21] = MUL15(a[ 2], b[19])
596 + MUL15(a[ 3], b[18])
597 + MUL15(a[ 4], b[17])
598 + MUL15(a[ 5], b[16])
599 + MUL15(a[ 6], b[15])
600 + MUL15(a[ 7], b[14])
601 + MUL15(a[ 8], b[13])
602 + MUL15(a[ 9], b[12])
603 + MUL15(a[10], b[11])
604 + MUL15(a[11], b[10])
605 + MUL15(a[12], b[ 9])
606 + MUL15(a[13], b[ 8])
607 + MUL15(a[14], b[ 7])
608 + MUL15(a[15], b[ 6])
609 + MUL15(a[16], b[ 5])
610 + MUL15(a[17], b[ 4])
611 + MUL15(a[18], b[ 3])
612 + MUL15(a[19], b[ 2]);
613 t[22] = MUL15(a[ 3], b[19])
614 + MUL15(a[ 4], b[18])
615 + MUL15(a[ 5], b[17])
616 + MUL15(a[ 6], b[16])
617 + MUL15(a[ 7], b[15])
618 + MUL15(a[ 8], b[14])
619 + MUL15(a[ 9], b[13])
620 + MUL15(a[10], b[12])
621 + MUL15(a[11], b[11])
622 + MUL15(a[12], b[10])
623 + MUL15(a[13], b[ 9])
624 + MUL15(a[14], b[ 8])
625 + MUL15(a[15], b[ 7])
626 + MUL15(a[16], b[ 6])
627 + MUL15(a[17], b[ 5])
628 + MUL15(a[18], b[ 4])
629 + MUL15(a[19], b[ 3]);
630 t[23] = MUL15(a[ 4], b[19])
631 + MUL15(a[ 5], b[18])
632 + MUL15(a[ 6], b[17])
633 + MUL15(a[ 7], b[16])
634 + MUL15(a[ 8], b[15])
635 + MUL15(a[ 9], b[14])
636 + MUL15(a[10], b[13])
637 + MUL15(a[11], b[12])
638 + MUL15(a[12], b[11])
639 + MUL15(a[13], b[10])
640 + MUL15(a[14], b[ 9])
641 + MUL15(a[15], b[ 8])
642 + MUL15(a[16], b[ 7])
643 + MUL15(a[17], b[ 6])
644 + MUL15(a[18], b[ 5])
645 + MUL15(a[19], b[ 4]);
646 t[24] = MUL15(a[ 5], b[19])
647 + MUL15(a[ 6], b[18])
648 + MUL15(a[ 7], b[17])
649 + MUL15(a[ 8], b[16])
650 + MUL15(a[ 9], b[15])
651 + MUL15(a[10], b[14])
652 + MUL15(a[11], b[13])
653 + MUL15(a[12], b[12])
654 + MUL15(a[13], b[11])
655 + MUL15(a[14], b[10])
656 + MUL15(a[15], b[ 9])
657 + MUL15(a[16], b[ 8])
658 + MUL15(a[17], b[ 7])
659 + MUL15(a[18], b[ 6])
660 + MUL15(a[19], b[ 5]);
661 t[25] = MUL15(a[ 6], b[19])
662 + MUL15(a[ 7], b[18])
663 + MUL15(a[ 8], b[17])
664 + MUL15(a[ 9], b[16])
665 + MUL15(a[10], b[15])
666 + MUL15(a[11], b[14])
667 + MUL15(a[12], b[13])
668 + MUL15(a[13], b[12])
669 + MUL15(a[14], b[11])
670 + MUL15(a[15], b[10])
671 + MUL15(a[16], b[ 9])
672 + MUL15(a[17], b[ 8])
673 + MUL15(a[18], b[ 7])
674 + MUL15(a[19], b[ 6]);
675 t[26] = MUL15(a[ 7], b[19])
676 + MUL15(a[ 8], b[18])
677 + MUL15(a[ 9], b[17])
678 + MUL15(a[10], b[16])
679 + MUL15(a[11], b[15])
680 + MUL15(a[12], b[14])
681 + MUL15(a[13], b[13])
682 + MUL15(a[14], b[12])
683 + MUL15(a[15], b[11])
684 + MUL15(a[16], b[10])
685 + MUL15(a[17], b[ 9])
686 + MUL15(a[18], b[ 8])
687 + MUL15(a[19], b[ 7]);
688 t[27] = MUL15(a[ 8], b[19])
689 + MUL15(a[ 9], b[18])
690 + MUL15(a[10], b[17])
691 + MUL15(a[11], b[16])
692 + MUL15(a[12], b[15])
693 + MUL15(a[13], b[14])
694 + MUL15(a[14], b[13])
695 + MUL15(a[15], b[12])
696 + MUL15(a[16], b[11])
697 + MUL15(a[17], b[10])
698 + MUL15(a[18], b[ 9])
699 + MUL15(a[19], b[ 8]);
700 t[28] = MUL15(a[ 9], b[19])
701 + MUL15(a[10], b[18])
702 + MUL15(a[11], b[17])
703 + MUL15(a[12], b[16])
704 + MUL15(a[13], b[15])
705 + MUL15(a[14], b[14])
706 + MUL15(a[15], b[13])
707 + MUL15(a[16], b[12])
708 + MUL15(a[17], b[11])
709 + MUL15(a[18], b[10])
710 + MUL15(a[19], b[ 9]);
711 t[29] = MUL15(a[10], b[19])
712 + MUL15(a[11], b[18])
713 + MUL15(a[12], b[17])
714 + MUL15(a[13], b[16])
715 + MUL15(a[14], b[15])
716 + MUL15(a[15], b[14])
717 + MUL15(a[16], b[13])
718 + MUL15(a[17], b[12])
719 + MUL15(a[18], b[11])
720 + MUL15(a[19], b[10]);
721 t[30] = MUL15(a[11], b[19])
722 + MUL15(a[12], b[18])
723 + MUL15(a[13], b[17])
724 + MUL15(a[14], b[16])
725 + MUL15(a[15], b[15])
726 + MUL15(a[16], b[14])
727 + MUL15(a[17], b[13])
728 + MUL15(a[18], b[12])
729 + MUL15(a[19], b[11]);
730 t[31] = MUL15(a[12], b[19])
731 + MUL15(a[13], b[18])
732 + MUL15(a[14], b[17])
733 + MUL15(a[15], b[16])
734 + MUL15(a[16], b[15])
735 + MUL15(a[17], b[14])
736 + MUL15(a[18], b[13])
737 + MUL15(a[19], b[12]);
738 t[32] = MUL15(a[13], b[19])
739 + MUL15(a[14], b[18])
740 + MUL15(a[15], b[17])
741 + MUL15(a[16], b[16])
742 + MUL15(a[17], b[15])
743 + MUL15(a[18], b[14])
744 + MUL15(a[19], b[13]);
745 t[33] = MUL15(a[14], b[19])
746 + MUL15(a[15], b[18])
747 + MUL15(a[16], b[17])
748 + MUL15(a[17], b[16])
749 + MUL15(a[18], b[15])
750 + MUL15(a[19], b[14]);
751 t[34] = MUL15(a[15], b[19])
752 + MUL15(a[16], b[18])
753 + MUL15(a[17], b[17])
754 + MUL15(a[18], b[16])
755 + MUL15(a[19], b[15]);
756 t[35] = MUL15(a[16], b[19])
757 + MUL15(a[17], b[18])
758 + MUL15(a[18], b[17])
759 + MUL15(a[19], b[16]);
760 t[36] = MUL15(a[17], b[19])
761 + MUL15(a[18], b[18])
762 + MUL15(a[19], b[17]);
763 t[37] = MUL15(a[18], b[19])
764 + MUL15(a[19], b[18]);
765 t[38] = MUL15(a[19], b[19]);
766 d[39] = norm13(d, t, 39);
767 }
768
769 static void
770 square20(uint32_t *d, const uint32_t *a)
771 {
772 uint32_t t[39];
773
774 t[ 0] = MUL15(a[ 0], a[ 0]);
775 t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
776 t[ 2] = MUL15(a[ 1], a[ 1])
777 + ((MUL15(a[ 0], a[ 2])) << 1);
778 t[ 3] = ((MUL15(a[ 0], a[ 3])
779 + MUL15(a[ 1], a[ 2])) << 1);
780 t[ 4] = MUL15(a[ 2], a[ 2])
781 + ((MUL15(a[ 0], a[ 4])
782 + MUL15(a[ 1], a[ 3])) << 1);
783 t[ 5] = ((MUL15(a[ 0], a[ 5])
784 + MUL15(a[ 1], a[ 4])
785 + MUL15(a[ 2], a[ 3])) << 1);
786 t[ 6] = MUL15(a[ 3], a[ 3])
787 + ((MUL15(a[ 0], a[ 6])
788 + MUL15(a[ 1], a[ 5])
789 + MUL15(a[ 2], a[ 4])) << 1);
790 t[ 7] = ((MUL15(a[ 0], a[ 7])
791 + MUL15(a[ 1], a[ 6])
792 + MUL15(a[ 2], a[ 5])
793 + MUL15(a[ 3], a[ 4])) << 1);
794 t[ 8] = MUL15(a[ 4], a[ 4])
795 + ((MUL15(a[ 0], a[ 8])
796 + MUL15(a[ 1], a[ 7])
797 + MUL15(a[ 2], a[ 6])
798 + MUL15(a[ 3], a[ 5])) << 1);
799 t[ 9] = ((MUL15(a[ 0], a[ 9])
800 + MUL15(a[ 1], a[ 8])
801 + MUL15(a[ 2], a[ 7])
802 + MUL15(a[ 3], a[ 6])
803 + MUL15(a[ 4], a[ 5])) << 1);
804 t[10] = MUL15(a[ 5], a[ 5])
805 + ((MUL15(a[ 0], a[10])
806 + MUL15(a[ 1], a[ 9])
807 + MUL15(a[ 2], a[ 8])
808 + MUL15(a[ 3], a[ 7])
809 + MUL15(a[ 4], a[ 6])) << 1);
810 t[11] = ((MUL15(a[ 0], a[11])
811 + MUL15(a[ 1], a[10])
812 + MUL15(a[ 2], a[ 9])
813 + MUL15(a[ 3], a[ 8])
814 + MUL15(a[ 4], a[ 7])
815 + MUL15(a[ 5], a[ 6])) << 1);
816 t[12] = MUL15(a[ 6], a[ 6])
817 + ((MUL15(a[ 0], a[12])
818 + MUL15(a[ 1], a[11])
819 + MUL15(a[ 2], a[10])
820 + MUL15(a[ 3], a[ 9])
821 + MUL15(a[ 4], a[ 8])
822 + MUL15(a[ 5], a[ 7])) << 1);
823 t[13] = ((MUL15(a[ 0], a[13])
824 + MUL15(a[ 1], a[12])
825 + MUL15(a[ 2], a[11])
826 + MUL15(a[ 3], a[10])
827 + MUL15(a[ 4], a[ 9])
828 + MUL15(a[ 5], a[ 8])
829 + MUL15(a[ 6], a[ 7])) << 1);
830 t[14] = MUL15(a[ 7], a[ 7])
831 + ((MUL15(a[ 0], a[14])
832 + MUL15(a[ 1], a[13])
833 + MUL15(a[ 2], a[12])
834 + MUL15(a[ 3], a[11])
835 + MUL15(a[ 4], a[10])
836 + MUL15(a[ 5], a[ 9])
837 + MUL15(a[ 6], a[ 8])) << 1);
838 t[15] = ((MUL15(a[ 0], a[15])
839 + MUL15(a[ 1], a[14])
840 + MUL15(a[ 2], a[13])
841 + MUL15(a[ 3], a[12])
842 + MUL15(a[ 4], a[11])
843 + MUL15(a[ 5], a[10])
844 + MUL15(a[ 6], a[ 9])
845 + MUL15(a[ 7], a[ 8])) << 1);
846 t[16] = MUL15(a[ 8], a[ 8])
847 + ((MUL15(a[ 0], a[16])
848 + MUL15(a[ 1], a[15])
849 + MUL15(a[ 2], a[14])
850 + MUL15(a[ 3], a[13])
851 + MUL15(a[ 4], a[12])
852 + MUL15(a[ 5], a[11])
853 + MUL15(a[ 6], a[10])
854 + MUL15(a[ 7], a[ 9])) << 1);
855 t[17] = ((MUL15(a[ 0], a[17])
856 + MUL15(a[ 1], a[16])
857 + MUL15(a[ 2], a[15])
858 + MUL15(a[ 3], a[14])
859 + MUL15(a[ 4], a[13])
860 + MUL15(a[ 5], a[12])
861 + MUL15(a[ 6], a[11])
862 + MUL15(a[ 7], a[10])
863 + MUL15(a[ 8], a[ 9])) << 1);
864 t[18] = MUL15(a[ 9], a[ 9])
865 + ((MUL15(a[ 0], a[18])
866 + MUL15(a[ 1], a[17])
867 + MUL15(a[ 2], a[16])
868 + MUL15(a[ 3], a[15])
869 + MUL15(a[ 4], a[14])
870 + MUL15(a[ 5], a[13])
871 + MUL15(a[ 6], a[12])
872 + MUL15(a[ 7], a[11])
873 + MUL15(a[ 8], a[10])) << 1);
874 t[19] = ((MUL15(a[ 0], a[19])
875 + MUL15(a[ 1], a[18])
876 + MUL15(a[ 2], a[17])
877 + MUL15(a[ 3], a[16])
878 + MUL15(a[ 4], a[15])
879 + MUL15(a[ 5], a[14])
880 + MUL15(a[ 6], a[13])
881 + MUL15(a[ 7], a[12])
882 + MUL15(a[ 8], a[11])
883 + MUL15(a[ 9], a[10])) << 1);
884 t[20] = MUL15(a[10], a[10])
885 + ((MUL15(a[ 1], a[19])
886 + MUL15(a[ 2], a[18])
887 + MUL15(a[ 3], a[17])
888 + MUL15(a[ 4], a[16])
889 + MUL15(a[ 5], a[15])
890 + MUL15(a[ 6], a[14])
891 + MUL15(a[ 7], a[13])
892 + MUL15(a[ 8], a[12])
893 + MUL15(a[ 9], a[11])) << 1);
894 t[21] = ((MUL15(a[ 2], a[19])
895 + MUL15(a[ 3], a[18])
896 + MUL15(a[ 4], a[17])
897 + MUL15(a[ 5], a[16])
898 + MUL15(a[ 6], a[15])
899 + MUL15(a[ 7], a[14])
900 + MUL15(a[ 8], a[13])
901 + MUL15(a[ 9], a[12])
902 + MUL15(a[10], a[11])) << 1);
903 t[22] = MUL15(a[11], a[11])
904 + ((MUL15(a[ 3], a[19])
905 + MUL15(a[ 4], a[18])
906 + MUL15(a[ 5], a[17])
907 + MUL15(a[ 6], a[16])
908 + MUL15(a[ 7], a[15])
909 + MUL15(a[ 8], a[14])
910 + MUL15(a[ 9], a[13])
911 + MUL15(a[10], a[12])) << 1);
912 t[23] = ((MUL15(a[ 4], a[19])
913 + MUL15(a[ 5], a[18])
914 + MUL15(a[ 6], a[17])
915 + MUL15(a[ 7], a[16])
916 + MUL15(a[ 8], a[15])
917 + MUL15(a[ 9], a[14])
918 + MUL15(a[10], a[13])
919 + MUL15(a[11], a[12])) << 1);
920 t[24] = MUL15(a[12], a[12])
921 + ((MUL15(a[ 5], a[19])
922 + MUL15(a[ 6], a[18])
923 + MUL15(a[ 7], a[17])
924 + MUL15(a[ 8], a[16])
925 + MUL15(a[ 9], a[15])
926 + MUL15(a[10], a[14])
927 + MUL15(a[11], a[13])) << 1);
928 t[25] = ((MUL15(a[ 6], a[19])
929 + MUL15(a[ 7], a[18])
930 + MUL15(a[ 8], a[17])
931 + MUL15(a[ 9], a[16])
932 + MUL15(a[10], a[15])
933 + MUL15(a[11], a[14])
934 + MUL15(a[12], a[13])) << 1);
935 t[26] = MUL15(a[13], a[13])
936 + ((MUL15(a[ 7], a[19])
937 + MUL15(a[ 8], a[18])
938 + MUL15(a[ 9], a[17])
939 + MUL15(a[10], a[16])
940 + MUL15(a[11], a[15])
941 + MUL15(a[12], a[14])) << 1);
942 t[27] = ((MUL15(a[ 8], a[19])
943 + MUL15(a[ 9], a[18])
944 + MUL15(a[10], a[17])
945 + MUL15(a[11], a[16])
946 + MUL15(a[12], a[15])
947 + MUL15(a[13], a[14])) << 1);
948 t[28] = MUL15(a[14], a[14])
949 + ((MUL15(a[ 9], a[19])
950 + MUL15(a[10], a[18])
951 + MUL15(a[11], a[17])
952 + MUL15(a[12], a[16])
953 + MUL15(a[13], a[15])) << 1);
954 t[29] = ((MUL15(a[10], a[19])
955 + MUL15(a[11], a[18])
956 + MUL15(a[12], a[17])
957 + MUL15(a[13], a[16])
958 + MUL15(a[14], a[15])) << 1);
959 t[30] = MUL15(a[15], a[15])
960 + ((MUL15(a[11], a[19])
961 + MUL15(a[12], a[18])
962 + MUL15(a[13], a[17])
963 + MUL15(a[14], a[16])) << 1);
964 t[31] = ((MUL15(a[12], a[19])
965 + MUL15(a[13], a[18])
966 + MUL15(a[14], a[17])
967 + MUL15(a[15], a[16])) << 1);
968 t[32] = MUL15(a[16], a[16])
969 + ((MUL15(a[13], a[19])
970 + MUL15(a[14], a[18])
971 + MUL15(a[15], a[17])) << 1);
972 t[33] = ((MUL15(a[14], a[19])
973 + MUL15(a[15], a[18])
974 + MUL15(a[16], a[17])) << 1);
975 t[34] = MUL15(a[17], a[17])
976 + ((MUL15(a[15], a[19])
977 + MUL15(a[16], a[18])) << 1);
978 t[35] = ((MUL15(a[16], a[19])
979 + MUL15(a[17], a[18])) << 1);
980 t[36] = MUL15(a[18], a[18])
981 + ((MUL15(a[17], a[19])) << 1);
982 t[37] = ((MUL15(a[18], a[19])) << 1);
983 t[38] = MUL15(a[19], a[19]);
984 d[39] = norm13(d, t, 39);
985 }
986
987 #endif
988
989 /*
990 * Modulus for field F256 (field for point coordinates in curve P-256).
991 */
992 static const uint32_t F256[] = {
993 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
994 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
995 0x0000, 0x1FF8, 0x1FFF, 0x01FF
996 };
997
998 /*
999 * The 'b' curve equation coefficient for P-256.
1000 */
1001 static const uint32_t P256_B[] = {
1002 0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
1003 0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
1004 0x0A3A, 0x0EC5, 0x118D, 0x00B5
1005 };
1006
1007 /*
1008 * Perform a "short reduction" in field F256 (field for curve P-256).
1009 * The source value should be less than 262 bits; on output, it will
1010 * be at most 257 bits, and less than twice the modulus.
1011 */
1012 static void
1013 reduce_f256(uint32_t *d)
1014 {
1015 uint32_t x;
1016
1017 x = d[19] >> 9;
1018 d[19] &= 0x01FF;
1019 d[17] += x << 3;
1020 d[14] -= x << 10;
1021 d[7] -= x << 5;
1022 d[0] += x;
1023 norm13(d, d, 20);
1024 }
1025
1026 /*
1027 * Perform a "final reduction" in field F256 (field for curve P-256).
1028 * The source value must be less than twice the modulus. If the value
1029 * is not lower than the modulus, then the modulus is subtracted and
1030 * this function returns 1; otherwise, it leaves it untouched and it
1031 * returns 0.
1032 */
1033 static uint32_t
1034 reduce_final_f256(uint32_t *d)
1035 {
1036 uint32_t t[20];
1037 uint32_t cc;
1038 int i;
1039
1040 memcpy(t, d, sizeof t);
1041 cc = 0;
1042 for (i = 0; i < 20; i ++) {
1043 uint32_t w;
1044
1045 w = t[i] - F256[i] - cc;
1046 cc = w >> 31;
1047 t[i] = w & 0x1FFF;
1048 }
1049 cc ^= 1;
1050 CCOPY(cc, d, t, sizeof t);
1051 return cc;
1052 }
1053
1054 /*
1055 * Perform a multiplication of two integers modulo
1056 * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
1057 * of 20 words, each containing 13 bits of data, in little-endian order.
1058 * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
1059 * on output, value fits on 257 bits and is lower than twice the modulus.
1060 */
1061 static void
1062 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
1063 {
1064 uint32_t t[40], cc;
1065 int i;
1066
1067 /*
1068 * Compute raw multiplication. All result words fit in 13 bits
1069 * each.
1070 */
1071 mul20(t, a, b);
1072
1073 /*
1074 * Modular reduction: each high word in added/subtracted where
1075 * necessary.
1076 *
1077 * The modulus is:
1078 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1079 * Therefore:
1080 * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1081 *
1082 * For a word x at bit offset n (n >= 256), we have:
1083 * x*2^n = x*2^(n-32) - x*2^(n-64)
1084 * - x*2^(n - 160) + x*2^(n-256) mod p
1085 *
1086 * Thus, we can nullify the high word if we reinject it at some
1087 * proper emplacements.
1088 */
1089 for (i = 39; i >= 20; i --) {
1090 uint32_t x;
1091
1092 x = t[i];
1093 t[i - 2] += ARSH(x, 6);
1094 t[i - 3] += (x << 7) & 0x1FFF;
1095 t[i - 4] -= ARSH(x, 12);
1096 t[i - 5] -= (x << 1) & 0x1FFF;
1097 t[i - 12] -= ARSH(x, 4);
1098 t[i - 13] -= (x << 9) & 0x1FFF;
1099 t[i - 19] += ARSH(x, 9);
1100 t[i - 20] += (x << 4) & 0x1FFF;
1101 }
1102
1103 /*
1104 * Propagate carries. This is a signed propagation, and the
1105 * result may be negative. The loop above may enlarge values,
1106 * but not two much: worst case is the chain involving t[i - 3],
1107 * in which a value may be added to itself up to 7 times. Since
1108 * starting values are 13-bit each, all words fit on 20 bits
1109 * (21 to account for the sign bit).
1110 */
1111 cc = norm13(t, t, 20);
1112
1113 /*
1114 * Perform modular reduction again for the bits beyond 256 (the carry
1115 * and the bits 256..259). Since the largest shift below is by 10
1116 * bits, and the values fit on 21 bits, values fit in 32-bit words,
1117 * thereby allowing injecting full word values.
1118 */
1119 cc = (cc << 4) | (t[19] >> 9);
1120 t[19] &= 0x01FF;
1121 t[17] += cc << 3;
1122 t[14] -= cc << 10;
1123 t[7] -= cc << 5;
1124 t[0] += cc;
1125 norm13(d, t, 20);
1126 }
1127
1128 /*
1129 * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
1130 * P-256). Operand is an array of 20 words, each containing 13 bits of
1131 * data, in little-endian order. On input, upper word may be up to 13
1132 * bits (hence value up to 2^260-1); on output, value fits on 257 bits
1133 * and is lower than twice the modulus.
1134 */
1135 static void
1136 square_f256(uint32_t *d, const uint32_t *a)
1137 {
1138 uint32_t t[40], cc;
1139 int i;
1140
1141 /*
1142 * Compute raw square. All result words fit in 13 bits each.
1143 */
1144 square20(t, a);
1145
1146 /*
1147 * Modular reduction: each high word in added/subtracted where
1148 * necessary.
1149 *
1150 * The modulus is:
1151 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1152 * Therefore:
1153 * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1154 *
1155 * For a word x at bit offset n (n >= 256), we have:
1156 * x*2^n = x*2^(n-32) - x*2^(n-64)
1157 * - x*2^(n - 160) + x*2^(n-256) mod p
1158 *
1159 * Thus, we can nullify the high word if we reinject it at some
1160 * proper emplacements.
1161 */
1162 for (i = 39; i >= 20; i --) {
1163 uint32_t x;
1164
1165 x = t[i];
1166 t[i - 2] += ARSH(x, 6);
1167 t[i - 3] += (x << 7) & 0x1FFF;
1168 t[i - 4] -= ARSH(x, 12);
1169 t[i - 5] -= (x << 1) & 0x1FFF;
1170 t[i - 12] -= ARSH(x, 4);
1171 t[i - 13] -= (x << 9) & 0x1FFF;
1172 t[i - 19] += ARSH(x, 9);
1173 t[i - 20] += (x << 4) & 0x1FFF;
1174 }
1175
1176 /*
1177 * Propagate carries. This is a signed propagation, and the
1178 * result may be negative. The loop above may enlarge values,
1179 * but not two much: worst case is the chain involving t[i - 3],
1180 * in which a value may be added to itself up to 7 times. Since
1181 * starting values are 13-bit each, all words fit on 20 bits
1182 * (21 to account for the sign bit).
1183 */
1184 cc = norm13(t, t, 20);
1185
1186 /*
1187 * Perform modular reduction again for the bits beyond 256 (the carry
1188 * and the bits 256..259). Since the largest shift below is by 10
1189 * bits, and the values fit on 21 bits, values fit in 32-bit words,
1190 * thereby allowing injecting full word values.
1191 */
1192 cc = (cc << 4) | (t[19] >> 9);
1193 t[19] &= 0x01FF;
1194 t[17] += cc << 3;
1195 t[14] -= cc << 10;
1196 t[7] -= cc << 5;
1197 t[0] += cc;
1198 norm13(d, t, 20);
1199 }
1200
1201 /*
1202 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
1203 * are such that:
1204 * X = x / z^2
1205 * Y = y / z^3
1206 * For the point at infinity, z = 0.
1207 * Each point thus admits many possible representations.
1208 *
1209 * Coordinates are represented in arrays of 32-bit integers, each holding
1210 * 13 bits of data. Values may also be slightly greater than the modulus,
1211 * but they will always be lower than twice the modulus.
1212 */
1213 typedef struct {
1214 uint32_t x[20];
1215 uint32_t y[20];
1216 uint32_t z[20];
1217 } p256_jacobian;
1218
1219 /*
1220 * Convert a point to affine coordinates:
1221 * - If the point is the point at infinity, then all three coordinates
1222 * are set to 0.
1223 * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
1224 * coordinates are the 'X' and 'Y' affine coordinates.
1225 * The coordinates are guaranteed to be lower than the modulus.
1226 */
1227 static void
1228 p256_to_affine(p256_jacobian *P)
1229 {
1230 uint32_t t1[20], t2[20];
1231 int i;
1232
1233 /*
1234 * Invert z with a modular exponentiation: the modulus is
1235 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
1236 * p-2. Exponent bit pattern (from high to low) is:
1237 * - 32 bits of value 1
1238 * - 31 bits of value 0
1239 * - 1 bit of value 1
1240 * - 96 bits of value 0
1241 * - 94 bits of value 1
1242 * - 1 bit of value 0
1243 * - 1 bit of value 1
1244 * Thus, we precompute z^(2^31-1) to speed things up.
1245 *
1246 * If z = 0 (point at infinity) then the modular exponentiation
1247 * will yield 0, which leads to the expected result (all three
1248 * coordinates set to 0).
1249 */
1250
1251 /*
1252 * A simple square-and-multiply for z^(2^31-1). We could save about
1253 * two dozen multiplications here with an addition chain, but
1254 * this would require a bit more code, and extra stack buffers.
1255 */
1256 memcpy(t1, P->z, sizeof P->z);
1257 for (i = 0; i < 30; i ++) {
1258 square_f256(t1, t1);
1259 mul_f256(t1, t1, P->z);
1260 }
1261
1262 /*
1263 * Square-and-multiply. Apart from the squarings, we have a few
1264 * multiplications to set bits to 1; we multiply by the original z
1265 * for setting 1 bit, and by t1 for setting 31 bits.
1266 */
1267 memcpy(t2, P->z, sizeof P->z);
1268 for (i = 1; i < 256; i ++) {
1269 square_f256(t2, t2);
1270 switch (i) {
1271 case 31:
1272 case 190:
1273 case 221:
1274 case 252:
1275 mul_f256(t2, t2, t1);
1276 break;
1277 case 63:
1278 case 253:
1279 case 255:
1280 mul_f256(t2, t2, P->z);
1281 break;
1282 }
1283 }
1284
1285 /*
1286 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
1287 */
1288 mul_f256(t1, t2, t2);
1289 mul_f256(P->x, t1, P->x);
1290 mul_f256(t1, t1, t2);
1291 mul_f256(P->y, t1, P->y);
1292 reduce_final_f256(P->x);
1293 reduce_final_f256(P->y);
1294
1295 /*
1296 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
1297 * this will set z to 1.
1298 */
1299 mul_f256(P->z, P->z, t2);
1300 reduce_final_f256(P->z);
1301 }
1302
1303 /*
1304 * Double a point in P-256. This function works for all valid points,
1305 * including the point at infinity.
1306 */
1307 static void
1308 p256_double(p256_jacobian *Q)
1309 {
1310 /*
1311 * Doubling formulas are:
1312 *
1313 * s = 4*x*y^2
1314 * m = 3*(x + z^2)*(x - z^2)
1315 * x' = m^2 - 2*s
1316 * y' = m*(s - x') - 8*y^4
1317 * z' = 2*y*z
1318 *
1319 * These formulas work for all points, including points of order 2
1320 * and points at infinity:
1321 * - If y = 0 then z' = 0. But there is no such point in P-256
1322 * anyway.
1323 * - If z = 0 then z' = 0.
1324 */
1325 uint32_t t1[20], t2[20], t3[20], t4[20];
1326 int i;
1327
1328 /*
1329 * Compute z^2 in t1.
1330 */
1331 square_f256(t1, Q->z);
1332
1333 /*
1334 * Compute x-z^2 in t2 and x+z^2 in t1.
1335 */
1336 for (i = 0; i < 20; i ++) {
1337 t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
1338 t1[i] += Q->x[i];
1339 }
1340 norm13(t1, t1, 20);
1341 norm13(t2, t2, 20);
1342
1343 /*
1344 * Compute 3*(x+z^2)*(x-z^2) in t1.
1345 */
1346 mul_f256(t3, t1, t2);
1347 for (i = 0; i < 20; i ++) {
1348 t1[i] = MUL15(3, t3[i]);
1349 }
1350 norm13(t1, t1, 20);
1351
1352 /*
1353 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
1354 */
1355 square_f256(t3, Q->y);
1356 for (i = 0; i < 20; i ++) {
1357 t3[i] <<= 1;
1358 }
1359 norm13(t3, t3, 20);
1360 mul_f256(t2, Q->x, t3);
1361 for (i = 0; i < 20; i ++) {
1362 t2[i] <<= 1;
1363 }
1364 norm13(t2, t2, 20);
1365 reduce_f256(t2);
1366
1367 /*
1368 * Compute x' = m^2 - 2*s.
1369 */
1370 square_f256(Q->x, t1);
1371 for (i = 0; i < 20; i ++) {
1372 Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
1373 }
1374 norm13(Q->x, Q->x, 20);
1375 reduce_f256(Q->x);
1376
1377 /*
1378 * Compute z' = 2*y*z.
1379 */
1380 mul_f256(t4, Q->y, Q->z);
1381 for (i = 0; i < 20; i ++) {
1382 Q->z[i] = t4[i] << 1;
1383 }
1384 norm13(Q->z, Q->z, 20);
1385 reduce_f256(Q->z);
1386
1387 /*
1388 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
1389 * 2*y^2 in t3.
1390 */
1391 for (i = 0; i < 20; i ++) {
1392 t2[i] += (F256[i] << 1) - Q->x[i];
1393 }
1394 norm13(t2, t2, 20);
1395 mul_f256(Q->y, t1, t2);
1396 square_f256(t4, t3);
1397 for (i = 0; i < 20; i ++) {
1398 Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
1399 }
1400 norm13(Q->y, Q->y, 20);
1401 reduce_f256(Q->y);
1402 }
1403
1404 /*
1405 * Add point P2 to point P1.
1406 *
1407 * This function computes the wrong result in the following cases:
1408 *
1409 * - If P1 == 0 but P2 != 0
1410 * - If P1 != 0 but P2 == 0
1411 * - If P1 == P2
1412 *
1413 * In all three cases, P1 is set to the point at infinity.
1414 *
1415 * Returned value is 0 if one of the following occurs:
1416 *
1417 * - P1 and P2 have the same Y coordinate
1418 * - P1 == 0 and P2 == 0
1419 * - The Y coordinate of one of the points is 0 and the other point is
1420 * the point at infinity.
1421 *
1422 * The third case cannot actually happen with valid points, since a point
1423 * with Y == 0 is a point of order 2, and there is no point of order 2 on
1424 * curve P-256.
1425 *
1426 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
1427 * can apply the following:
1428 *
1429 * - If the result is not the point at infinity, then it is correct.
1430 * - Otherwise, if the returned value is 1, then this is a case of
1431 * P1+P2 == 0, so the result is indeed the point at infinity.
1432 * - Otherwise, P1 == P2, so a "double" operation should have been
1433 * performed.
1434 */
1435 static uint32_t
1436 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
1437 {
1438 /*
1439 * Addtions formulas are:
1440 *
1441 * u1 = x1 * z2^2
1442 * u2 = x2 * z1^2
1443 * s1 = y1 * z2^3
1444 * s2 = y2 * z1^3
1445 * h = u2 - u1
1446 * r = s2 - s1
1447 * x3 = r^2 - h^3 - 2 * u1 * h^2
1448 * y3 = r * (u1 * h^2 - x3) - s1 * h^3
1449 * z3 = h * z1 * z2
1450 */
1451 uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1452 uint32_t ret;
1453 int i;
1454
1455 /*
1456 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
1457 */
1458 square_f256(t3, P2->z);
1459 mul_f256(t1, P1->x, t3);
1460 mul_f256(t4, P2->z, t3);
1461 mul_f256(t3, P1->y, t4);
1462
1463 /*
1464 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1465 */
1466 square_f256(t4, P1->z);
1467 mul_f256(t2, P2->x, t4);
1468 mul_f256(t5, P1->z, t4);
1469 mul_f256(t4, P2->y, t5);
1470
1471 /*
1472 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1473 * We need to test whether r is zero, so we will do some extra
1474 * reduce.
1475 */
1476 for (i = 0; i < 20; i ++) {
1477 t2[i] += (F256[i] << 1) - t1[i];
1478 t4[i] += (F256[i] << 1) - t3[i];
1479 }
1480 norm13(t2, t2, 20);
1481 norm13(t4, t4, 20);
1482 reduce_f256(t4);
1483 reduce_final_f256(t4);
1484 ret = 0;
1485 for (i = 0; i < 20; i ++) {
1486 ret |= t4[i];
1487 }
1488 ret = (ret | -ret) >> 31;
1489
1490 /*
1491 * Compute u1*h^2 (in t6) and h^3 (in t5);
1492 */
1493 square_f256(t7, t2);
1494 mul_f256(t6, t1, t7);
1495 mul_f256(t5, t7, t2);
1496
1497 /*
1498 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1499 */
1500 square_f256(P1->x, t4);
1501 for (i = 0; i < 20; i ++) {
1502 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1503 }
1504 norm13(P1->x, P1->x, 20);
1505 reduce_f256(P1->x);
1506
1507 /*
1508 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1509 */
1510 for (i = 0; i < 20; i ++) {
1511 t6[i] += (F256[i] << 1) - P1->x[i];
1512 }
1513 norm13(t6, t6, 20);
1514 mul_f256(P1->y, t4, t6);
1515 mul_f256(t1, t5, t3);
1516 for (i = 0; i < 20; i ++) {
1517 P1->y[i] += (F256[i] << 1) - t1[i];
1518 }
1519 norm13(P1->y, P1->y, 20);
1520 reduce_f256(P1->y);
1521
1522 /*
1523 * Compute z3 = h*z1*z2.
1524 */
1525 mul_f256(t1, P1->z, P2->z);
1526 mul_f256(P1->z, t1, t2);
1527
1528 return ret;
1529 }
1530
1531 /*
1532 * Add point P2 to point P1. This is a specialised function for the
1533 * case when P2 is a non-zero point in affine coordinate.
1534 *
1535 * This function computes the wrong result in the following cases:
1536 *
1537 * - If P1 == 0
1538 * - If P1 == P2
1539 *
1540 * In both cases, P1 is set to the point at infinity.
1541 *
1542 * Returned value is 0 if one of the following occurs:
1543 *
1544 * - P1 and P2 have the same Y coordinate
1545 * - The Y coordinate of P2 is 0 and P1 is the point at infinity.
1546 *
1547 * The second case cannot actually happen with valid points, since a point
1548 * with Y == 0 is a point of order 2, and there is no point of order 2 on
1549 * curve P-256.
1550 *
1551 * Therefore, assuming that P1 != 0 on input, then the caller
1552 * can apply the following:
1553 *
1554 * - If the result is not the point at infinity, then it is correct.
1555 * - Otherwise, if the returned value is 1, then this is a case of
1556 * P1+P2 == 0, so the result is indeed the point at infinity.
1557 * - Otherwise, P1 == P2, so a "double" operation should have been
1558 * performed.
1559 */
1560 static uint32_t
1561 p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
1562 {
1563 /*
1564 * Addtions formulas are:
1565 *
1566 * u1 = x1
1567 * u2 = x2 * z1^2
1568 * s1 = y1
1569 * s2 = y2 * z1^3
1570 * h = u2 - u1
1571 * r = s2 - s1
1572 * x3 = r^2 - h^3 - 2 * u1 * h^2
1573 * y3 = r * (u1 * h^2 - x3) - s1 * h^3
1574 * z3 = h * z1
1575 */
1576 uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1577 uint32_t ret;
1578 int i;
1579
1580 /*
1581 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
1582 */
1583 memcpy(t1, P1->x, sizeof t1);
1584 memcpy(t3, P1->y, sizeof t3);
1585
1586 /*
1587 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1588 */
1589 square_f256(t4, P1->z);
1590 mul_f256(t2, P2->x, t4);
1591 mul_f256(t5, P1->z, t4);
1592 mul_f256(t4, P2->y, t5);
1593
1594 /*
1595 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1596 * We need to test whether r is zero, so we will do some extra
1597 * reduce.
1598 */
1599 for (i = 0; i < 20; i ++) {
1600 t2[i] += (F256[i] << 1) - t1[i];
1601 t4[i] += (F256[i] << 1) - t3[i];
1602 }
1603 norm13(t2, t2, 20);
1604 norm13(t4, t4, 20);
1605 reduce_f256(t4);
1606 reduce_final_f256(t4);
1607 ret = 0;
1608 for (i = 0; i < 20; i ++) {
1609 ret |= t4[i];
1610 }
1611 ret = (ret | -ret) >> 31;
1612
1613 /*
1614 * Compute u1*h^2 (in t6) and h^3 (in t5);
1615 */
1616 square_f256(t7, t2);
1617 mul_f256(t6, t1, t7);
1618 mul_f256(t5, t7, t2);
1619
1620 /*
1621 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1622 */
1623 square_f256(P1->x, t4);
1624 for (i = 0; i < 20; i ++) {
1625 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1626 }
1627 norm13(P1->x, P1->x, 20);
1628 reduce_f256(P1->x);
1629
1630 /*
1631 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1632 */
1633 for (i = 0; i < 20; i ++) {
1634 t6[i] += (F256[i] << 1) - P1->x[i];
1635 }
1636 norm13(t6, t6, 20);
1637 mul_f256(P1->y, t4, t6);
1638 mul_f256(t1, t5, t3);
1639 for (i = 0; i < 20; i ++) {
1640 P1->y[i] += (F256[i] << 1) - t1[i];
1641 }
1642 norm13(P1->y, P1->y, 20);
1643 reduce_f256(P1->y);
1644
1645 /*
1646 * Compute z3 = h*z1*z2.
1647 */
1648 mul_f256(P1->z, P1->z, t2);
1649
1650 return ret;
1651 }
1652
1653 /*
1654 * Decode a P-256 point. This function does not support the point at
1655 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1656 */
1657 static uint32_t
1658 p256_decode(p256_jacobian *P, const void *src, size_t len)
1659 {
1660 const unsigned char *buf;
1661 uint32_t tx[20], ty[20], t1[20], t2[20];
1662 uint32_t bad;
1663 int i;
1664
1665 if (len != 65) {
1666 return 0;
1667 }
1668 buf = src;
1669
1670 /*
1671 * First byte must be 0x04 (uncompressed format). We could support
1672 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1673 * least significant bit of the Y coordinate), but it is explicitly
1674 * forbidden by RFC 5480 (section 2.2).
1675 */
1676 bad = NEQ(buf[0], 0x04);
1677
1678 /*
1679 * Decode the coordinates, and check that they are both lower
1680 * than the modulus.
1681 */
1682 tx[19] = be8_to_le13(tx, buf + 1, 32);
1683 ty[19] = be8_to_le13(ty, buf + 33, 32);
1684 bad |= reduce_final_f256(tx);
1685 bad |= reduce_final_f256(ty);
1686
1687 /*
1688 * Check curve equation.
1689 */
1690 square_f256(t1, tx);
1691 mul_f256(t1, tx, t1);
1692 square_f256(t2, ty);
1693 for (i = 0; i < 20; i ++) {
1694 t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
1695 }
1696 norm13(t1, t1, 20);
1697 reduce_f256(t1);
1698 reduce_final_f256(t1);
1699 for (i = 0; i < 20; i ++) {
1700 bad |= t1[i];
1701 }
1702
1703 /*
1704 * Copy coordinates to the point structure.
1705 */
1706 memcpy(P->x, tx, sizeof tx);
1707 memcpy(P->y, ty, sizeof ty);
1708 memset(P->z, 0, sizeof P->z);
1709 P->z[0] = 1;
1710 return NEQ(bad, 0) ^ 1;
1711 }
1712
1713 /*
1714 * Encode a point into a buffer. This function assumes that the point is
1715 * valid, in affine coordinates, and not the point at infinity.
1716 */
1717 static void
1718 p256_encode(void *dst, const p256_jacobian *P)
1719 {
1720 unsigned char *buf;
1721
1722 buf = dst;
1723 buf[0] = 0x04;
1724 le13_to_be8(buf + 1, 32, P->x);
1725 le13_to_be8(buf + 33, 32, P->y);
1726 }
1727
1728 /*
1729 * Multiply a curve point by an integer. The integer is assumed to be
1730 * lower than the curve order, and the base point must not be the point
1731 * at infinity.
1732 */
1733 static void
1734 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1735 {
1736 /*
1737 * qz is a flag that is initially 1, and remains equal to 1
1738 * as long as the point is the point at infinity.
1739 *
1740 * We use a 2-bit window to handle multiplier bits by pairs.
1741 * The precomputed window really is the points P2 and P3.
1742 */
1743 uint32_t qz;
1744 p256_jacobian P2, P3, Q, T, U;
1745
1746 /*
1747 * Compute window values.
1748 */
1749 P2 = *P;
1750 p256_double(&P2);
1751 P3 = *P;
1752 p256_add(&P3, &P2);
1753
1754 /*
1755 * We start with Q = 0. We process multiplier bits 2 by 2.
1756 */
1757 memset(&Q, 0, sizeof Q);
1758 qz = 1;
1759 while (xlen -- > 0) {
1760 int k;
1761
1762 for (k = 6; k >= 0; k -= 2) {
1763 uint32_t bits;
1764 uint32_t bnz;
1765
1766 p256_double(&Q);
1767 p256_double(&Q);
1768 T = *P;
1769 U = Q;
1770 bits = (*x >> k) & (uint32_t)3;
1771 bnz = NEQ(bits, 0);
1772 CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1773 CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1774 p256_add(&U, &T);
1775 CCOPY(bnz & qz, &Q, &T, sizeof Q);
1776 CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1777 qz &= ~bnz;
1778 }
1779 x ++;
1780 }
1781 *P = Q;
1782 }
1783
1784 /*
1785 * Precomputed window: k*G points, where G is the curve generator, and k
1786 * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1787 * the point are encoded as 20 words of 13 bits each (little-endian
1788 * order); 13-bit words are then grouped 2-by-2 into 32-bit words
1789 * (little-endian order within each word).
1790 */
1791 static const uint32_t Gwin[15][20] = {
1792
1793 { 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC,
1794 0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2,
1795 0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D,
1796 0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785,
1797 0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 },
1798
1799 { 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8,
1800 0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29,
1801 0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748,
1802 0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36,
1803 0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 },
1804
1805 { 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6,
1806 0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323,
1807 0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8,
1808 0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB,
1809 0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 },
1810
1811 { 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156,
1812 0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80,
1813 0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6,
1814 0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D,
1815 0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 },
1816
1817 { 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F,
1818 0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E,
1819 0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F,
1820 0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49,
1821 0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F },
1822
1823 { 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8,
1824 0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B,
1825 0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3,
1826 0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE,
1827 0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 },
1828
1829 { 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F,
1830 0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896,
1831 0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0,
1832 0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400,
1833 0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 },
1834
1835 { 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A,
1836 0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01,
1837 0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83,
1838 0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8,
1839 0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 },
1840
1841 { 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7,
1842 0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E,
1843 0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293,
1844 0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832,
1845 0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 },
1846
1847 { 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120,
1848 0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964,
1849 0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91,
1850 0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304,
1851 0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 },
1852
1853 { 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541,
1854 0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418,
1855 0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A,
1856 0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2,
1857 0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 },
1858
1859 { 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0,
1860 0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918,
1861 0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3,
1862 0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55,
1863 0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D },
1864
1865 { 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB,
1866 0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986,
1867 0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB,
1868 0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C,
1869 0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 },
1870
1871 { 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A,
1872 0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9,
1873 0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8,
1874 0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0,
1875 0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C },
1876
1877 { 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8,
1878 0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7,
1879 0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783,
1880 0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9,
1881 0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F }
1882 };
1883
1884 /*
1885 * Lookup one of the Gwin[] values, by index. This is constant-time.
1886 */
1887 static void
1888 lookup_Gwin(p256_jacobian *T, uint32_t idx)
1889 {
1890 uint32_t xy[20];
1891 uint32_t k;
1892 size_t u;
1893
1894 memset(xy, 0, sizeof xy);
1895 for (k = 0; k < 15; k ++) {
1896 uint32_t m;
1897
1898 m = -EQ(idx, k + 1);
1899 for (u = 0; u < 20; u ++) {
1900 xy[u] |= m & Gwin[k][u];
1901 }
1902 }
1903 for (u = 0; u < 10; u ++) {
1904 T->x[(u << 1) + 0] = xy[u] & 0xFFFF;
1905 T->x[(u << 1) + 1] = xy[u] >> 16;
1906 T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF;
1907 T->y[(u << 1) + 1] = xy[u + 10] >> 16;
1908 }
1909 memset(T->z, 0, sizeof T->z);
1910 T->z[0] = 1;
1911 }
1912
1913 /*
1914 * Multiply the generator by an integer. The integer is assumed non-zero
1915 * and lower than the curve order.
1916 */
1917 static void
1918 p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1919 {
1920 /*
1921 * qz is a flag that is initially 1, and remains equal to 1
1922 * as long as the point is the point at infinity.
1923 *
1924 * We use a 4-bit window to handle multiplier bits by groups
1925 * of 4. The precomputed window is constant static data, with
1926 * points in affine coordinates; we use a constant-time lookup.
1927 */
1928 p256_jacobian Q;
1929 uint32_t qz;
1930
1931 memset(&Q, 0, sizeof Q);
1932 qz = 1;
1933 while (xlen -- > 0) {
1934 int k;
1935 unsigned bx;
1936
1937 bx = *x ++;
1938 for (k = 0; k < 2; k ++) {
1939 uint32_t bits;
1940 uint32_t bnz;
1941 p256_jacobian T, U;
1942
1943 p256_double(&Q);
1944 p256_double(&Q);
1945 p256_double(&Q);
1946 p256_double(&Q);
1947 bits = (bx >> 4) & 0x0F;
1948 bnz = NEQ(bits, 0);
1949 lookup_Gwin(&T, bits);
1950 U = Q;
1951 p256_add_mixed(&U, &T);
1952 CCOPY(bnz & qz, &Q, &T, sizeof Q);
1953 CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1954 qz &= ~bnz;
1955 bx <<= 4;
1956 }
1957 }
1958 *P = Q;
1959 }
1960
1961 static const unsigned char P256_G[] = {
1962 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1963 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1964 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1965 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1966 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1967 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1968 0x68, 0x37, 0xBF, 0x51, 0xF5
1969 };
1970
1971 static const unsigned char P256_N[] = {
1972 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1973 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1974 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1975 0x25, 0x51
1976 };
1977
1978 static const unsigned char *
1979 api_generator(int curve, size_t *len)
1980 {
1981 (void)curve;
1982 *len = sizeof P256_G;
1983 return P256_G;
1984 }
1985
1986 static const unsigned char *
1987 api_order(int curve, size_t *len)
1988 {
1989 (void)curve;
1990 *len = sizeof P256_N;
1991 return P256_N;
1992 }
1993
1994 static size_t
1995 api_xoff(int curve, size_t *len)
1996 {
1997 (void)curve;
1998 *len = 32;
1999 return 1;
2000 }
2001
2002 static uint32_t
2003 api_mul(unsigned char *G, size_t Glen,
2004 const unsigned char *x, size_t xlen, int curve)
2005 {
2006 uint32_t r;
2007 p256_jacobian P;
2008
2009 (void)curve;
2010 r = p256_decode(&P, G, Glen);
2011 p256_mul(&P, x, xlen);
2012 if (Glen >= 65) {
2013 p256_to_affine(&P);
2014 p256_encode(G, &P);
2015 }
2016 return r;
2017 }
2018
2019 static size_t
2020 api_mulgen(unsigned char *R,
2021 const unsigned char *x, size_t xlen, int curve)
2022 {
2023 p256_jacobian P;
2024
2025 (void)curve;
2026 p256_mulgen(&P, x, xlen);
2027 p256_to_affine(&P);
2028 p256_encode(R, &P);
2029 return 65;
2030
2031 /*
2032 const unsigned char *G;
2033 size_t Glen;
2034
2035 G = api_generator(curve, &Glen);
2036 memcpy(R, G, Glen);
2037 api_mul(R, Glen, x, xlen, curve);
2038 return Glen;
2039 */
2040 }
2041
2042 static uint32_t
2043 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
2044 const unsigned char *x, size_t xlen,
2045 const unsigned char *y, size_t ylen, int curve)
2046 {
2047 p256_jacobian P, Q;
2048 uint32_t r, t, z;
2049 int i;
2050
2051 (void)curve;
2052 r = p256_decode(&P, A, len);
2053 p256_mul(&P, x, xlen);
2054 if (B == NULL) {
2055 p256_mulgen(&Q, y, ylen);
2056 } else {
2057 r &= p256_decode(&Q, B, len);
2058 p256_mul(&Q, y, ylen);
2059 }
2060
2061 /*
2062 * The final addition may fail in case both points are equal.
2063 */
2064 t = p256_add(&P, &Q);
2065 reduce_final_f256(P.z);
2066 z = 0;
2067 for (i = 0; i < 20; i ++) {
2068 z |= P.z[i];
2069 }
2070 z = EQ(z, 0);
2071 p256_double(&Q);
2072
2073 /*
2074 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
2075 * have the following:
2076 *
2077 * z = 0, t = 0 return P (normal addition)
2078 * z = 0, t = 1 return P (normal addition)
2079 * z = 1, t = 0 return Q (a 'double' case)
2080 * z = 1, t = 1 report an error (P+Q = 0)
2081 */
2082 CCOPY(z & ~t, &P, &Q, sizeof Q);
2083 p256_to_affine(&P);
2084 p256_encode(A, &P);
2085 r &= ~(z & t);
2086 return r;
2087 }
2088
2089 /* see bearssl_ec.h */
2090 const br_ec_impl br_ec_p256_m15 = {
2091 (uint32_t)0x00800000,
2092 &api_generator,
2093 &api_order,
2094 &api_xoff,
2095 &api_mul,
2096 &api_mulgen,
2097 &api_muladd
2098 };