2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29 * that right-shifting a signed negative integer copies the sign bit
30 * (arithmetic right-shift). This is "implementation-defined behaviour",
31 * i.e. it is not undefined, but it may differ between compilers. Each
32 * compiler is supposed to document its behaviour in that respect. GCC
33 * explicitly defines that an arithmetic right shift is used. We expect
34 * all other compilers to do the same, because underlying CPU offer an
35 * arithmetic right shift opcode that could not be used otherwise.
38 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \
39 | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
41 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
45 * Convert an integer from unsigned big-endian encoding to a sequence of
46 * 13-bit words in little-endian order. The final "partial" word is
50 be8_to_le13(uint32_t *dst
, const unsigned char *src
, size_t len
)
58 acc
|= (uint32_t)src
[len
] << acc_len
;
61 *dst
++ = acc
& 0x1FFF;
70 * Convert an integer (13-bit words, little-endian) to unsigned
71 * big-endian encoding. The total encoding length is provided; all
72 * the destination bytes will be filled.
75 le13_to_be8(unsigned char *dst
, size_t len
, const uint32_t *src
)
84 acc
|= (*src
++) << acc_len
;
87 dst
[len
] = (unsigned char)acc
;
94 * Normalise an array of words to a strict 13 bits per word. Returned
95 * value is the resulting carry. The source (w) and destination (d)
96 * arrays may be identical, but shall not overlap partially.
98 static inline uint32_t
99 norm13(uint32_t *d
, const uint32_t *w
, size_t len
)
105 for (u
= 0; u
< len
; u
++) {
116 * mul20() multiplies two 260-bit integers together. Each word must fit
117 * on 13 bits; source operands use 20 words, destination operand
118 * receives 40 words. All overlaps allowed.
126 mul20(uint32_t *d
, const uint32_t *a
, const uint32_t *b
)
129 * Two-level Karatsuba: turns a 20x20 multiplication into
130 * nine 5x5 multiplications. We use 13-bit words but do not
131 * propagate carries immediately, so words may expand:
133 * - First Karatsuba decomposition turns the 20x20 mul on
134 * 13-bit words into three 10x10 muls, two on 13-bit words
135 * and one on 14-bit words.
137 * - Second Karatsuba decomposition further splits these into:
139 * * four 5x5 muls on 13-bit words
140 * * four 5x5 muls on 14-bit words
141 * * one 5x5 mul on 15-bit words
143 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
144 * or 15-bit words, respectively.
146 uint32_t u
[45], v
[45], w
[90];
150 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
151 (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
152 + (s2w)[5 * (s2_off) + 0]; \
153 (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
154 + (s2w)[5 * (s2_off) + 1]; \
155 (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
156 + (s2w)[5 * (s2_off) + 2]; \
157 (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
158 + (s2w)[5 * (s2_off) + 3]; \
159 (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
160 + (s2w)[5 * (s2_off) + 4]; \
163 #define ZADDT(dw, d_off, sw, s_off) do { \
164 (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
165 (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
166 (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
167 (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
168 (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
171 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
172 (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
173 + (s2w)[5 * (s2_off) + 0]; \
174 (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
175 + (s2w)[5 * (s2_off) + 1]; \
176 (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
177 + (s2w)[5 * (s2_off) + 2]; \
178 (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
179 + (s2w)[5 * (s2_off) + 3]; \
180 (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
181 + (s2w)[5 * (s2_off) + 4]; \
184 #define CPR1(w, cprcc) do { \
185 uint32_t cprz = (w) + cprcc; \
186 (w) = cprz & 0x1FFF; \
187 cprcc = cprz >> 13; \
190 #define CPR(dw, d_off) do { \
193 CPR1((dw)[(d_off) + 0], cprcc); \
194 CPR1((dw)[(d_off) + 1], cprcc); \
195 CPR1((dw)[(d_off) + 2], cprcc); \
196 CPR1((dw)[(d_off) + 3], cprcc); \
197 CPR1((dw)[(d_off) + 4], cprcc); \
198 CPR1((dw)[(d_off) + 5], cprcc); \
199 CPR1((dw)[(d_off) + 6], cprcc); \
200 CPR1((dw)[(d_off) + 7], cprcc); \
201 CPR1((dw)[(d_off) + 8], cprcc); \
202 (dw)[(d_off) + 9] = cprcc; \
205 memcpy(u
, a
, 20 * sizeof *a
);
206 ZADD(u
, 4, a
, 0, a
, 1);
207 ZADD(u
, 5, a
, 2, a
, 3);
208 ZADD(u
, 6, a
, 0, a
, 2);
209 ZADD(u
, 7, a
, 1, a
, 3);
210 ZADD(u
, 8, u
, 6, u
, 7);
212 memcpy(v
, b
, 20 * sizeof *b
);
213 ZADD(v
, 4, b
, 0, b
, 1);
214 ZADD(v
, 5, b
, 2, b
, 3);
215 ZADD(v
, 6, b
, 0, b
, 2);
216 ZADD(v
, 7, b
, 1, b
, 3);
217 ZADD(v
, 8, v
, 6, v
, 7);
220 * Do the eight first 8x8 muls. Source words are at most 16382
221 * each, so we can add product results together "as is" in 32-bit
224 for (i
= 0; i
< 40; i
+= 5) {
225 w
[(i
<< 1) + 0] = MUL15(u
[i
+ 0], v
[i
+ 0]);
226 w
[(i
<< 1) + 1] = MUL15(u
[i
+ 0], v
[i
+ 1])
227 + MUL15(u
[i
+ 1], v
[i
+ 0]);
228 w
[(i
<< 1) + 2] = MUL15(u
[i
+ 0], v
[i
+ 2])
229 + MUL15(u
[i
+ 1], v
[i
+ 1])
230 + MUL15(u
[i
+ 2], v
[i
+ 0]);
231 w
[(i
<< 1) + 3] = MUL15(u
[i
+ 0], v
[i
+ 3])
232 + MUL15(u
[i
+ 1], v
[i
+ 2])
233 + MUL15(u
[i
+ 2], v
[i
+ 1])
234 + MUL15(u
[i
+ 3], v
[i
+ 0]);
235 w
[(i
<< 1) + 4] = MUL15(u
[i
+ 0], v
[i
+ 4])
236 + MUL15(u
[i
+ 1], v
[i
+ 3])
237 + MUL15(u
[i
+ 2], v
[i
+ 2])
238 + MUL15(u
[i
+ 3], v
[i
+ 1])
239 + MUL15(u
[i
+ 4], v
[i
+ 0]);
240 w
[(i
<< 1) + 5] = MUL15(u
[i
+ 1], v
[i
+ 4])
241 + MUL15(u
[i
+ 2], v
[i
+ 3])
242 + MUL15(u
[i
+ 3], v
[i
+ 2])
243 + MUL15(u
[i
+ 4], v
[i
+ 1]);
244 w
[(i
<< 1) + 6] = MUL15(u
[i
+ 2], v
[i
+ 4])
245 + MUL15(u
[i
+ 3], v
[i
+ 3])
246 + MUL15(u
[i
+ 4], v
[i
+ 2]);
247 w
[(i
<< 1) + 7] = MUL15(u
[i
+ 3], v
[i
+ 4])
248 + MUL15(u
[i
+ 4], v
[i
+ 3]);
249 w
[(i
<< 1) + 8] = MUL15(u
[i
+ 4], v
[i
+ 4]);
254 * For the 9th multiplication, source words are up to 32764,
255 * so we must do some carry propagation. If we add up to
256 * 4 products and the carry is no more than 524224, then the
257 * result fits in 32 bits, and the next carry will be no more
258 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
260 * We thus just skip one of the products in the middle word,
261 * then do a carry propagation (this reduces words to 13 bits
262 * each, except possibly the last, which may use up to 17 bits
263 * or so), then add the missing product.
265 w
[80 + 0] = MUL15(u
[40 + 0], v
[40 + 0]);
266 w
[80 + 1] = MUL15(u
[40 + 0], v
[40 + 1])
267 + MUL15(u
[40 + 1], v
[40 + 0]);
268 w
[80 + 2] = MUL15(u
[40 + 0], v
[40 + 2])
269 + MUL15(u
[40 + 1], v
[40 + 1])
270 + MUL15(u
[40 + 2], v
[40 + 0]);
271 w
[80 + 3] = MUL15(u
[40 + 0], v
[40 + 3])
272 + MUL15(u
[40 + 1], v
[40 + 2])
273 + MUL15(u
[40 + 2], v
[40 + 1])
274 + MUL15(u
[40 + 3], v
[40 + 0]);
275 w
[80 + 4] = MUL15(u
[40 + 0], v
[40 + 4])
276 + MUL15(u
[40 + 1], v
[40 + 3])
277 + MUL15(u
[40 + 2], v
[40 + 2])
278 + MUL15(u
[40 + 3], v
[40 + 1]);
279 /* + MUL15(u[40 + 4], v[40 + 0]) */
280 w
[80 + 5] = MUL15(u
[40 + 1], v
[40 + 4])
281 + MUL15(u
[40 + 2], v
[40 + 3])
282 + MUL15(u
[40 + 3], v
[40 + 2])
283 + MUL15(u
[40 + 4], v
[40 + 1]);
284 w
[80 + 6] = MUL15(u
[40 + 2], v
[40 + 4])
285 + MUL15(u
[40 + 3], v
[40 + 3])
286 + MUL15(u
[40 + 4], v
[40 + 2]);
287 w
[80 + 7] = MUL15(u
[40 + 3], v
[40 + 4])
288 + MUL15(u
[40 + 4], v
[40 + 3]);
289 w
[80 + 8] = MUL15(u
[40 + 4], v
[40 + 4]);
293 w
[80 + 4] += MUL15(u
[40 + 4], v
[40 + 0]);
296 * The products on 14-bit words in slots 6 and 7 yield values
297 * up to 5*(16382^2) each, and we need to subtract two such
298 * values from the higher word. We need the subtraction to fit
299 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
300 * However, 10*(16382^2) does not fit. So we must perform a
301 * bit of reduction here.
310 /* 0..1*0..1 into 0..3 */
311 ZSUB2F(w
, 8, w
, 0, w
, 2);
312 ZSUB2F(w
, 9, w
, 1, w
, 3);
316 /* 2..3*2..3 into 4..7 */
317 ZSUB2F(w
, 10, w
, 4, w
, 6);
318 ZSUB2F(w
, 11, w
, 5, w
, 7);
322 /* (0..1+2..3)*(0..1+2..3) into 12..15 */
323 ZSUB2F(w
, 16, w
, 12, w
, 14);
324 ZSUB2F(w
, 17, w
, 13, w
, 15);
328 /* first-level recomposition */
329 ZSUB2F(w
, 12, w
, 0, w
, 4);
330 ZSUB2F(w
, 13, w
, 1, w
, 5);
331 ZSUB2F(w
, 14, w
, 2, w
, 6);
332 ZSUB2F(w
, 15, w
, 3, w
, 7);
339 * Perform carry propagation to bring all words down to 13 bits.
341 cc
= norm13(d
, w
, 40);
354 mul20(uint32_t *d
, const uint32_t *a
, const uint32_t *b
)
358 t
[ 0] = MUL15(a
[ 0], b
[ 0]);
359 t
[ 1] = MUL15(a
[ 0], b
[ 1])
360 + MUL15(a
[ 1], b
[ 0]);
361 t
[ 2] = MUL15(a
[ 0], b
[ 2])
362 + MUL15(a
[ 1], b
[ 1])
363 + MUL15(a
[ 2], b
[ 0]);
364 t
[ 3] = MUL15(a
[ 0], b
[ 3])
365 + MUL15(a
[ 1], b
[ 2])
366 + MUL15(a
[ 2], b
[ 1])
367 + MUL15(a
[ 3], b
[ 0]);
368 t
[ 4] = MUL15(a
[ 0], b
[ 4])
369 + MUL15(a
[ 1], b
[ 3])
370 + MUL15(a
[ 2], b
[ 2])
371 + MUL15(a
[ 3], b
[ 1])
372 + MUL15(a
[ 4], b
[ 0]);
373 t
[ 5] = MUL15(a
[ 0], b
[ 5])
374 + MUL15(a
[ 1], b
[ 4])
375 + MUL15(a
[ 2], b
[ 3])
376 + MUL15(a
[ 3], b
[ 2])
377 + MUL15(a
[ 4], b
[ 1])
378 + MUL15(a
[ 5], b
[ 0]);
379 t
[ 6] = MUL15(a
[ 0], b
[ 6])
380 + MUL15(a
[ 1], b
[ 5])
381 + MUL15(a
[ 2], b
[ 4])
382 + MUL15(a
[ 3], b
[ 3])
383 + MUL15(a
[ 4], b
[ 2])
384 + MUL15(a
[ 5], b
[ 1])
385 + MUL15(a
[ 6], b
[ 0]);
386 t
[ 7] = MUL15(a
[ 0], b
[ 7])
387 + MUL15(a
[ 1], b
[ 6])
388 + MUL15(a
[ 2], b
[ 5])
389 + MUL15(a
[ 3], b
[ 4])
390 + MUL15(a
[ 4], b
[ 3])
391 + MUL15(a
[ 5], b
[ 2])
392 + MUL15(a
[ 6], b
[ 1])
393 + MUL15(a
[ 7], b
[ 0]);
394 t
[ 8] = MUL15(a
[ 0], b
[ 8])
395 + MUL15(a
[ 1], b
[ 7])
396 + MUL15(a
[ 2], b
[ 6])
397 + MUL15(a
[ 3], b
[ 5])
398 + MUL15(a
[ 4], b
[ 4])
399 + MUL15(a
[ 5], b
[ 3])
400 + MUL15(a
[ 6], b
[ 2])
401 + MUL15(a
[ 7], b
[ 1])
402 + MUL15(a
[ 8], b
[ 0]);
403 t
[ 9] = MUL15(a
[ 0], b
[ 9])
404 + MUL15(a
[ 1], b
[ 8])
405 + MUL15(a
[ 2], b
[ 7])
406 + MUL15(a
[ 3], b
[ 6])
407 + MUL15(a
[ 4], b
[ 5])
408 + MUL15(a
[ 5], b
[ 4])
409 + MUL15(a
[ 6], b
[ 3])
410 + MUL15(a
[ 7], b
[ 2])
411 + MUL15(a
[ 8], b
[ 1])
412 + MUL15(a
[ 9], b
[ 0]);
413 t
[10] = MUL15(a
[ 0], b
[10])
414 + MUL15(a
[ 1], b
[ 9])
415 + MUL15(a
[ 2], b
[ 8])
416 + MUL15(a
[ 3], b
[ 7])
417 + MUL15(a
[ 4], b
[ 6])
418 + MUL15(a
[ 5], b
[ 5])
419 + MUL15(a
[ 6], b
[ 4])
420 + MUL15(a
[ 7], b
[ 3])
421 + MUL15(a
[ 8], b
[ 2])
422 + MUL15(a
[ 9], b
[ 1])
423 + MUL15(a
[10], b
[ 0]);
424 t
[11] = MUL15(a
[ 0], b
[11])
425 + MUL15(a
[ 1], b
[10])
426 + MUL15(a
[ 2], b
[ 9])
427 + MUL15(a
[ 3], b
[ 8])
428 + MUL15(a
[ 4], b
[ 7])
429 + MUL15(a
[ 5], b
[ 6])
430 + MUL15(a
[ 6], b
[ 5])
431 + MUL15(a
[ 7], b
[ 4])
432 + MUL15(a
[ 8], b
[ 3])
433 + MUL15(a
[ 9], b
[ 2])
434 + MUL15(a
[10], b
[ 1])
435 + MUL15(a
[11], b
[ 0]);
436 t
[12] = MUL15(a
[ 0], b
[12])
437 + MUL15(a
[ 1], b
[11])
438 + MUL15(a
[ 2], b
[10])
439 + MUL15(a
[ 3], b
[ 9])
440 + MUL15(a
[ 4], b
[ 8])
441 + MUL15(a
[ 5], b
[ 7])
442 + MUL15(a
[ 6], b
[ 6])
443 + MUL15(a
[ 7], b
[ 5])
444 + MUL15(a
[ 8], b
[ 4])
445 + MUL15(a
[ 9], b
[ 3])
446 + MUL15(a
[10], b
[ 2])
447 + MUL15(a
[11], b
[ 1])
448 + MUL15(a
[12], b
[ 0]);
449 t
[13] = MUL15(a
[ 0], b
[13])
450 + MUL15(a
[ 1], b
[12])
451 + MUL15(a
[ 2], b
[11])
452 + MUL15(a
[ 3], b
[10])
453 + MUL15(a
[ 4], b
[ 9])
454 + MUL15(a
[ 5], b
[ 8])
455 + MUL15(a
[ 6], b
[ 7])
456 + MUL15(a
[ 7], b
[ 6])
457 + MUL15(a
[ 8], b
[ 5])
458 + MUL15(a
[ 9], b
[ 4])
459 + MUL15(a
[10], b
[ 3])
460 + MUL15(a
[11], b
[ 2])
461 + MUL15(a
[12], b
[ 1])
462 + MUL15(a
[13], b
[ 0]);
463 t
[14] = MUL15(a
[ 0], b
[14])
464 + MUL15(a
[ 1], b
[13])
465 + MUL15(a
[ 2], b
[12])
466 + MUL15(a
[ 3], b
[11])
467 + MUL15(a
[ 4], b
[10])
468 + MUL15(a
[ 5], b
[ 9])
469 + MUL15(a
[ 6], b
[ 8])
470 + MUL15(a
[ 7], b
[ 7])
471 + MUL15(a
[ 8], b
[ 6])
472 + MUL15(a
[ 9], b
[ 5])
473 + MUL15(a
[10], b
[ 4])
474 + MUL15(a
[11], b
[ 3])
475 + MUL15(a
[12], b
[ 2])
476 + MUL15(a
[13], b
[ 1])
477 + MUL15(a
[14], b
[ 0]);
478 t
[15] = MUL15(a
[ 0], b
[15])
479 + MUL15(a
[ 1], b
[14])
480 + MUL15(a
[ 2], b
[13])
481 + MUL15(a
[ 3], b
[12])
482 + MUL15(a
[ 4], b
[11])
483 + MUL15(a
[ 5], b
[10])
484 + MUL15(a
[ 6], b
[ 9])
485 + MUL15(a
[ 7], b
[ 8])
486 + MUL15(a
[ 8], b
[ 7])
487 + MUL15(a
[ 9], b
[ 6])
488 + MUL15(a
[10], b
[ 5])
489 + MUL15(a
[11], b
[ 4])
490 + MUL15(a
[12], b
[ 3])
491 + MUL15(a
[13], b
[ 2])
492 + MUL15(a
[14], b
[ 1])
493 + MUL15(a
[15], b
[ 0]);
494 t
[16] = MUL15(a
[ 0], b
[16])
495 + MUL15(a
[ 1], b
[15])
496 + MUL15(a
[ 2], b
[14])
497 + MUL15(a
[ 3], b
[13])
498 + MUL15(a
[ 4], b
[12])
499 + MUL15(a
[ 5], b
[11])
500 + MUL15(a
[ 6], b
[10])
501 + MUL15(a
[ 7], b
[ 9])
502 + MUL15(a
[ 8], b
[ 8])
503 + MUL15(a
[ 9], b
[ 7])
504 + MUL15(a
[10], b
[ 6])
505 + MUL15(a
[11], b
[ 5])
506 + MUL15(a
[12], b
[ 4])
507 + MUL15(a
[13], b
[ 3])
508 + MUL15(a
[14], b
[ 2])
509 + MUL15(a
[15], b
[ 1])
510 + MUL15(a
[16], b
[ 0]);
511 t
[17] = MUL15(a
[ 0], b
[17])
512 + MUL15(a
[ 1], b
[16])
513 + MUL15(a
[ 2], b
[15])
514 + MUL15(a
[ 3], b
[14])
515 + MUL15(a
[ 4], b
[13])
516 + MUL15(a
[ 5], b
[12])
517 + MUL15(a
[ 6], b
[11])
518 + MUL15(a
[ 7], b
[10])
519 + MUL15(a
[ 8], b
[ 9])
520 + MUL15(a
[ 9], b
[ 8])
521 + MUL15(a
[10], b
[ 7])
522 + MUL15(a
[11], b
[ 6])
523 + MUL15(a
[12], b
[ 5])
524 + MUL15(a
[13], b
[ 4])
525 + MUL15(a
[14], b
[ 3])
526 + MUL15(a
[15], b
[ 2])
527 + MUL15(a
[16], b
[ 1])
528 + MUL15(a
[17], b
[ 0]);
529 t
[18] = MUL15(a
[ 0], b
[18])
530 + MUL15(a
[ 1], b
[17])
531 + MUL15(a
[ 2], b
[16])
532 + MUL15(a
[ 3], b
[15])
533 + MUL15(a
[ 4], b
[14])
534 + MUL15(a
[ 5], b
[13])
535 + MUL15(a
[ 6], b
[12])
536 + MUL15(a
[ 7], b
[11])
537 + MUL15(a
[ 8], b
[10])
538 + MUL15(a
[ 9], b
[ 9])
539 + MUL15(a
[10], b
[ 8])
540 + MUL15(a
[11], b
[ 7])
541 + MUL15(a
[12], b
[ 6])
542 + MUL15(a
[13], b
[ 5])
543 + MUL15(a
[14], b
[ 4])
544 + MUL15(a
[15], b
[ 3])
545 + MUL15(a
[16], b
[ 2])
546 + MUL15(a
[17], b
[ 1])
547 + MUL15(a
[18], b
[ 0]);
548 t
[19] = MUL15(a
[ 0], b
[19])
549 + MUL15(a
[ 1], b
[18])
550 + MUL15(a
[ 2], b
[17])
551 + MUL15(a
[ 3], b
[16])
552 + MUL15(a
[ 4], b
[15])
553 + MUL15(a
[ 5], b
[14])
554 + MUL15(a
[ 6], b
[13])
555 + MUL15(a
[ 7], b
[12])
556 + MUL15(a
[ 8], b
[11])
557 + MUL15(a
[ 9], b
[10])
558 + MUL15(a
[10], b
[ 9])
559 + MUL15(a
[11], b
[ 8])
560 + MUL15(a
[12], b
[ 7])
561 + MUL15(a
[13], b
[ 6])
562 + MUL15(a
[14], b
[ 5])
563 + MUL15(a
[15], b
[ 4])
564 + MUL15(a
[16], b
[ 3])
565 + MUL15(a
[17], b
[ 2])
566 + MUL15(a
[18], b
[ 1])
567 + MUL15(a
[19], b
[ 0]);
568 t
[20] = MUL15(a
[ 1], b
[19])
569 + MUL15(a
[ 2], b
[18])
570 + MUL15(a
[ 3], b
[17])
571 + MUL15(a
[ 4], b
[16])
572 + MUL15(a
[ 5], b
[15])
573 + MUL15(a
[ 6], b
[14])
574 + MUL15(a
[ 7], b
[13])
575 + MUL15(a
[ 8], b
[12])
576 + MUL15(a
[ 9], b
[11])
577 + MUL15(a
[10], b
[10])
578 + MUL15(a
[11], b
[ 9])
579 + MUL15(a
[12], b
[ 8])
580 + MUL15(a
[13], b
[ 7])
581 + MUL15(a
[14], b
[ 6])
582 + MUL15(a
[15], b
[ 5])
583 + MUL15(a
[16], b
[ 4])
584 + MUL15(a
[17], b
[ 3])
585 + MUL15(a
[18], b
[ 2])
586 + MUL15(a
[19], b
[ 1]);
587 t
[21] = MUL15(a
[ 2], b
[19])
588 + MUL15(a
[ 3], b
[18])
589 + MUL15(a
[ 4], b
[17])
590 + MUL15(a
[ 5], b
[16])
591 + MUL15(a
[ 6], b
[15])
592 + MUL15(a
[ 7], b
[14])
593 + MUL15(a
[ 8], b
[13])
594 + MUL15(a
[ 9], b
[12])
595 + MUL15(a
[10], b
[11])
596 + MUL15(a
[11], b
[10])
597 + MUL15(a
[12], b
[ 9])
598 + MUL15(a
[13], b
[ 8])
599 + MUL15(a
[14], b
[ 7])
600 + MUL15(a
[15], b
[ 6])
601 + MUL15(a
[16], b
[ 5])
602 + MUL15(a
[17], b
[ 4])
603 + MUL15(a
[18], b
[ 3])
604 + MUL15(a
[19], b
[ 2]);
605 t
[22] = MUL15(a
[ 3], b
[19])
606 + MUL15(a
[ 4], b
[18])
607 + MUL15(a
[ 5], b
[17])
608 + MUL15(a
[ 6], b
[16])
609 + MUL15(a
[ 7], b
[15])
610 + MUL15(a
[ 8], b
[14])
611 + MUL15(a
[ 9], b
[13])
612 + MUL15(a
[10], b
[12])
613 + MUL15(a
[11], b
[11])
614 + MUL15(a
[12], b
[10])
615 + MUL15(a
[13], b
[ 9])
616 + MUL15(a
[14], b
[ 8])
617 + MUL15(a
[15], b
[ 7])
618 + MUL15(a
[16], b
[ 6])
619 + MUL15(a
[17], b
[ 5])
620 + MUL15(a
[18], b
[ 4])
621 + MUL15(a
[19], b
[ 3]);
622 t
[23] = MUL15(a
[ 4], b
[19])
623 + MUL15(a
[ 5], b
[18])
624 + MUL15(a
[ 6], b
[17])
625 + MUL15(a
[ 7], b
[16])
626 + MUL15(a
[ 8], b
[15])
627 + MUL15(a
[ 9], b
[14])
628 + MUL15(a
[10], b
[13])
629 + MUL15(a
[11], b
[12])
630 + MUL15(a
[12], b
[11])
631 + MUL15(a
[13], b
[10])
632 + MUL15(a
[14], b
[ 9])
633 + MUL15(a
[15], b
[ 8])
634 + MUL15(a
[16], b
[ 7])
635 + MUL15(a
[17], b
[ 6])
636 + MUL15(a
[18], b
[ 5])
637 + MUL15(a
[19], b
[ 4]);
638 t
[24] = MUL15(a
[ 5], b
[19])
639 + MUL15(a
[ 6], b
[18])
640 + MUL15(a
[ 7], b
[17])
641 + MUL15(a
[ 8], b
[16])
642 + MUL15(a
[ 9], b
[15])
643 + MUL15(a
[10], b
[14])
644 + MUL15(a
[11], b
[13])
645 + MUL15(a
[12], b
[12])
646 + MUL15(a
[13], b
[11])
647 + MUL15(a
[14], b
[10])
648 + MUL15(a
[15], b
[ 9])
649 + MUL15(a
[16], b
[ 8])
650 + MUL15(a
[17], b
[ 7])
651 + MUL15(a
[18], b
[ 6])
652 + MUL15(a
[19], b
[ 5]);
653 t
[25] = MUL15(a
[ 6], b
[19])
654 + MUL15(a
[ 7], b
[18])
655 + MUL15(a
[ 8], b
[17])
656 + MUL15(a
[ 9], b
[16])
657 + MUL15(a
[10], b
[15])
658 + MUL15(a
[11], b
[14])
659 + MUL15(a
[12], b
[13])
660 + MUL15(a
[13], b
[12])
661 + MUL15(a
[14], b
[11])
662 + MUL15(a
[15], b
[10])
663 + MUL15(a
[16], b
[ 9])
664 + MUL15(a
[17], b
[ 8])
665 + MUL15(a
[18], b
[ 7])
666 + MUL15(a
[19], b
[ 6]);
667 t
[26] = MUL15(a
[ 7], b
[19])
668 + MUL15(a
[ 8], b
[18])
669 + MUL15(a
[ 9], b
[17])
670 + MUL15(a
[10], b
[16])
671 + MUL15(a
[11], b
[15])
672 + MUL15(a
[12], b
[14])
673 + MUL15(a
[13], b
[13])
674 + MUL15(a
[14], b
[12])
675 + MUL15(a
[15], b
[11])
676 + MUL15(a
[16], b
[10])
677 + MUL15(a
[17], b
[ 9])
678 + MUL15(a
[18], b
[ 8])
679 + MUL15(a
[19], b
[ 7]);
680 t
[27] = MUL15(a
[ 8], b
[19])
681 + MUL15(a
[ 9], b
[18])
682 + MUL15(a
[10], b
[17])
683 + MUL15(a
[11], b
[16])
684 + MUL15(a
[12], b
[15])
685 + MUL15(a
[13], b
[14])
686 + MUL15(a
[14], b
[13])
687 + MUL15(a
[15], b
[12])
688 + MUL15(a
[16], b
[11])
689 + MUL15(a
[17], b
[10])
690 + MUL15(a
[18], b
[ 9])
691 + MUL15(a
[19], b
[ 8]);
692 t
[28] = MUL15(a
[ 9], b
[19])
693 + MUL15(a
[10], b
[18])
694 + MUL15(a
[11], b
[17])
695 + MUL15(a
[12], b
[16])
696 + MUL15(a
[13], b
[15])
697 + MUL15(a
[14], b
[14])
698 + MUL15(a
[15], b
[13])
699 + MUL15(a
[16], b
[12])
700 + MUL15(a
[17], b
[11])
701 + MUL15(a
[18], b
[10])
702 + MUL15(a
[19], b
[ 9]);
703 t
[29] = MUL15(a
[10], b
[19])
704 + MUL15(a
[11], b
[18])
705 + MUL15(a
[12], b
[17])
706 + MUL15(a
[13], b
[16])
707 + MUL15(a
[14], b
[15])
708 + MUL15(a
[15], b
[14])
709 + MUL15(a
[16], b
[13])
710 + MUL15(a
[17], b
[12])
711 + MUL15(a
[18], b
[11])
712 + MUL15(a
[19], b
[10]);
713 t
[30] = MUL15(a
[11], b
[19])
714 + MUL15(a
[12], b
[18])
715 + MUL15(a
[13], b
[17])
716 + MUL15(a
[14], b
[16])
717 + MUL15(a
[15], b
[15])
718 + MUL15(a
[16], b
[14])
719 + MUL15(a
[17], b
[13])
720 + MUL15(a
[18], b
[12])
721 + MUL15(a
[19], b
[11]);
722 t
[31] = MUL15(a
[12], b
[19])
723 + MUL15(a
[13], b
[18])
724 + MUL15(a
[14], b
[17])
725 + MUL15(a
[15], b
[16])
726 + MUL15(a
[16], b
[15])
727 + MUL15(a
[17], b
[14])
728 + MUL15(a
[18], b
[13])
729 + MUL15(a
[19], b
[12]);
730 t
[32] = MUL15(a
[13], b
[19])
731 + MUL15(a
[14], b
[18])
732 + MUL15(a
[15], b
[17])
733 + MUL15(a
[16], b
[16])
734 + MUL15(a
[17], b
[15])
735 + MUL15(a
[18], b
[14])
736 + MUL15(a
[19], b
[13]);
737 t
[33] = MUL15(a
[14], b
[19])
738 + MUL15(a
[15], b
[18])
739 + MUL15(a
[16], b
[17])
740 + MUL15(a
[17], b
[16])
741 + MUL15(a
[18], b
[15])
742 + MUL15(a
[19], b
[14]);
743 t
[34] = MUL15(a
[15], b
[19])
744 + MUL15(a
[16], b
[18])
745 + MUL15(a
[17], b
[17])
746 + MUL15(a
[18], b
[16])
747 + MUL15(a
[19], b
[15]);
748 t
[35] = MUL15(a
[16], b
[19])
749 + MUL15(a
[17], b
[18])
750 + MUL15(a
[18], b
[17])
751 + MUL15(a
[19], b
[16]);
752 t
[36] = MUL15(a
[17], b
[19])
753 + MUL15(a
[18], b
[18])
754 + MUL15(a
[19], b
[17]);
755 t
[37] = MUL15(a
[18], b
[19])
756 + MUL15(a
[19], b
[18]);
757 t
[38] = MUL15(a
[19], b
[19]);
758 d
[39] = norm13(d
, t
, 39);
764 * Modulus for field F256 (field for point coordinates in curve P-256).
766 static const uint32_t F256
[] = {
767 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
768 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
769 0x0000, 0x1FF8, 0x1FFF, 0x01FF
773 * The 'b' curve equation coefficient for P-256.
775 static const uint32_t P256_B
[] = {
776 0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
777 0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
778 0x0A3A, 0x0EC5, 0x118D, 0x00B5
782 * Perform a "short reduction" in field F256 (field for curve P-256).
783 * The source value should be less than 262 bits; on output, it will
784 * be at most 257 bits, and less than twice the modulus.
787 reduce_f256(uint32_t *d
)
801 * Perform a "final reduction" in field F256 (field for curve P-256).
802 * The source value must be less than twice the modulus. If the value
803 * is not lower than the modulus, then the modulus is subtracted and
804 * this function returns 1; otherwise, it leaves it untouched and it
808 reduce_final_f256(uint32_t *d
)
814 memcpy(t
, d
, sizeof t
);
816 for (i
= 0; i
< 20; i
++) {
819 w
= t
[i
] - F256
[i
] - cc
;
824 CCOPY(cc
, d
, t
, sizeof t
);
829 * Perform a multiplication of two integers modulo
830 * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
831 * of 20 words, each containing 13 bits of data, in little-endian order.
832 * On input, upper word may be up to 15 bits (hence value up to 2^262-1);
833 * on output, value fits on 257 bits and is lower than twice the modulus.
836 mul_f256(uint32_t *d
, const uint32_t *a
, const uint32_t *b
)
842 * Compute raw multiplication. All result words fit in 13 bits
848 * Modular reduction: each high word in added/subtracted where
852 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
854 * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
856 * For a word x at bit offset n (n >= 256), we have:
857 * x*2^n = x*2^(n-32) - x*2^(n-64)
858 * - x*2^(n - 160) + x*2^(n-256) mod p
860 * Thus, we can nullify the high word if we reinject it at some
861 * proper emplacements.
863 for (i
= 39; i
>= 20; i
--) {
867 t
[i
- 2] += ARSH(x
, 6);
868 t
[i
- 3] += (x
<< 7) & 0x1FFF;
869 t
[i
- 4] -= ARSH(x
, 12);
870 t
[i
- 5] -= (x
<< 1) & 0x1FFF;
871 t
[i
- 12] -= ARSH(x
, 4);
872 t
[i
- 13] -= (x
<< 9) & 0x1FFF;
873 t
[i
- 19] += ARSH(x
, 9);
874 t
[i
- 20] += (x
<< 4) & 0x1FFF;
878 * Propagate carries. Since the operation above really is a
879 * truncature, followed by the addition of nonnegative values,
880 * the result will be positive. Moreover, the carry cannot
881 * exceed 5 bits (we performed 20 additions with values smaller
884 cc
= norm13(t
, t
, 20);
887 * Perform modular reduction again for the bits beyond 256 (the carry
888 * and the bits 256..259). This time, we can simply inject full
891 cc
= (cc
<< 4) | (t
[19] >> 9);
901 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
905 * For the point at infinity, z = 0.
906 * Each point thus admits many possible representations.
908 * Coordinates are represented in arrays of 32-bit integers, each holding
909 * 13 bits of data. Values may also be slightly greater than the modulus,
910 * but they will always be lower than twice the modulus.
919 * Convert a point to affine coordinates:
920 * - If the point is the point at infinity, then all three coordinates
922 * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
923 * coordinates are the 'X' and 'Y' affine coordinates.
924 * The coordinates are guaranteed to be lower than the modulus.
927 p256_to_affine(p256_jacobian
*P
)
929 uint32_t t1
[20], t2
[20];
933 * Invert z with a modular exponentiation: the modulus is
934 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
935 * p-2. Exponent bit pattern (from high to low) is:
936 * - 32 bits of value 1
937 * - 31 bits of value 0
939 * - 96 bits of value 0
940 * - 94 bits of value 1
943 * Thus, we precompute z^(2^31-1) to speed things up.
945 * If z = 0 (point at infinity) then the modular exponentiation
946 * will yield 0, which leads to the expected result (all three
947 * coordinates set to 0).
951 * A simple square-and-multiply for z^(2^31-1). We could save about
952 * two dozen multiplications here with an addition chain, but
953 * this would require a bit more code, and extra stack buffers.
955 memcpy(t1
, P
->z
, sizeof P
->z
);
956 for (i
= 0; i
< 30; i
++) {
957 mul_f256(t1
, t1
, t1
);
958 mul_f256(t1
, t1
, P
->z
);
962 * Square-and-multiply. Apart from the squarings, we have a few
963 * multiplications to set bits to 1; we multiply by the original z
964 * for setting 1 bit, and by t1 for setting 31 bits.
966 memcpy(t2
, P
->z
, sizeof P
->z
);
967 for (i
= 1; i
< 256; i
++) {
968 mul_f256(t2
, t2
, t2
);
974 mul_f256(t2
, t2
, t1
);
979 mul_f256(t2
, t2
, P
->z
);
985 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
987 mul_f256(t1
, t2
, t2
);
988 mul_f256(P
->x
, t1
, P
->x
);
989 mul_f256(t1
, t1
, t2
);
990 mul_f256(P
->y
, t1
, P
->y
);
991 reduce_final_f256(P
->x
);
992 reduce_final_f256(P
->y
);
995 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
996 * this will set z to 1.
998 mul_f256(P
->z
, P
->z
, t2
);
999 reduce_final_f256(P
->z
);
1003 * Double a point in P-256. This function works for all valid points,
1004 * including the point at infinity.
1007 p256_double(p256_jacobian
*Q
)
1010 * Doubling formulas are:
1013 * m = 3*(x + z^2)*(x - z^2)
1015 * y' = m*(s - x') - 8*y^4
1018 * These formulas work for all points, including points of order 2
1019 * and points at infinity:
1020 * - If y = 0 then z' = 0. But there is no such point in P-256
1022 * - If z = 0 then z' = 0.
1024 uint32_t t1
[20], t2
[20], t3
[20], t4
[20];
1028 * Compute z^2 in t1.
1030 mul_f256(t1
, Q
->z
, Q
->z
);
1033 * Compute x-z^2 in t2 and x+z^2 in t1.
1035 for (i
= 0; i
< 20; i
++) {
1036 t2
[i
] = (F256
[i
] << 1) + Q
->x
[i
] - t1
[i
];
1043 * Compute 3*(x+z^2)*(x-z^2) in t1.
1045 mul_f256(t3
, t1
, t2
);
1046 for (i
= 0; i
< 20; i
++) {
1047 t1
[i
] = MUL15(3, t3
[i
]);
1052 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
1054 mul_f256(t3
, Q
->y
, Q
->y
);
1055 for (i
= 0; i
< 20; i
++) {
1059 mul_f256(t2
, Q
->x
, t3
);
1060 for (i
= 0; i
< 20; i
++) {
1067 * Compute x' = m^2 - 2*s.
1069 mul_f256(Q
->x
, t1
, t1
);
1070 for (i
= 0; i
< 20; i
++) {
1071 Q
->x
[i
] += (F256
[i
] << 2) - (t2
[i
] << 1);
1073 norm13(Q
->x
, Q
->x
, 20);
1077 * Compute z' = 2*y*z.
1079 mul_f256(t4
, Q
->y
, Q
->z
);
1080 for (i
= 0; i
< 20; i
++) {
1081 Q
->z
[i
] = t4
[i
] << 1;
1083 norm13(Q
->z
, Q
->z
, 20);
1087 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
1090 for (i
= 0; i
< 20; i
++) {
1091 t2
[i
] += (F256
[i
] << 1) - Q
->x
[i
];
1094 mul_f256(Q
->y
, t1
, t2
);
1095 mul_f256(t4
, t3
, t3
);
1096 for (i
= 0; i
< 20; i
++) {
1097 Q
->y
[i
] += (F256
[i
] << 2) - (t4
[i
] << 1);
1099 norm13(Q
->y
, Q
->y
, 20);
1104 * Add point P2 to point P1.
1106 * This function computes the wrong result in the following cases:
1108 * - If P1 == 0 but P2 != 0
1109 * - If P1 != 0 but P2 == 0
1112 * In all three cases, P1 is set to the point at infinity.
1114 * Returned value is 0 if one of the following occurs:
1116 * - P1 and P2 have the same Y coordinate
1117 * - P1 == 0 and P2 == 0
1118 * - The Y coordinate of one of the points is 0 and the other point is
1119 * the point at infinity.
1121 * The third case cannot actually happen with valid points, since a point
1122 * with Y == 0 is a point of order 2, and there is no point of order 2 on
1125 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
1126 * can apply the following:
1128 * - If the result is not the point at infinity, then it is correct.
1129 * - Otherwise, if the returned value is 1, then this is a case of
1130 * P1+P2 == 0, so the result is indeed the point at infinity.
1131 * - Otherwise, P1 == P2, so a "double" operation should have been
1135 p256_add(p256_jacobian
*P1
, const p256_jacobian
*P2
)
1138 * Addtions formulas are:
1146 * x3 = r^2 - h^3 - 2 * u1 * h^2
1147 * y3 = r * (u1 * h^2 - x3) - s1 * h^3
1150 uint32_t t1
[20], t2
[20], t3
[20], t4
[20], t5
[20], t6
[20], t7
[20];
1155 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
1157 mul_f256(t3
, P2
->z
, P2
->z
);
1158 mul_f256(t1
, P1
->x
, t3
);
1159 mul_f256(t4
, P2
->z
, t3
);
1160 mul_f256(t3
, P1
->y
, t4
);
1163 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1165 mul_f256(t4
, P1
->z
, P1
->z
);
1166 mul_f256(t2
, P2
->x
, t4
);
1167 mul_f256(t5
, P1
->z
, t4
);
1168 mul_f256(t4
, P2
->y
, t5
);
1171 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1172 * We need to test whether r is zero, so we will do some extra
1175 for (i
= 0; i
< 20; i
++) {
1176 t2
[i
] += (F256
[i
] << 1) - t1
[i
];
1177 t4
[i
] += (F256
[i
] << 1) - t3
[i
];
1182 reduce_final_f256(t4
);
1184 for (i
= 0; i
< 20; i
++) {
1187 ret
= (ret
| -ret
) >> 31;
1190 * Compute u1*h^2 (in t6) and h^3 (in t5);
1192 mul_f256(t7
, t2
, t2
);
1193 mul_f256(t6
, t1
, t7
);
1194 mul_f256(t5
, t7
, t2
);
1197 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1199 mul_f256(P1
->x
, t4
, t4
);
1200 for (i
= 0; i
< 20; i
++) {
1201 P1
->x
[i
] += (F256
[i
] << 3) - t5
[i
] - (t6
[i
] << 1);
1203 norm13(P1
->x
, P1
->x
, 20);
1207 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1209 for (i
= 0; i
< 20; i
++) {
1210 t6
[i
] += (F256
[i
] << 1) - P1
->x
[i
];
1213 mul_f256(P1
->y
, t4
, t6
);
1214 mul_f256(t1
, t5
, t3
);
1215 for (i
= 0; i
< 20; i
++) {
1216 P1
->y
[i
] += (F256
[i
] << 1) - t1
[i
];
1218 norm13(P1
->y
, P1
->y
, 20);
1222 * Compute z3 = h*z1*z2.
1224 mul_f256(t1
, P1
->z
, P2
->z
);
1225 mul_f256(P1
->z
, t1
, t2
);
1231 * Decode a P-256 point. This function does not support the point at
1232 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1235 p256_decode(p256_jacobian
*P
, const void *src
, size_t len
)
1237 const unsigned char *buf
;
1238 uint32_t tx
[20], ty
[20], t1
[20], t2
[20];
1248 * First byte must be 0x04 (uncompressed format). We could support
1249 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1250 * least significant bit of the Y coordinate), but it is explicitly
1251 * forbidden by RFC 5480 (section 2.2).
1253 bad
= NEQ(buf
[0], 0x04);
1256 * Decode the coordinates, and check that they are both lower
1259 tx
[19] = be8_to_le13(tx
, buf
+ 1, 32);
1260 ty
[19] = be8_to_le13(ty
, buf
+ 33, 32);
1261 bad
|= reduce_final_f256(tx
);
1262 bad
|= reduce_final_f256(ty
);
1265 * Check curve equation.
1267 mul_f256(t1
, tx
, tx
);
1268 mul_f256(t1
, tx
, t1
);
1269 mul_f256(t2
, ty
, ty
);
1270 for (i
= 0; i
< 20; i
++) {
1271 t1
[i
] += (F256
[i
] << 3) - MUL15(3, tx
[i
]) + P256_B
[i
] - t2
[i
];
1275 reduce_final_f256(t1
);
1276 for (i
= 0; i
< 20; i
++) {
1281 * Copy coordinates to the point structure.
1283 memcpy(P
->x
, tx
, sizeof tx
);
1284 memcpy(P
->y
, ty
, sizeof ty
);
1285 memset(P
->z
, 0, sizeof P
->z
);
1287 return NEQ(bad
, 0) ^ 1;
1291 * Encode a point into a buffer. This function assumes that the point is
1292 * valid, in affine coordinates, and not the point at infinity.
1295 p256_encode(void *dst
, const p256_jacobian
*P
)
1301 le13_to_be8(buf
+ 1, 32, P
->x
);
1302 le13_to_be8(buf
+ 33, 32, P
->y
);
1306 * Multiply a curve point by an integer. The integer is assumed to be
1307 * lower than the curve order, and the base point must not be the point
1311 p256_mul(p256_jacobian
*P
, const unsigned char *x
, size_t xlen
)
1314 * qz is a flag that is initially 1, and remains equal to 1
1315 * as long as the point is the point at infinity.
1317 * We use a 2-bit window to handle multiplier bits by pairs.
1318 * The precomputed window really is the points P2 and P3.
1321 p256_jacobian P2
, P3
, Q
, T
, U
;
1324 * Compute window values.
1332 * We start with Q = 0. We process multiplier bits 2 by 2.
1334 memset(&Q
, 0, sizeof Q
);
1336 while (xlen
-- > 0) {
1339 for (k
= 6; k
>= 0; k
-= 2) {
1347 bits
= (*x
>> k
) & (uint32_t)3;
1349 CCOPY(EQ(bits
, 2), &T
, &P2
, sizeof T
);
1350 CCOPY(EQ(bits
, 3), &T
, &P3
, sizeof T
);
1352 CCOPY(bnz
& qz
, &Q
, &T
, sizeof Q
);
1353 CCOPY(bnz
& ~qz
, &Q
, &U
, sizeof Q
);
1361 static const unsigned char P256_G
[] = {
1362 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1363 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1364 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1365 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1366 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1367 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1368 0x68, 0x37, 0xBF, 0x51, 0xF5
1371 static const unsigned char P256_N
[] = {
1372 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1373 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1374 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1378 static const unsigned char *
1379 api_generator(int curve
, size_t *len
)
1382 *len
= sizeof P256_G
;
1386 static const unsigned char *
1387 api_order(int curve
, size_t *len
)
1390 *len
= sizeof P256_N
;
1395 api_mul(unsigned char *G
, size_t Glen
,
1396 const unsigned char *x
, size_t xlen
, int curve
)
1402 r
= p256_decode(&P
, G
, Glen
);
1403 p256_mul(&P
, x
, xlen
);
1412 api_muladd(unsigned char *A
, const unsigned char *B
, size_t len
,
1413 const unsigned char *x
, size_t xlen
,
1414 const unsigned char *y
, size_t ylen
, int curve
)
1421 r
= p256_decode(&P
, A
, len
);
1422 r
&= p256_decode(&Q
, B
, len
);
1423 p256_mul(&P
, x
, xlen
);
1424 p256_mul(&Q
, y
, ylen
);
1427 * The final addition may fail in case both points are equal.
1429 t
= p256_add(&P
, &Q
);
1430 reduce_final_f256(P
.z
);
1432 for (i
= 0; i
< 20; i
++) {
1439 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1440 * have the following:
1442 * z = 0, t = 0 return P (normal addition)
1443 * z = 0, t = 1 return P (normal addition)
1444 * z = 1, t = 0 return Q (a 'double' case)
1445 * z = 1, t = 1 report an error (P+Q = 0)
1447 CCOPY(z
& ~t
, &P
, &Q
, sizeof Q
);
1454 /* see bearssl_ec.h */
1455 const br_ec_impl br_ec_p256_i15
= {
1456 (uint32_t)0x00800000,