2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29 * that right-shifting a signed negative integer copies the sign bit
30 * (arithmetic right-shift). This is "implementation-defined behaviour",
31 * i.e. it is not undefined, but it may differ between compilers. Each
32 * compiler is supposed to document its behaviour in that respect. GCC
33 * explicitly defines that an arithmetic right shift is used. We expect
34 * all other compilers to do the same, because underlying CPU offer an
35 * arithmetic right shift opcode that could not be used otherwise.
38 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \
39 | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
41 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
45 * Convert an integer from unsigned big-endian encoding to a sequence of
46 * 13-bit words in little-endian order. The final "partial" word is
50 be8_to_le13(uint32_t *dst
, const unsigned char *src
, size_t len
)
58 acc
|= (uint32_t)src
[len
] << acc_len
;
61 *dst
++ = acc
& 0x1FFF;
70 * Convert an integer (13-bit words, little-endian) to unsigned
71 * big-endian encoding. The total encoding length is provided; all
72 * the destination bytes will be filled.
75 le13_to_be8(unsigned char *dst
, size_t len
, const uint32_t *src
)
84 acc
|= (*src
++) << acc_len
;
87 dst
[len
] = (unsigned char)acc
;
94 * Multiply two 4-word integers. Basis is 2^13 = 8192. Individual words
95 * may have value up to 32766. Result integer consists in eight words;
96 * each word is in the 0..8191 range, except the last one (d[7]) which
97 * gathers the excess bits.
99 * Max value for d[7], depending on max word value:
106 mul4(uint32_t *d
, const uint32_t *a
, const uint32_t *b
)
108 uint32_t t0
, t1
, t2
, t3
, t4
, t5
, t6
;
110 t0
= MUL15(a
[0], b
[0]);
111 t1
= MUL15(a
[0], b
[1]) + MUL15(a
[1], b
[0]);
112 t2
= MUL15(a
[0], b
[2]) + MUL15(a
[1], b
[1]) + MUL15(a
[2], b
[0]);
113 t3
= MUL15(a
[0], b
[3]) + MUL15(a
[1], b
[2])
114 + MUL15(a
[2], b
[1]) + MUL15(a
[3], b
[0]);
115 t4
= MUL15(a
[1], b
[3]) + MUL15(a
[2], b
[2]) + MUL15(a
[3], b
[1]);
116 t5
= MUL15(a
[2], b
[3]) + MUL15(a
[3], b
[2]);
117 t6
= MUL15(a
[3], b
[3]);
120 * Maximum value is obtained when adding the carry to t3, when all
121 * input words have maximum value. When the maximum is 32766,
122 * the addition of t3 with the carry (t2 >> 13) yields 4294836223,
123 * which is still below 2^32 = 4294967296.
143 * Normalise an array of words to a strict 13 bits per word. Returned
144 * value is the resulting carry. The source (w) and destination (d)
145 * arrays may be identical, but shall not overlap partially.
148 norm13(uint32_t *d
, const uint32_t *w
, size_t len
)
154 for (u
= 0; u
< len
; u
++) {
165 * Multiply two 20-word values together, result over 40 words. Each word
166 * contains 13 bits of data; the top words (a[19] and b[19]) may contain
167 * up to 15 bits each. Therefore this routine handles integer up to 2^262-1.
168 * All words in the output have length 13 bits, except possibly the top
169 * one, in case the sources had excess bits.
172 mul20(uint32_t *d
, const uint32_t *a
, const uint32_t *b
)
175 * We first split the 20-word values into a 16-word value and
176 * a 4-word value. The 16x16 product uses two levels of Karatsuba
177 * decomposition. The preparatory additions are done word-wise
178 * without carry propagation; since the input words have size
179 * at most 13 bits, the sums may be up to 4*8191 = 32764, which
180 * is in the range supported by mul4().
182 uint32_t u
[72], v
[72], w
[144];
187 * Karatsuba decomposition, two levels.
193 * 16 0..3+4..7 * 0..3+4..7
198 * 20 8..11+12..15 * 8..11+12..15
200 * 0..7+8..15 * 0..7+8..15
201 * 24 0..3+8..11 * 0..3+8..11
202 * 28 4..7+12..15 * 4..7+12..15
203 * 32 0..3+4..7+8..11+12..15 * 0..3+4..7+8..11+12..15
206 #define WADD(z, x, y) do { \
207 u[(z) + 0] = u[(x) + 0] + u[y + 0]; \
208 u[(z) + 1] = u[(x) + 1] + u[y + 1]; \
209 u[(z) + 2] = u[(x) + 2] + u[y + 2]; \
210 u[(z) + 3] = u[(x) + 3] + u[y + 3]; \
211 v[(z) + 0] = v[(x) + 0] + v[y + 0]; \
212 v[(z) + 1] = v[(x) + 1] + v[y + 1]; \
213 v[(z) + 2] = v[(x) + 2] + v[y + 2]; \
214 v[(z) + 3] = v[(x) + 3] + v[y + 3]; \
217 memcpy(u
, a
, 16 * sizeof *a
);
218 memcpy(v
, b
, 16 * sizeof *b
);
224 memcpy(u
+ 36, a
, 16 * sizeof *a
);
225 memcpy(v
+ 52, b
, 16 * sizeof *b
);
226 for (i
= 0; i
< 4; i
++) {
227 memcpy(u
+ 52 + (i
<< 2), a
+ 16, 4 * sizeof *a
);
228 memcpy(v
+ 36 + (i
<< 2), b
+ 16, 4 * sizeof *b
);
230 memcpy(u
+ 68, a
+ 16, 4 * sizeof *a
);
231 memcpy(v
+ 68, b
+ 16, 4 * sizeof *b
);
236 * Perform the elementary 4x4 multiplications:
237 * 16x16: 9 multiplications (Karatsuba)
238 * 16x4 (1): 4 multiplications
239 * 16x4 (2): 4 multiplications
240 * 4x4: 1 multiplication
242 for (i
= 0; i
< 18; i
++) {
243 mul4(w
+ (i
<< 3), u
+ (i
<< 2), v
+ (i
<< 2));
247 * mul4 cross-product adjustments:
248 * subtract 0 and 8 from 32 (8 words)
249 * subtract 16 and 24 from 40 (8 words)
250 * subtract 48 and 56 from 64 (8 words)
252 for (i
= 0; i
< 8; i
++) {
253 w
[i
+ 32] -= w
[i
+ 0] + w
[i
+ 8];
254 w
[i
+ 40] -= w
[i
+ 16] + w
[i
+ 24];
255 w
[i
+ 64] -= w
[i
+ 48] + w
[i
+ 56];
259 * complete the three 8x8 products:
260 * add 32 to 4 (8 words)
261 * add 40 to 20 (8 words)
262 * add 64 to 52 (8 words)
264 for (i
= 0; i
< 8; i
++) {
265 w
[i
+ 4] += w
[i
+ 32];
266 w
[i
+ 20] += w
[i
+ 40];
267 w
[i
+ 52] += w
[i
+ 64];
271 * Adjust the 16x16 product:
272 * subtract 0 from 48 (16 words)
273 * subtract 16 from 48 (16 words)
274 * add 48 to 8 (16 words)
276 for (i
= 0; i
< 16; i
++) {
277 w
[i
+ 48] -= w
[i
+ 0];
278 w
[i
+ 48] -= w
[i
+ 16];
280 for (i
= 0; i
< 16; i
++) {
281 w
[i
+ 8] += w
[i
+ 48];
285 * At that point, the product of the low chunks (0..15 * 0..15)
286 * is in words 0..31. We must add the three other partial products,
287 * which begin at word 72 in w[]. Words 32 to 39 are first set to
288 * the product of the high chunks (16..19 * 16..19), then the
289 * low-high cross products are added in.
291 memcpy(w
+ 32, w
+ 136, 8 * sizeof w
[0]);
292 for (i
= 0; i
< 8; i
++) {
293 w
[i
+ 16] += w
[i
+ 72] + w
[i
+ 104];
294 w
[i
+ 20] += w
[i
+ 80] + w
[i
+ 112];
295 w
[i
+ 24] += w
[i
+ 88] + w
[i
+ 120];
296 w
[i
+ 28] += w
[i
+ 96] + w
[i
+ 128];
300 * We did all the additions and subtractions in a word-wise way,
301 * which is fine since we have plenty of extra bits for carries.
302 * We must now do the carry propagation.
304 cc
= norm13(d
, w
, 40);
309 * Modulus for field F256 (field for point coordinates in curve P-256).
311 static const uint32_t F256
[] = {
312 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
313 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
314 0x0000, 0x1FF8, 0x1FFF, 0x01FF
318 * The 'b' curve equation coefficient for P-256.
320 static const uint32_t P256_B
[] = {
321 0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
322 0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
323 0x0A3A, 0x0EC5, 0x118D, 0x00B5
327 * Perform a "short reduction" in field F256 (field for curve P-256).
328 * The source value should be less than 262 bits; on output, it will
329 * be at most 257 bits, and less than twice the modulus.
332 reduce_f256(uint32_t *d
)
346 * Perform a "final reduction" in field F256 (field for curve P-256).
347 * The source value must be less than twice the modulus. If the value
348 * is not lower than the modulus, then the modulus is subtracted and
349 * this function returns 1; otherwise, it leaves it untouched and it
353 reduce_final_f256(uint32_t *d
)
359 memcpy(t
, d
, sizeof t
);
361 for (i
= 0; i
< 20; i
++) {
364 w
= t
[i
] - F256
[i
] - cc
;
369 CCOPY(cc
, d
, t
, sizeof t
);
374 * Perform a multiplication of two integers modulo
375 * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
376 * of 20 words, each containing 13 bits of data, in little-endian order.
377 * On input, upper word may be up to 15 bits (hence value up to 2^262-1);
378 * on output, value fits on 257 bits and is lower than twice the modulus.
381 mul_f256(uint32_t *d
, const uint32_t *a
, const uint32_t *b
)
387 * Compute raw multiplication. All result words fit in 13 bits
393 * Modular reduction: each high word in added/subtracted where
397 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
399 * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
401 * For a word x at bit offset n (n >= 256), we have:
402 * x*2^n = x*2^(n-32) - x*2^(n-64)
403 * - x*2^(n - 160) + x*2^(n-256) mod p
405 * Thus, we can nullify the high word if we reinject it at some
406 * proper emplacements.
408 for (i
= 39; i
>= 20; i
--) {
412 t
[i
- 2] += ARSH(x
, 6);
413 t
[i
- 3] += (x
<< 7) & 0x1FFF;
414 t
[i
- 4] -= ARSH(x
, 12);
415 t
[i
- 5] -= (x
<< 1) & 0x1FFF;
416 t
[i
- 12] -= ARSH(x
, 4);
417 t
[i
- 13] -= (x
<< 9) & 0x1FFF;
418 t
[i
- 19] += ARSH(x
, 9);
419 t
[i
- 20] += (x
<< 4) & 0x1FFF;
423 * Propagate carries. Since the operation above really is a
424 * truncature, followed by the addition of nonnegative values,
425 * the result will be positive. Moreover, the carry cannot
426 * exceed 5 bits (we performed 20 additions with values smaller
429 cc
= norm13(t
, t
, 20);
432 * Perform modular reduction again for the bits beyond 256 (the carry
433 * and the bits 256..259). This time, we can simply inject full
436 cc
= (cc
<< 4) | (t
[19] >> 9);
446 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
450 * For the point at infinity, z = 0.
451 * Each point thus admits many possible representations.
453 * Coordinates are represented in arrays of 32-bit integers, each holding
454 * 13 bits of data. Values may also be slightly greater than the modulus,
455 * but they will always be lower than twice the modulus.
464 * Convert a point to affine coordinates:
465 * - If the point is the point at infinity, then all three coordinates
467 * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
468 * coordinates are the 'X' and 'Y' affine coordinates.
469 * The coordinates are guaranteed to be lower than the modulus.
472 p256_to_affine(p256_jacobian
*P
)
474 uint32_t t1
[20], t2
[20];
478 * Invert z with a modular exponentiation: the modulus is
479 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
480 * p-2. Exponent bit pattern (from high to low) is:
481 * - 32 bits of value 1
482 * - 31 bits of value 0
484 * - 96 bits of value 0
485 * - 94 bits of value 1
488 * Thus, we precompute z^(2^31-1) to speed things up.
490 * If z = 0 (point at infinity) then the modular exponentiation
491 * will yield 0, which leads to the expected result (all three
492 * coordinates set to 0).
496 * A simple square-and-multiply for z^(2^31-1). We could save about
497 * two dozen multiplications here with an addition chain, but
498 * this would require a bit more code, and extra stack buffers.
500 memcpy(t1
, P
->z
, sizeof P
->z
);
501 for (i
= 0; i
< 30; i
++) {
502 mul_f256(t1
, t1
, t1
);
503 mul_f256(t1
, t1
, P
->z
);
507 * Square-and-multiply. Apart from the squarings, we have a few
508 * multiplications to set bits to 1; we multiply by the original z
509 * for setting 1 bit, and by t1 for setting 31 bits.
511 memcpy(t2
, P
->z
, sizeof P
->z
);
512 for (i
= 1; i
< 256; i
++) {
513 mul_f256(t2
, t2
, t2
);
519 mul_f256(t2
, t2
, t1
);
524 mul_f256(t2
, t2
, P
->z
);
530 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
532 mul_f256(t1
, t2
, t2
);
533 mul_f256(P
->x
, t1
, P
->x
);
534 mul_f256(t1
, t1
, t2
);
535 mul_f256(P
->y
, t1
, P
->y
);
536 reduce_final_f256(P
->x
);
537 reduce_final_f256(P
->y
);
540 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
541 * this will set z to 1.
543 mul_f256(P
->z
, P
->z
, t2
);
544 reduce_final_f256(P
->z
);
548 * Double a point in P-256. This function works for all valid points,
549 * including the point at infinity.
552 p256_double(p256_jacobian
*Q
)
555 * Doubling formulas are:
558 * m = 3*(x + z^2)*(x - z^2)
560 * y' = m*(s - x') - 8*y^4
563 * These formulas work for all points, including points of order 2
564 * and points at infinity:
565 * - If y = 0 then z' = 0. But there is no such point in P-256
567 * - If z = 0 then z' = 0.
569 uint32_t t1
[20], t2
[20], t3
[20], t4
[20];
575 mul_f256(t1
, Q
->z
, Q
->z
);
578 * Compute x-z^2 in t2 and x+z^2 in t1.
580 for (i
= 0; i
< 20; i
++) {
581 t2
[i
] = (F256
[i
] << 1) + Q
->x
[i
] - t1
[i
];
588 * Compute 3*(x+z^2)*(x-z^2) in t1.
590 mul_f256(t3
, t1
, t2
);
591 for (i
= 0; i
< 20; i
++) {
592 t1
[i
] = MUL15(3, t3
[i
]);
597 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
599 mul_f256(t3
, Q
->y
, Q
->y
);
600 for (i
= 0; i
< 20; i
++) {
604 mul_f256(t2
, Q
->x
, t3
);
605 for (i
= 0; i
< 20; i
++) {
612 * Compute x' = m^2 - 2*s.
614 mul_f256(Q
->x
, t1
, t1
);
615 for (i
= 0; i
< 20; i
++) {
616 Q
->x
[i
] += (F256
[i
] << 2) - (t2
[i
] << 1);
618 norm13(Q
->x
, Q
->x
, 20);
622 * Compute z' = 2*y*z.
624 mul_f256(t4
, Q
->y
, Q
->z
);
625 for (i
= 0; i
< 20; i
++) {
626 Q
->z
[i
] = t4
[i
] << 1;
628 norm13(Q
->z
, Q
->z
, 20);
632 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
635 for (i
= 0; i
< 20; i
++) {
636 t2
[i
] += (F256
[i
] << 1) - Q
->x
[i
];
639 mul_f256(Q
->y
, t1
, t2
);
640 mul_f256(t4
, t3
, t3
);
641 for (i
= 0; i
< 20; i
++) {
642 Q
->y
[i
] += (F256
[i
] << 2) - (t4
[i
] << 1);
644 norm13(Q
->y
, Q
->y
, 20);
649 * Add point P2 to point P1.
651 * This function computes the wrong result in the following cases:
653 * - If P1 == 0 but P2 != 0
654 * - If P1 != 0 but P2 == 0
657 * In all three cases, P1 is set to the point at infinity.
659 * Returned value is 0 if one of the following occurs:
661 * - P1 and P2 have the same Y coordinate
662 * - P1 == 0 and P2 == 0
663 * - The Y coordinate of one of the points is 0 and the other point is
664 * the point at infinity.
666 * The third case cannot actually happen with valid points, since a point
667 * with Y == 0 is a point of order 2, and there is no point of order 2 on
670 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
671 * can apply the following:
673 * - If the result is not the point at infinity, then it is correct.
674 * - Otherwise, if the returned value is 1, then this is a case of
675 * P1+P2 == 0, so the result is indeed the point at infinity.
676 * - Otherwise, P1 == P2, so a "double" operation should have been
680 p256_add(p256_jacobian
*P1
, const p256_jacobian
*P2
)
683 * Addtions formulas are:
691 * x3 = r^2 - h^3 - 2 * u1 * h^2
692 * y3 = r * (u1 * h^2 - x3) - s1 * h^3
695 uint32_t t1
[20], t2
[20], t3
[20], t4
[20], t5
[20], t6
[20], t7
[20];
700 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
702 mul_f256(t3
, P2
->z
, P2
->z
);
703 mul_f256(t1
, P1
->x
, t3
);
704 mul_f256(t4
, P2
->z
, t3
);
705 mul_f256(t3
, P1
->y
, t4
);
708 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
710 mul_f256(t4
, P1
->z
, P1
->z
);
711 mul_f256(t2
, P2
->x
, t4
);
712 mul_f256(t5
, P1
->z
, t4
);
713 mul_f256(t4
, P2
->y
, t5
);
716 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
717 * We need to test whether r is zero, so we will do some extra
720 for (i
= 0; i
< 20; i
++) {
721 t2
[i
] += (F256
[i
] << 1) - t1
[i
];
722 t4
[i
] += (F256
[i
] << 1) - t3
[i
];
727 reduce_final_f256(t4
);
729 for (i
= 0; i
< 20; i
++) {
732 ret
= (ret
| -ret
) >> 31;
735 * Compute u1*h^2 (in t6) and h^3 (in t5);
737 mul_f256(t7
, t2
, t2
);
738 mul_f256(t6
, t1
, t7
);
739 mul_f256(t5
, t7
, t2
);
742 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
744 mul_f256(P1
->x
, t4
, t4
);
745 for (i
= 0; i
< 20; i
++) {
746 P1
->x
[i
] += (F256
[i
] << 3) - t5
[i
] - (t6
[i
] << 1);
748 norm13(P1
->x
, P1
->x
, 20);
752 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
754 for (i
= 0; i
< 20; i
++) {
755 t6
[i
] += (F256
[i
] << 1) - P1
->x
[i
];
758 mul_f256(P1
->y
, t4
, t6
);
759 mul_f256(t1
, t5
, t3
);
760 for (i
= 0; i
< 20; i
++) {
761 P1
->y
[i
] += (F256
[i
] << 1) - t1
[i
];
763 norm13(P1
->y
, P1
->y
, 20);
767 * Compute z3 = h*z1*z2.
769 mul_f256(t1
, P1
->z
, P2
->z
);
770 mul_f256(P1
->z
, t1
, t2
);
776 * Decode a P-256 point. This function does not support the point at
777 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
780 p256_decode(p256_jacobian
*P
, const void *src
, size_t len
)
782 const unsigned char *buf
;
783 uint32_t tx
[20], ty
[20], t1
[20], t2
[20];
793 * First byte must be 0x04 (uncompressed format). We could support
794 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
795 * least significant bit of the Y coordinate), but it is explicitly
796 * forbidden by RFC 5480 (section 2.2).
798 bad
= NEQ(buf
[0], 0x04);
801 * Decode the coordinates, and check that they are both lower
804 tx
[19] = be8_to_le13(tx
, buf
+ 1, 32);
805 ty
[19] = be8_to_le13(ty
, buf
+ 33, 32);
806 bad
|= reduce_final_f256(tx
);
807 bad
|= reduce_final_f256(ty
);
810 * Check curve equation.
812 mul_f256(t1
, tx
, tx
);
813 mul_f256(t1
, tx
, t1
);
814 mul_f256(t2
, ty
, ty
);
815 for (i
= 0; i
< 20; i
++) {
816 t1
[i
] += (F256
[i
] << 3) - MUL15(3, tx
[i
]) + P256_B
[i
] - t2
[i
];
820 reduce_final_f256(t1
);
821 for (i
= 0; i
< 20; i
++) {
826 * Copy coordinates to the point structure.
828 memcpy(P
->x
, tx
, sizeof tx
);
829 memcpy(P
->y
, ty
, sizeof ty
);
830 memset(P
->z
, 0, sizeof P
->z
);
832 return NEQ(bad
, 0) ^ 1;
836 * Encode a point into a buffer. This function assumes that the point is
837 * valid, in affine coordinates, and not the point at infinity.
840 p256_encode(void *dst
, const p256_jacobian
*P
)
846 le13_to_be8(buf
+ 1, 32, P
->x
);
847 le13_to_be8(buf
+ 33, 32, P
->y
);
851 * Multiply a curve point by an integer. The integer is assumed to be
852 * lower than the curve order, and the base point must not be the point
856 p256_mul(p256_jacobian
*P
, const unsigned char *x
, size_t xlen
)
859 * qz is a flag that is initially 1, and remains equal to 1
860 * as long as the point is the point at infinity.
862 * We use a 2-bit window to handle multiplier bits by pairs.
863 * The precomputed window really is the points P2 and P3.
866 p256_jacobian P2
, P3
, Q
, T
, U
;
869 * Compute window values.
877 * We start with Q = 0. We process multiplier bits 2 by 2.
879 memset(&Q
, 0, sizeof Q
);
881 while (xlen
-- > 0) {
884 for (k
= 6; k
>= 0; k
-= 2) {
892 bits
= (*x
>> k
) & (uint32_t)3;
894 CCOPY(EQ(bits
, 2), &T
, &P2
, sizeof T
);
895 CCOPY(EQ(bits
, 3), &T
, &P3
, sizeof T
);
897 CCOPY(bnz
& qz
, &Q
, &T
, sizeof Q
);
898 CCOPY(bnz
& ~qz
, &Q
, &U
, sizeof Q
);
906 static const unsigned char P256_G
[] = {
907 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
908 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
909 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
910 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
911 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
912 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
913 0x68, 0x37, 0xBF, 0x51, 0xF5
916 static const unsigned char P256_N
[] = {
917 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
918 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
919 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
923 static const unsigned char *
924 api_generator(int curve
, size_t *len
)
927 *len
= sizeof P256_G
;
931 static const unsigned char *
932 api_order(int curve
, size_t *len
)
935 *len
= sizeof P256_N
;
940 api_mul(unsigned char *G
, size_t Glen
,
941 const unsigned char *x
, size_t xlen
, int curve
)
947 r
= p256_decode(&P
, G
, Glen
);
948 p256_mul(&P
, x
, xlen
);
957 api_muladd(unsigned char *A
, const unsigned char *B
, size_t len
,
958 const unsigned char *x
, size_t xlen
,
959 const unsigned char *y
, size_t ylen
, int curve
)
966 r
= p256_decode(&P
, A
, len
);
967 r
&= p256_decode(&Q
, B
, len
);
968 p256_mul(&P
, x
, xlen
);
969 p256_mul(&Q
, y
, ylen
);
972 * The final addition may fail in case both points are equal.
974 t
= p256_add(&P
, &Q
);
975 reduce_final_f256(P
.z
);
977 for (i
= 0; i
< 20; i
++) {
984 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
985 * have the following:
987 * z = 0, t = 0 return P (normal addition)
988 * z = 0, t = 1 return P (normal addition)
989 * z = 1, t = 0 return Q (a 'double' case)
990 * z = 1, t = 1 report an error (P+Q = 0)
992 CCOPY(z
& ~t
, &P
, &Q
, sizeof Q
);
999 /* see bearssl_ec.h */
1000 const br_ec_impl br_ec_p256_i15
= {
1001 (uint32_t)0x00800000,