2 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 #if BR_INT128 || BR_UMUL128
/*
 * Conventional generator for Curve25519: u-coordinate 9, encoded
 * over 32 bytes in unsigned little-endian convention.
 */
static const unsigned char GEN[] = {
	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
/*
 * "Order" value, big-endian. This is 2^255 - 1, not the true group
 * order; NOTE(review): presumably it is only used to size/accept
 * multiplier values — confirm against the generic EC callers.
 */
static const unsigned char ORDER[] = {
	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};
43 static const unsigned char *
44 api_generator(int curve
, size_t *len
)
51 static const unsigned char *
52 api_order(int curve
, size_t *len
)
/*
 * Return the offset and length of the X (u) coordinate within an
 * encoded point. For Curve25519, points are just the 32-byte
 * u-coordinate, so the offset is 0 and the length is 32.
 */
static size_t
api_xoff(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return 0;
}
/*
 * A field element is encoded as five 64-bit integers, in basis 2^51.
 * Limbs may be occasionally larger than 2^51, to save on carry
 * propagation costs.
 */

#define MASK51   (((uint64_t)1 << 51) - (uint64_t)1)
/*
 * Swap two field elements, conditionally on a flag. The flag MUST
 * be 0 (no swap) or 1 (swap); the XOR-mask technique below avoids
 * any data-dependent branch.
 */
static inline void
f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
{
	uint64_t m, w;

	/* m is all-zeros if ctl == 0, all-ones if ctl == 1. */
	m = -(uint64_t)ctl;
	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
	w = m & (a[4] ^ b[4]); a[4] ^= w; b[4] ^= w;
}
/*
 * Addition with no carry propagation. Limbs double in size.
 */
static inline void
f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
	d[0] = a[0] + b[0];
	d[1] = a[1] + b[1];
	d[2] = a[2] + b[2];
	d[3] = a[3] + b[3];
	d[4] = a[4] + b[4];
}
106 * On input, limbs must fit on 60 bits each. On output, result is
107 * partially reduced, with max value 2^255+19456; moreover, all
108 * limbs will fit on 51 bits, except the low limb, which may have
109 * value up to 2^51+19455.
112 f255_sub(uint64_t *d
, const uint64_t *a
, const uint64_t *b
)
117 * We compute d = (2^255-19)*1024 + a - b. Since the limbs
118 * fit on 60 bits, the maximum value of operands are slightly
119 * more than 2^264, but much less than 2^265-19456. This
120 * ensures that the result is positive.
124 * Initial carry is 19456, since we add 2^265-19456. Each
125 * individual subtraction may yield a carry up to 513.
127 w
= a
[0] - b
[0] - 19456;
129 cc
= -(w
>> 51) & 0x3FF;
130 w
= a
[1] - b
[1] - cc
;
132 cc
= -(w
>> 51) & 0x3FF;
133 w
= a
[2] - b
[2] - cc
;
135 cc
= -(w
>> 51) & 0x3FF;
136 w
= a
[3] - b
[3] - cc
;
138 cc
= -(w
>> 51) & 0x3FF;
139 d
[4] = ((uint64_t)1 << 61) + a
[4] - b
[4] - cc
;
142 * Partial reduction. The intermediate result may be up to
143 * slightly above 2^265, but less than 2^265+2^255. When we
144 * truncate to 255 bits, the upper bits will be at most 1024.
146 d
[0] += 19 * (d
[4] >> 51);
/*
 * UMUL51(hi, lo, x, y) computes:
 *
 *   hi = floor((x * y) / (2^51))
 *   lo = x * y mod 2^51
 *
 * Note that lo < 2^51, but "hi" may be larger, if the input operands
 * are larger.
 */
#if BR_INT128

#define UMUL51(hi, lo, x, y)   do { \
		unsigned __int128 umul_tmp; \
		umul_tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \
		(hi) = (uint64_t)(umul_tmp >> 51); \
		(lo) = (uint64_t)umul_tmp & MASK51; \
	} while (0)

#elif BR_UMUL128

#define UMUL51(hi, lo, x, y)   do { \
		uint64_t umul_hi, umul_lo; \
		umul_lo = _umul128((x), (y), &umul_hi); \
		(hi) = (umul_hi << 13) | (umul_lo >> 51); \
		(lo) = umul_lo & MASK51; \
	} while (0)

#endif
181 * On input, limbs must fit on 54 bits each.
182 * On output, limb 0 is at most 2^51 + 155647, and other limbs fit
186 f255_mul(uint64_t *d
, uint64_t *a
, uint64_t *b
)
188 uint64_t t
[10], hi
, lo
, w
, cc
;
191 * Perform cross products, accumulating values without carry
194 * Since input limbs fit on 54 bits each, each individual
195 * UMUL51 will produce a "hi" of less than 2^57. The maximum
196 * sum will be at most 5*(2^57-1) + 4*(2^51-1) (for t[5]),
197 * i.e. less than 324*2^51.
200 UMUL51(t
[1], t
[0], a
[0], b
[0]);
202 UMUL51(t
[2], lo
, a
[1], b
[0]); t
[1] += lo
;
203 UMUL51(hi
, lo
, a
[0], b
[1]); t
[1] += lo
; t
[2] += hi
;
205 UMUL51(t
[3], lo
, a
[2], b
[0]); t
[2] += lo
;
206 UMUL51(hi
, lo
, a
[1], b
[1]); t
[2] += lo
; t
[3] += hi
;
207 UMUL51(hi
, lo
, a
[0], b
[2]); t
[2] += lo
; t
[3] += hi
;
209 UMUL51(t
[4], lo
, a
[3], b
[0]); t
[3] += lo
;
210 UMUL51(hi
, lo
, a
[2], b
[1]); t
[3] += lo
; t
[4] += hi
;
211 UMUL51(hi
, lo
, a
[1], b
[2]); t
[3] += lo
; t
[4] += hi
;
212 UMUL51(hi
, lo
, a
[0], b
[3]); t
[3] += lo
; t
[4] += hi
;
214 UMUL51(t
[5], lo
, a
[4], b
[0]); t
[4] += lo
;
215 UMUL51(hi
, lo
, a
[3], b
[1]); t
[4] += lo
; t
[5] += hi
;
216 UMUL51(hi
, lo
, a
[2], b
[2]); t
[4] += lo
; t
[5] += hi
;
217 UMUL51(hi
, lo
, a
[1], b
[3]); t
[4] += lo
; t
[5] += hi
;
218 UMUL51(hi
, lo
, a
[0], b
[4]); t
[4] += lo
; t
[5] += hi
;
220 UMUL51(t
[6], lo
, a
[4], b
[1]); t
[5] += lo
;
221 UMUL51(hi
, lo
, a
[3], b
[2]); t
[5] += lo
; t
[6] += hi
;
222 UMUL51(hi
, lo
, a
[2], b
[3]); t
[5] += lo
; t
[6] += hi
;
223 UMUL51(hi
, lo
, a
[1], b
[4]); t
[5] += lo
; t
[6] += hi
;
225 UMUL51(t
[7], lo
, a
[4], b
[2]); t
[6] += lo
;
226 UMUL51(hi
, lo
, a
[3], b
[3]); t
[6] += lo
; t
[7] += hi
;
227 UMUL51(hi
, lo
, a
[2], b
[4]); t
[6] += lo
; t
[7] += hi
;
229 UMUL51(t
[8], lo
, a
[4], b
[3]); t
[7] += lo
;
230 UMUL51(hi
, lo
, a
[3], b
[4]); t
[7] += lo
; t
[8] += hi
;
232 UMUL51(t
[9], lo
, a
[4], b
[4]); t
[8] += lo
;
235 * The upper words t[5]..t[9] are folded back into the lower
236 * words, using the rule that 2^255 = 19 in the field.
238 * Since each t[i] is less than 324*2^51, the additions below
239 * will yield less than 6480*2^51 in each limb; this fits in
240 * 64 bits (6480*2^51 < 8192*2^51 = 2^64), hence there is
269 * Since the limbs were 64-bit values, the top carry is at
270 * most 8192 (in practice, that cannot be reached). We simply
271 * performed a partial reduction.
277 * Multiplication by A24 = 121665.
278 * Input must have limbs of 60 bits at most.
281 f255_mul_a24(uint64_t *d
, const uint64_t *a
)
283 uint64_t t
[5], cc
, w
;
286 * 121665 = 15 * 8111. We first multiply by 15, with carry
287 * propagation and partial reduction.
303 t
[0] += 19 * (w
>> 51);
306 * Then multiplication by 8111. At that point, we known that
307 * t[0] is less than 2^51 + 19*8192, and other limbs are less
308 * than 2^51; thus, there will be no overflow.
313 w
= t
[1] * 8111 + cc
;
316 w
= t
[2] * 8111 + cc
;
319 w
= t
[3] * 8111 + cc
;
322 w
= t
[4] * 8111 + cc
;
324 d
[0] += 19 * (w
>> 51);
328 * Finalize reduction.
329 * On input, limbs must fit on 51 bits, except possibly the low limb,
330 * which may be slightly above 2^51.
333 f255_final_reduce(uint64_t *a
)
335 uint64_t t
[5], cc
, w
;
338 * We add 19. If the result (in t[]) is below 2^255, then a[]
339 * is already less than 2^255-19, thus already reduced.
340 * Otherwise, we subtract 2^255 from t[], in which case we
341 * have t = a - (2^255-19), and that's our result.
360 * The bit 255 of t is in cc. If that bit is 0, when a[] must
361 * be unchanged; otherwise, it must be replaced with t[].
364 a
[0] ^= cc
& (a
[0] ^ t
[0]);
365 a
[1] ^= cc
& (a
[1] ^ t
[1]);
366 a
[2] ^= cc
& (a
[2] ^ t
[2]);
367 a
[3] ^= cc
& (a
[3] ^ t
[3]);
368 a
[4] ^= cc
& (a
[4] ^ t
[4]);
372 api_mul(unsigned char *G
, size_t Glen
,
373 const unsigned char *kb
, size_t kblen
, int curve
)
376 uint64_t x1
[5], x2
[5], z2
[5], x3
[5], z3
[5];
383 * Points are encoded over exactly 32 bytes. Multipliers must fit
384 * in 32 bytes as well.
386 if (Glen
!= 32 || kblen
> 32) {
391 * RFC 7748 mandates that the high bit of the last point byte must
392 * be ignored/cleared; the "& MASK51" in the initialization for
393 * x1[4] clears that bit.
395 x1
[0] = br_dec64le(&G
[0]) & MASK51
;
396 x1
[1] = (br_dec64le(&G
[6]) >> 3) & MASK51
;
397 x1
[2] = (br_dec64le(&G
[12]) >> 6) & MASK51
;
398 x1
[3] = (br_dec64le(&G
[19]) >> 1) & MASK51
;
399 x1
[4] = (br_dec64le(&G
[24]) >> 12) & MASK51
;
402 * We can use memset() to clear values, because exact-width types
403 * like uint64_t are guaranteed to have no padding bits or
404 * trap representations.
406 memset(x2
, 0, sizeof x2
);
408 memset(z2
, 0, sizeof z2
);
409 memcpy(x3
, x1
, sizeof x1
);
410 memcpy(z3
, x2
, sizeof x2
);
413 * The multiplier is provided in big-endian notation, and
414 * possibly shorter than 32 bytes.
416 memset(k
, 0, (sizeof k
) - kblen
);
417 memcpy(k
+ (sizeof k
) - kblen
, kb
, kblen
);
424 for (i
= 254; i
>= 0; i
--) {
425 uint64_t a
[5], aa
[5], b
[5], bb
[5], e
[5];
426 uint64_t c
[5], d
[5], da
[5], cb
[5];
429 kt
= (k
[31 - (i
>> 3)] >> (i
& 7)) & 1;
431 f255_cswap(x2
, x3
, swap
);
432 f255_cswap(z2
, z3
, swap
);
436 * At that point, limbs of x_2 and z_2 are assumed to fit
437 * on at most 52 bits each.
439 * Each f255_add() adds one bit to the maximum range of
440 * the values, but f255_sub() and f255_mul() bring back
441 * the limbs into 52 bits. All f255_add() outputs are
442 * used only as inputs for f255_mul(), which ensures
443 * that limbs remain in the proper range.
446 /* A = x_2 + z_2 -- limbs fit on 53 bits each */
461 /* C = x_3 + z_3 -- limbs fit on 53 bits each */
473 /* x_3 = (DA + CB)^2 */
474 f255_add(x3
, da
, cb
);
475 f255_mul(x3
, x3
, x3
);
477 /* z_3 = x_1 * (DA - CB)^2 */
478 f255_sub(z3
, da
, cb
);
479 f255_mul(z3
, z3
, z3
);
480 f255_mul(z3
, x1
, z3
);
483 f255_mul(x2
, aa
, bb
);
485 /* z_2 = E * (AA + a24 * E) */
487 f255_add(z2
, aa
, z2
);
491 f255_cswap(x2
, x3
, swap
);
492 f255_cswap(z2
, z3
, swap
);
495 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
496 * most non-squarings. We use x1 and x3, now useless, as temporaries.
498 memcpy(x1
, z2
, sizeof z2
);
499 for (i
= 0; i
< 15; i
++) {
500 f255_mul(x1
, x1
, x1
);
501 f255_mul(x1
, x1
, z2
);
503 memcpy(x3
, x1
, sizeof x1
);
504 for (i
= 0; i
< 14; i
++) {
507 for (j
= 0; j
< 16; j
++) {
508 f255_mul(x3
, x3
, x3
);
510 f255_mul(x3
, x3
, x1
);
512 for (i
= 14; i
>= 0; i
--) {
513 f255_mul(x3
, x3
, x3
);
514 if ((0xFFEB >> i
) & 1) {
515 f255_mul(x3
, z2
, x3
);
520 * Compute x2/z2. We have 1/z2 in x3.
522 f255_mul(x2
, x2
, x3
);
523 f255_final_reduce(x2
);
526 * Encode the final x2 value in little-endian. We first assemble
527 * the limbs into 64-bit values.
529 x2
[0] |= x2
[1] << 51;
530 x2
[1] = (x2
[1] >> 13) | (x2
[2] << 38);
531 x2
[2] = (x2
[2] >> 26) | (x2
[3] << 25);
532 x2
[3] = (x2
[3] >> 39) | (x2
[4] << 12);
533 br_enc64le(G
, x2
[0]);
534 br_enc64le(G
+ 8, x2
[1]);
535 br_enc64le(G
+ 16, x2
[2]);
536 br_enc64le(G
+ 24, x2
[3]);
/*
 * Multiply the conventional generator by the scalar x (big-endian,
 * xlen bytes), writing the resulting 32-byte point into R[].
 * Returns the encoded point length (32).
 */
static size_t
api_mulgen(unsigned char *R,
	const unsigned char *x, size_t xlen, int curve)
{
	const unsigned char *G;
	size_t Glen;

	G = api_generator(curve, &Glen);
	memcpy(R, G, Glen);
	api_mul(R, Glen, x, xlen, curve);
	return Glen;
}
/*
 * muladd (A = x*A + y*B) is not implemented.
 */
static uint32_t
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
	const unsigned char *x, size_t xlen,
	const unsigned char *y, size_t ylen, int curve)
{
	/*
	 * We don't implement this method, since it is used for ECDSA
	 * only, and there is no ECDSA over Curve25519 (which instead
	 * uses EdDSA).
	 */
	(void)A;
	(void)B;
	(void)len;
	(void)x;
	(void)xlen;
	(void)y;
	(void)ylen;
	(void)curve;
	return 0;
}
574 /* see bearssl_ec.h */
575 const br_ec_impl br_ec_c25519_m62
= {
576 (uint32_t)0x20000000,
585 /* see bearssl_ec.h */
587 br_ec_c25519_m62_get(void)
589 return &br_ec_c25519_m62
;
594 /* see bearssl_ec.h */
596 br_ec_c25519_m62_get(void)