2 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 #if BR_INT128 || BR_UMUL128
33 static const unsigned char GEN
[] = {
34 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
35 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
36 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
37 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
40 static const unsigned char ORDER
[] = {
41 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
42 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
43 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
44 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
47 static const unsigned char *
48 api_generator(int curve
, size_t *len
)
55 static const unsigned char *
56 api_order(int curve
, size_t *len
)
64 api_xoff(int curve
, size_t *len
)
72 * A field element is encoded as five 64-bit integers, in basis 2^51.
73 * Limbs may be occasionally larger than 2^51, to save on carry
77 #define MASK51 (((uint64_t)1 << 51) - (uint64_t)1)
80 * Swap two field elements, conditionally on a flag.
83 f255_cswap(uint64_t *a
, uint64_t *b
, uint32_t ctl
)
88 w
= m
& (a
[0] ^ b
[0]); a
[0] ^= w
; b
[0] ^= w
;
89 w
= m
& (a
[1] ^ b
[1]); a
[1] ^= w
; b
[1] ^= w
;
90 w
= m
& (a
[2] ^ b
[2]); a
[2] ^= w
; b
[2] ^= w
;
91 w
= m
& (a
[3] ^ b
[3]); a
[3] ^= w
; b
[3] ^= w
;
92 w
= m
& (a
[4] ^ b
[4]); a
[4] ^= w
; b
[4] ^= w
;
96 * Addition with no carry propagation. Limbs double in size.
99 f255_add(uint64_t *d
, const uint64_t *a
, const uint64_t *b
)
110 * On input, limbs must fit on 60 bits each. On output, result is
111 * partially reduced, with max value 2^255+19456; moreover, all
112 * limbs will fit on 51 bits, except the low limb, which may have
113 * value up to 2^51+19455.
116 f255_sub(uint64_t *d
, const uint64_t *a
, const uint64_t *b
)
121 * We compute d = (2^255-19)*1024 + a - b. Since the limbs
122 * fit on 60 bits, the maximum value of operands are slightly
123 * more than 2^264, but much less than 2^265-19456. This
124 * ensures that the result is positive.
128 * Initial carry is 19456, since we add 2^265-19456. Each
129 * individual subtraction may yield a carry up to 513.
131 w
= a
[0] - b
[0] - 19456;
133 cc
= -(w
>> 51) & 0x3FF;
134 w
= a
[1] - b
[1] - cc
;
136 cc
= -(w
>> 51) & 0x3FF;
137 w
= a
[2] - b
[2] - cc
;
139 cc
= -(w
>> 51) & 0x3FF;
140 w
= a
[3] - b
[3] - cc
;
142 cc
= -(w
>> 51) & 0x3FF;
143 d
[4] = ((uint64_t)1 << 61) + a
[4] - b
[4] - cc
;
146 * Partial reduction. The intermediate result may be up to
147 * slightly above 2^265, but less than 2^265+2^255. When we
148 * truncate to 255 bits, the upper bits will be at most 1024.
150 d
[0] += 19 * (d
[4] >> 51);
155 * UMUL51(hi, lo, x, y) computes:
157 * hi = floor((x * y) / (2^51))
158 * lo = x * y mod 2^51
160 * Note that lo < 2^51, but "hi" may be larger, if the input operands are
165 #define UMUL51(hi, lo, x, y) do { \
166 unsigned __int128 umul_tmp; \
167 umul_tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \
168 (hi) = (uint64_t)(umul_tmp >> 51); \
169 (lo) = (uint64_t)umul_tmp & MASK51; \
174 #define UMUL51(hi, lo, x, y) do { \
175 uint64_t umul_hi, umul_lo; \
176 umul_lo = _umul128((x), (y), &umul_hi); \
177 (hi) = (umul_hi << 13) | (umul_lo >> 51); \
178 (lo) = umul_lo & MASK51; \
185 * On input, limbs must fit on 54 bits each.
186 * On output, limb 0 is at most 2^51 + 155647, and other limbs fit
190 f255_mul(uint64_t *d
, uint64_t *a
, uint64_t *b
)
192 uint64_t t
[10], hi
, lo
, w
, cc
;
195 * Perform cross products, accumulating values without carry
198 * Since input limbs fit on 54 bits each, each individual
199 * UMUL51 will produce a "hi" of less than 2^57. The maximum
200 * sum will be at most 5*(2^57-1) + 4*(2^51-1) (for t[5]),
201 * i.e. less than 324*2^51.
204 UMUL51(t
[1], t
[0], a
[0], b
[0]);
206 UMUL51(t
[2], lo
, a
[1], b
[0]); t
[1] += lo
;
207 UMUL51(hi
, lo
, a
[0], b
[1]); t
[1] += lo
; t
[2] += hi
;
209 UMUL51(t
[3], lo
, a
[2], b
[0]); t
[2] += lo
;
210 UMUL51(hi
, lo
, a
[1], b
[1]); t
[2] += lo
; t
[3] += hi
;
211 UMUL51(hi
, lo
, a
[0], b
[2]); t
[2] += lo
; t
[3] += hi
;
213 UMUL51(t
[4], lo
, a
[3], b
[0]); t
[3] += lo
;
214 UMUL51(hi
, lo
, a
[2], b
[1]); t
[3] += lo
; t
[4] += hi
;
215 UMUL51(hi
, lo
, a
[1], b
[2]); t
[3] += lo
; t
[4] += hi
;
216 UMUL51(hi
, lo
, a
[0], b
[3]); t
[3] += lo
; t
[4] += hi
;
218 UMUL51(t
[5], lo
, a
[4], b
[0]); t
[4] += lo
;
219 UMUL51(hi
, lo
, a
[3], b
[1]); t
[4] += lo
; t
[5] += hi
;
220 UMUL51(hi
, lo
, a
[2], b
[2]); t
[4] += lo
; t
[5] += hi
;
221 UMUL51(hi
, lo
, a
[1], b
[3]); t
[4] += lo
; t
[5] += hi
;
222 UMUL51(hi
, lo
, a
[0], b
[4]); t
[4] += lo
; t
[5] += hi
;
224 UMUL51(t
[6], lo
, a
[4], b
[1]); t
[5] += lo
;
225 UMUL51(hi
, lo
, a
[3], b
[2]); t
[5] += lo
; t
[6] += hi
;
226 UMUL51(hi
, lo
, a
[2], b
[3]); t
[5] += lo
; t
[6] += hi
;
227 UMUL51(hi
, lo
, a
[1], b
[4]); t
[5] += lo
; t
[6] += hi
;
229 UMUL51(t
[7], lo
, a
[4], b
[2]); t
[6] += lo
;
230 UMUL51(hi
, lo
, a
[3], b
[3]); t
[6] += lo
; t
[7] += hi
;
231 UMUL51(hi
, lo
, a
[2], b
[4]); t
[6] += lo
; t
[7] += hi
;
233 UMUL51(t
[8], lo
, a
[4], b
[3]); t
[7] += lo
;
234 UMUL51(hi
, lo
, a
[3], b
[4]); t
[7] += lo
; t
[8] += hi
;
236 UMUL51(t
[9], lo
, a
[4], b
[4]); t
[8] += lo
;
239 * The upper words t[5]..t[9] are folded back into the lower
240 * words, using the rule that 2^255 = 19 in the field.
242 * Since each t[i] is less than 324*2^51, the additions below
243 * will yield less than 6480*2^51 in each limb; this fits in
244 * 64 bits (6480*2^51 < 8192*2^51 = 2^64), hence there is
273 * Since the limbs were 64-bit values, the top carry is at
274 * most 8192 (in practice, that cannot be reached). We simply
275 * performed a partial reduction.
281 * Multiplication by A24 = 121665.
282 * Input must have limbs of 60 bits at most.
285 f255_mul_a24(uint64_t *d
, const uint64_t *a
)
287 uint64_t t
[5], cc
, w
;
290 * 121665 = 15 * 8111. We first multiply by 15, with carry
291 * propagation and partial reduction.
307 t
[0] += 19 * (w
>> 51);
310 * Then multiplication by 8111. At that point, we known that
311 * t[0] is less than 2^51 + 19*8192, and other limbs are less
312 * than 2^51; thus, there will be no overflow.
317 w
= t
[1] * 8111 + cc
;
320 w
= t
[2] * 8111 + cc
;
323 w
= t
[3] * 8111 + cc
;
326 w
= t
[4] * 8111 + cc
;
328 d
[0] += 19 * (w
>> 51);
332 * Finalize reduction.
333 * On input, limbs must fit on 51 bits, except possibly the low limb,
334 * which may be slightly above 2^51.
337 f255_final_reduce(uint64_t *a
)
339 uint64_t t
[5], cc
, w
;
342 * We add 19. If the result (in t[]) is below 2^255, then a[]
343 * is already less than 2^255-19, thus already reduced.
344 * Otherwise, we subtract 2^255 from t[], in which case we
345 * have t = a - (2^255-19), and that's our result.
364 * The bit 255 of t is in cc. If that bit is 0, when a[] must
365 * be unchanged; otherwise, it must be replaced with t[].
368 a
[0] ^= cc
& (a
[0] ^ t
[0]);
369 a
[1] ^= cc
& (a
[1] ^ t
[1]);
370 a
[2] ^= cc
& (a
[2] ^ t
[2]);
371 a
[3] ^= cc
& (a
[3] ^ t
[3]);
372 a
[4] ^= cc
& (a
[4] ^ t
[4]);
376 api_mul(unsigned char *G
, size_t Glen
,
377 const unsigned char *kb
, size_t kblen
, int curve
)
380 uint64_t x1
[5], x2
[5], z2
[5], x3
[5], z3
[5];
387 * Points are encoded over exactly 32 bytes. Multipliers must fit
388 * in 32 bytes as well.
390 if (Glen
!= 32 || kblen
> 32) {
395 * RFC 7748 mandates that the high bit of the last point byte must
396 * be ignored/cleared; the "& MASK51" in the initialization for
397 * x1[4] clears that bit.
399 x1
[0] = br_dec64le(&G
[0]) & MASK51
;
400 x1
[1] = (br_dec64le(&G
[6]) >> 3) & MASK51
;
401 x1
[2] = (br_dec64le(&G
[12]) >> 6) & MASK51
;
402 x1
[3] = (br_dec64le(&G
[19]) >> 1) & MASK51
;
403 x1
[4] = (br_dec64le(&G
[24]) >> 12) & MASK51
;
406 * We can use memset() to clear values, because exact-width types
407 * like uint64_t are guaranteed to have no padding bits or
408 * trap representations.
410 memset(x2
, 0, sizeof x2
);
412 memset(z2
, 0, sizeof z2
);
413 memcpy(x3
, x1
, sizeof x1
);
414 memcpy(z3
, x2
, sizeof x2
);
417 * The multiplier is provided in big-endian notation, and
418 * possibly shorter than 32 bytes.
420 memset(k
, 0, (sizeof k
) - kblen
);
421 memcpy(k
+ (sizeof k
) - kblen
, kb
, kblen
);
428 for (i
= 254; i
>= 0; i
--) {
429 uint64_t a
[5], aa
[5], b
[5], bb
[5], e
[5];
430 uint64_t c
[5], d
[5], da
[5], cb
[5];
433 kt
= (k
[31 - (i
>> 3)] >> (i
& 7)) & 1;
435 f255_cswap(x2
, x3
, swap
);
436 f255_cswap(z2
, z3
, swap
);
440 * At that point, limbs of x_2 and z_2 are assumed to fit
441 * on at most 52 bits each.
443 * Each f255_add() adds one bit to the maximum range of
444 * the values, but f255_sub() and f255_mul() bring back
445 * the limbs into 52 bits. All f255_add() outputs are
446 * used only as inputs for f255_mul(), which ensures
447 * that limbs remain in the proper range.
450 /* A = x_2 + z_2 -- limbs fit on 53 bits each */
465 /* C = x_3 + z_3 -- limbs fit on 53 bits each */
477 /* x_3 = (DA + CB)^2 */
478 f255_add(x3
, da
, cb
);
479 f255_mul(x3
, x3
, x3
);
481 /* z_3 = x_1 * (DA - CB)^2 */
482 f255_sub(z3
, da
, cb
);
483 f255_mul(z3
, z3
, z3
);
484 f255_mul(z3
, x1
, z3
);
487 f255_mul(x2
, aa
, bb
);
489 /* z_2 = E * (AA + a24 * E) */
491 f255_add(z2
, aa
, z2
);
495 f255_cswap(x2
, x3
, swap
);
496 f255_cswap(z2
, z3
, swap
);
499 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
500 * most non-squarings. We use x1 and x3, now useless, as temporaries.
502 memcpy(x1
, z2
, sizeof z2
);
503 for (i
= 0; i
< 15; i
++) {
504 f255_mul(x1
, x1
, x1
);
505 f255_mul(x1
, x1
, z2
);
507 memcpy(x3
, x1
, sizeof x1
);
508 for (i
= 0; i
< 14; i
++) {
511 for (j
= 0; j
< 16; j
++) {
512 f255_mul(x3
, x3
, x3
);
514 f255_mul(x3
, x3
, x1
);
516 for (i
= 14; i
>= 0; i
--) {
517 f255_mul(x3
, x3
, x3
);
518 if ((0xFFEB >> i
) & 1) {
519 f255_mul(x3
, z2
, x3
);
524 * Compute x2/z2. We have 1/z2 in x3.
526 f255_mul(x2
, x2
, x3
);
527 f255_final_reduce(x2
);
530 * Encode the final x2 value in little-endian. We first assemble
531 * the limbs into 64-bit values.
533 x2
[0] |= x2
[1] << 51;
534 x2
[1] = (x2
[1] >> 13) | (x2
[2] << 38);
535 x2
[2] = (x2
[2] >> 26) | (x2
[3] << 25);
536 x2
[3] = (x2
[3] >> 39) | (x2
[4] << 12);
537 br_enc64le(G
, x2
[0]);
538 br_enc64le(G
+ 8, x2
[1]);
539 br_enc64le(G
+ 16, x2
[2]);
540 br_enc64le(G
+ 24, x2
[3]);
545 api_mulgen(unsigned char *R
,
546 const unsigned char *x
, size_t xlen
, int curve
)
548 const unsigned char *G
;
551 G
= api_generator(curve
, &Glen
);
553 api_mul(R
, Glen
, x
, xlen
, curve
);
558 api_muladd(unsigned char *A
, const unsigned char *B
, size_t len
,
559 const unsigned char *x
, size_t xlen
,
560 const unsigned char *y
, size_t ylen
, int curve
)
563 * We don't implement this method, since it is used for ECDSA
564 * only, and there is no ECDSA over Curve25519 (which instead
578 /* see bearssl_ec.h */
579 const br_ec_impl br_ec_c25519_m62
= {
580 (uint32_t)0x20000000,
589 /* see bearssl_ec.h */
591 br_ec_c25519_m62_get(void)
593 return &br_ec_c25519_m62
;
598 /* see bearssl_ec.h */
600 br_ec_c25519_m62_get(void)