/*
 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
27 #if BR_INT128 || BR_UMUL128
/*
 * Conventional generator for Curve25519: u = 9, encoded over 32 bytes
 * in little-endian (RFC 7748).
 */
static const unsigned char GEN[] = {
	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
/*
 * Order value returned by api_order(), in unsigned big-endian
 * notation (32 bytes).
 */
static const unsigned char ORDER[] = {
	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};
47 static const unsigned char *
48 api_generator(int curve
, size_t *len
)
55 static const unsigned char *
56 api_order(int curve
, size_t *len
)
/*
 * Return the offset (0) and length (32) of the X coordinate within an
 * encoded point. Curve25519 points are encoded as the bare u
 * coordinate, so X starts at offset 0.
 */
static size_t
api_xoff(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return 0;
}
/*
 * A field element is encoded as four 64-bit integers, in basis 2^63.
 * Operations return partially reduced values, which may range up to
 * 2^255+37.
 */
#define MASK63   (((uint64_t)1 << 63) - (uint64_t)1)
/*
 * Swap two field elements, conditionally on a flag: if ctl is 1, the
 * contents of a[] and b[] are exchanged; if ctl is 0, both arrays are
 * left unchanged. The operation is constant-time (no data-dependent
 * branch or memory access).
 */
static inline void
f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
{
	uint64_t m, w;

	/* m is all-ones if ctl == 1, all-zeros if ctl == 0. */
	m = -(uint64_t)ctl;
	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
}
/*
 * Addition in the field. Operands and result are in partially reduced
 * form (up to 2^255+37); d[] may alias a[] or b[].
 */
static inline void
f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, cc;
	unsigned __int128 z;

	/* Raw 256-bit addition over the four 64-bit limbs. */
	z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	cc = (uint64_t)(z >> 63);

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, cc;
	unsigned char k;

	/* Raw 256-bit addition with the add-with-carry intrinsic. */
	k = _addcarry_u64(0, a[0], b[0], &t0);
	k = _addcarry_u64(k, a[1], b[1], &t1);
	k = _addcarry_u64(k, a[2], b[2], &t2);
	k = _addcarry_u64(k, a[3], b[3], &t3);
	cc = ((uint64_t)k << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}
/*
 * Subtraction in the field. Operands and result are in partially
 * reduced form (up to 2^255+37); d[] may alias a[] or b[].
 */
static inline void
f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	/*
	 * We compute t = 2^256 - 38 + a - b, which is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
	 */

	uint64_t t0, t1, t2, t3, t4, cc;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
	t0 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
		- (unsigned __int128)cc;
	t1 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
		- (unsigned __int128)cc;
	t2 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
		- (unsigned __int128)cc;
	t3 = (uint64_t)z;
	t4 = 1 + (uint64_t)(z >> 64);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	cc = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	z = (unsigned __int128)t0 + (unsigned __int128)cc;
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	/*
	 * We compute t = 2^256 - 38 + a - b, which is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
	 */

	uint64_t t0, t1, t2, t3, t4;
	unsigned char k;

	/* a - b, with t4 starting at 1 to represent the 2^256 term. */
	k = _subborrow_u64(0, a[0], b[0], &t0);
	k = _subborrow_u64(k, a[1], b[1], &t1);
	k = _subborrow_u64(k, a[2], b[2], &t2);
	k = _subborrow_u64(k, a[3], b[3], &t3);
	(void)_subborrow_u64(k, 1, 0, &t4);

	/* Subtract the 38. */
	k = _subborrow_u64(0, t0, 38, &t0);
	k = _subborrow_u64(k, t1, 0, &t1);
	k = _subborrow_u64(k, t2, 0, &t2);
	k = _subborrow_u64(k, t3, 0, &t3);
	(void)_subborrow_u64(k, t4, 0, &t4);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	t4 = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, t4, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}
/*
 * Multiplication in the field. Operands and result are in partially
 * reduced form (up to 2^255+37). The d[] array may alias a[] or b[].
 */
static inline void
f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
	unsigned __int128 z;

	/*
	 * Compute the product a*b over plain integers.
	 */
	z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z;
	t4 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
		+ (unsigned __int128)t1;
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
		+ (unsigned __int128)t2 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	t5 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
		+ (unsigned __int128)t2;
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	t6 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
		+ (unsigned __int128)t3;
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
		+ (unsigned __int128)t6 + (z >> 64);
	t6 = (uint64_t)z;
	t7 = (uint64_t)(z >> 64);

	/*
	 * Modular reduction: we use 2^255 = 19 mod p, and also:
	 * 2^510 = 19*19 = 361
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
	 */
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	z = (unsigned __int128)t4 * 19;
	t4 = (uint64_t)z;
	z = (unsigned __int128)t5 * 19 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)t6 * 19 + (z >> 64);
	t6 = (uint64_t)z;
	z = (unsigned __int128)t7 * 19 + (z >> 64);
	t7 = (uint64_t)z & MASK63;

	th = (361 & -th) + (19 * (uint64_t)(z >> 63));

	/*
	 * Add elements together.
	 * At this point:
	 *   t0..t3 fits on 255 bits.
	 *   t4..t7 fits on 255 bits.
	 *   th <= 361 + 342 = 703.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)t4
		+ (unsigned __int128)th;
	t0 = (uint64_t)z;
	z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	th = (uint64_t)(z >> 63);

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (19 * th);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
	uint64_t h0, h1, h2, h3;
	unsigned char k;

	/*
	 * Compute the product a*b over plain integers.
	 */
	t0 = _umul128(a[0], b[0], &h0);
	t1 = _umul128(a[0], b[1], &h1);
	k = _addcarry_u64(0, t1, h0, &t1);
	t2 = _umul128(a[0], b[2], &h2);
	k = _addcarry_u64(k, t2, h1, &t2);
	t3 = _umul128(a[0], b[3], &h3);
	k = _addcarry_u64(k, t3, h2, &t3);
	(void)_addcarry_u64(k, h3, 0, &t4);

	k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
	k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
	t5 = k;
	k = _addcarry_u64(0, t2, h0, &t2);
	k = _addcarry_u64(k, t3, h1, &t3);
	k = _addcarry_u64(k, t4, h2, &t4);
	(void)_addcarry_u64(k, t5, h3, &t5);

	k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
	t6 = k;
	k = _addcarry_u64(0, t3, h0, &t3);
	k = _addcarry_u64(k, t4, h1, &t4);
	k = _addcarry_u64(k, t5, h2, &t5);
	(void)_addcarry_u64(k, t6, h3, &t6);

	k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
	k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
	t7 = k;
	k = _addcarry_u64(0, t4, h0, &t4);
	k = _addcarry_u64(k, t5, h1, &t5);
	k = _addcarry_u64(k, t6, h2, &t6);
	(void)_addcarry_u64(k, t7, h3, &t7);

	/*
	 * Modular reduction: we use 2^255 = 19 mod p, and also:
	 * 2^510 = 19*19 = 361
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
	 */
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	t4 = _umul128(t4, 19, &h0);
	t5 = _umul128(t5, 19, &h1);
	t6 = _umul128(t6, 19, &h2);
	t7 = _umul128(t7, 19, &h3);
	k = _addcarry_u64(0, t5, h0, &t5);
	k = _addcarry_u64(k, t6, h1, &t6);
	k = _addcarry_u64(k, t7, h2, &t7);
	(void)_addcarry_u64(k, h3, 0, &h3);
	th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
	t7 &= MASK63;

	/*
	 * Add elements together.
	 * At this point:
	 *   t0..t3 fits on 255 bits.
	 *   t4..t7 fits on 255 bits.
	 *   th <= 361 + 342 = 703.
	 */
	k = _addcarry_u64(0, t0, t4, &t0);
	k = _addcarry_u64(k, t1, t5, &t1);
	k = _addcarry_u64(k, t2, t6, &t2);
	k = _addcarry_u64(k, t3, t7, &t3);
	t4 = k;
	k = _addcarry_u64(0, t0, th, &t0);
	k = _addcarry_u64(k, t1, 0, &t1);
	k = _addcarry_u64(k, t2, 0, &t2);
	k = _addcarry_u64(k, t3, 0, &t3);
	(void)_addcarry_u64(k, t4, 0, &t4);

	th = (t4 << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
	 */
	k = _addcarry_u64(0, t0, 19 * th, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}
/*
 * Multiplication by A24 = 121665. Operand and result are in partially
 * reduced form (up to 2^255+37); d[] may alias a[].
 */
static inline void
f255_mul_a24(uint64_t *d, const uint64_t *a)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3;
	unsigned __int128 z;

	/* Plain 256x17-bit product over the four limbs. */
	z = (unsigned __int128)a[0] * 121665;
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] * 121665 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] * 121665 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[3] * 121665 + (z >> 64);
	t3 = (uint64_t)z & MASK63;

	/* Fold bits 255+ back in with factor 19 (2^255 = 19 mod p). */
	z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));
	t0 = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	t2 = (uint64_t)z;
	t3 = t3 + (uint64_t)(z >> 64);

	/* One more folding round for the possible extra top bit. */
	z = (unsigned __int128)t0 + (19 & -(t3 >> 63));
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;
	unsigned char k;

	/* Plain 256x17-bit product over the four limbs. */
	t0 = _umul128(a[0], 121665, &h0);
	t1 = _umul128(a[1], 121665, &h1);
	k = _addcarry_u64(0, t1, h0, &t1);
	t2 = _umul128(a[2], 121665, &h2);
	k = _addcarry_u64(k, t2, h1, &t2);
	t3 = _umul128(a[3], 121665, &h3);
	k = _addcarry_u64(k, t3, h2, &t3);
	(void)_addcarry_u64(k, h3, 0, &t4);

	/* Fold bits 255+ back in with factor 19 (2^255 = 19 mod p). */
	t4 = (t4 << 1) + (t3 >> 63);
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, 19 * t4, &t0);
	k = _addcarry_u64(k, t1, 0, &t1);
	k = _addcarry_u64(k, t2, 0, &t2);
	(void)_addcarry_u64(k, t3, 0, &t3);

	/* One more folding round for the possible extra top bit. */
	t4 = 19 & -(t3 >> 63);
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, t4, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}
/*
 * Finalize reduction: bring a partially reduced value a[] (up to
 * 2^255+37) down to its canonical representative in [0, p-1],
 * in constant time.
 */
static inline void
f255_final_reduce(uint64_t *a)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, m;
	unsigned __int128 z;

	/*
	 * We add 19. If the result (in t) is below 2^255, then a[]
	 * is already less than 2^255-19, thus already reduced.
	 * Otherwise, we subtract 2^255 from t[], in which case we
	 * have t = a - (2^255-19), and that's our result.
	 */
	z = (unsigned __int128)a[0] + 19;
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] + (z >> 64);
	t2 = (uint64_t)z;
	t3 = a[3] + (uint64_t)(z >> 64);

	/* m is all-ones when bit 255 of t is set (a >= p). */
	m = -(t3 >> 63);
	t3 &= MASK63;
	a[0] ^= m & (a[0] ^ t0);
	a[1] ^= m & (a[1] ^ t1);
	a[2] ^= m & (a[2] ^ t2);
	a[3] ^= m & (a[3] ^ t3);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, m;
	unsigned char k;

	/*
	 * We add 19. If the result (in t) is below 2^255, then a[]
	 * is already less than 2^255-19, thus already reduced.
	 * Otherwise, we subtract 2^255 from t[], in which case we
	 * have t = a - (2^255-19), and that's our result.
	 */
	k = _addcarry_u64(0, a[0], 19, &t0);
	k = _addcarry_u64(k, a[1], 0, &t1);
	k = _addcarry_u64(k, a[2], 0, &t2);
	(void)_addcarry_u64(k, a[3], 0, &t3);

	/* m is all-ones when bit 255 of t is set (a >= p). */
	m = -(t3 >> 63);
	t3 &= MASK63;
	a[0] ^= m & (a[0] ^ t0);
	a[1] ^= m & (a[1] ^ t1);
	a[2] ^= m & (a[2] ^ t2);
	a[3] ^= m & (a[3] ^ t3);

#endif
}
620 api_mul(unsigned char *G
, size_t Glen
,
621 const unsigned char *kb
, size_t kblen
, int curve
)
624 uint64_t x1
[4], x2
[4], z2
[4], x3
[4], z3
[4];
631 * Points are encoded over exactly 32 bytes. Multipliers must fit
632 * in 32 bytes as well.
634 if (Glen
!= 32 || kblen
> 32) {
639 * RFC 7748 mandates that the high bit of the last point byte must
640 * be ignored/cleared.
642 x1
[0] = br_dec64le(&G
[ 0]);
643 x1
[1] = br_dec64le(&G
[ 8]);
644 x1
[2] = br_dec64le(&G
[16]);
645 x1
[3] = br_dec64le(&G
[24]) & MASK63
;
648 * We can use memset() to clear values, because exact-width types
649 * like uint64_t are guaranteed to have no padding bits or
650 * trap representations.
652 memset(x2
, 0, sizeof x2
);
654 memset(z2
, 0, sizeof z2
);
655 memcpy(x3
, x1
, sizeof x1
);
656 memcpy(z3
, x2
, sizeof x2
);
659 * The multiplier is provided in big-endian notation, and
660 * possibly shorter than 32 bytes.
662 memset(k
, 0, (sizeof k
) - kblen
);
663 memcpy(k
+ (sizeof k
) - kblen
, kb
, kblen
);
670 for (i
= 254; i
>= 0; i
--) {
671 uint64_t a
[4], aa
[4], b
[4], bb
[4], e
[4];
672 uint64_t c
[4], d
[4], da
[4], cb
[4];
675 kt
= (k
[31 - (i
>> 3)] >> (i
& 7)) & 1;
677 f255_cswap(x2
, x3
, swap
);
678 f255_cswap(z2
, z3
, swap
);
708 /* x_3 = (DA + CB)^2 */
709 f255_add(x3
, da
, cb
);
710 f255_mul(x3
, x3
, x3
);
712 /* z_3 = x_1 * (DA - CB)^2 */
713 f255_sub(z3
, da
, cb
);
714 f255_mul(z3
, z3
, z3
);
715 f255_mul(z3
, x1
, z3
);
718 f255_mul(x2
, aa
, bb
);
720 /* z_2 = E * (AA + a24 * E) */
722 f255_add(z2
, aa
, z2
);
726 f255_cswap(x2
, x3
, swap
);
727 f255_cswap(z2
, z3
, swap
);
730 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
731 * most non-squarings. We use x1 and x3, now useless, as temporaries.
733 memcpy(x1
, z2
, sizeof z2
);
734 for (i
= 0; i
< 15; i
++) {
735 f255_mul(x1
, x1
, x1
);
736 f255_mul(x1
, x1
, z2
);
738 memcpy(x3
, x1
, sizeof x1
);
739 for (i
= 0; i
< 14; i
++) {
742 for (j
= 0; j
< 16; j
++) {
743 f255_mul(x3
, x3
, x3
);
745 f255_mul(x3
, x3
, x1
);
747 for (i
= 14; i
>= 0; i
--) {
748 f255_mul(x3
, x3
, x3
);
749 if ((0xFFEB >> i
) & 1) {
750 f255_mul(x3
, z2
, x3
);
755 * Compute x2/z2. We have 1/z2 in x3.
757 f255_mul(x2
, x2
, x3
);
758 f255_final_reduce(x2
);
761 * Encode the final x2 value in little-endian.
763 br_enc64le(G
, x2
[0]);
764 br_enc64le(G
+ 8, x2
[1]);
765 br_enc64le(G
+ 16, x2
[2]);
766 br_enc64le(G
+ 24, x2
[3]);
/*
 * Compute x*G into R[], where G is the conventional generator.
 * Returns the encoded point length (32).
 */
static size_t
api_mulgen(unsigned char *R,
	const unsigned char *x, size_t xlen, int curve)
{
	const unsigned char *G;
	size_t Glen;

	G = api_generator(curve, &Glen);
	memcpy(R, G, Glen);
	api_mul(R, Glen, x, xlen, curve);
	return Glen;
}
/*
 * muladd (A = x*A + y*B) is deliberately not implemented: it is needed
 * for ECDSA only, and there is no ECDSA over Curve25519 (EdDSA is used
 * instead). Always returns 0 (failure).
 */
static uint32_t
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
	const unsigned char *x, size_t xlen,
	const unsigned char *y, size_t ylen, int curve)
{
	(void)A;
	(void)B;
	(void)len;
	(void)x;
	(void)xlen;
	(void)y;
	(void)ylen;
	(void)curve;
	return 0;
}
804 /* see bearssl_ec.h */
805 const br_ec_impl br_ec_c25519_m64
= {
806 (uint32_t)0x20000000,
815 /* see bearssl_ec.h */
817 br_ec_c25519_m64_get(void)
819 return &br_ec_c25519_m64
;
824 /* see bearssl_ec.h */
826 br_ec_c25519_m64_get(void)