_ Git - BearSSL/blob - src/ec/ec_p256_m31.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
  29  * that right-shifting a signed negative integer copies the sign bit
  30  * (arithmetic right-shift). This is "implementation-defined behaviour",
  31  * i.e. it is not undefined, but it may differ between compilers. Each
  32  * compiler is supposed to document its behaviour in that respect. GCC
  33  * explicitly defines that an arithmetic right shift is used. We expect
  34  * all other compilers to do the same, because underlying CPU offer an
  35  * arithmetic right shift opcode that could not be used otherwise.
  36  */
  37 #if BR_NO_ARITH_SHIFT
  38 #define ARSH(x, n)    (((uint32_t)(x) >> (n)) \
  39                       | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
  40 #define ARSHW(x, n)   (((uint64_t)(x) >> (n)) \
  41                       | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
  42 #else
  43 #define ARSH(x, n)    ((*(int32_t *)&(x)) >> (n))
  44 #define ARSHW(x, n)   ((*(int64_t *)&(x)) >> (n))
  45 #endif
  46
  47 /*
  48  * Convert an integer from unsigned big-endian encoding to a sequence of
  49  * 30-bit words in little-endian order. The final "partial" word is
  50  * returned.
  51  */
  52 static uint32_t
  53 be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
  54 {
  55         uint32_t acc;
  56         int acc_len;
  57
  58         acc = 0;
  59         acc_len = 0;
  60         while (len -- > 0) {
  61                 uint32_t b;
  62
  63                 b = src[len];
  64                 if (acc_len < 22) {
  65                         acc |= b << acc_len;
  66                         acc_len += 8;
  67                 } else {
  68                         *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
  69                         acc = b >> (30 - acc_len);
  70                         acc_len -= 22;
  71                 }
  72         }
  73         return acc;
  74 }
  75
  76 /*
  77  * Convert an integer (30-bit words, little-endian) to unsigned
  78  * big-endian encoding. The total encoding length is provided; all
  79  * the destination bytes will be filled.
  80  */
  81 static void
  82 le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
  83 {
  84         uint32_t acc;
  85         int acc_len;
  86
  87         acc = 0;
  88         acc_len = 0;
  89         while (len -- > 0) {
  90                 if (acc_len < 8) {
  91                         uint32_t w;
  92
  93                         w = *src ++;
  94                         dst[len] = (unsigned char)(acc | (w << acc_len));
  95                         acc = w >> (8 - acc_len);
  96                         acc_len += 22;
  97                 } else {
  98                         dst[len] = (unsigned char)acc;
  99                         acc >>= 8;
 100                         acc_len -= 8;
 101                 }
 102         }
 103 }
 104
 105 /*
 106  * Multiply two integers. Source integers are represented as arrays of
 107  * nine 30-bit words, for values up to 2^270-1. Result is encoded over
 108  * 18 words of 30 bits each.
 109  */
 110 static void
 111 mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
 112 {
 113         /*
 114          * Maximum intermediate result is no more than
 115          * 10376293531797946367, which fits in 64 bits. Reason:
 116          *
 117          *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
 118          *   10376293531797946367 < 9663676407 * 2^30
 119          *
 120          * Thus, adding together 9 products of 30-bit integers, with
 121          * a carry of at most 9663676406, yields an integer that fits
 122          * on 64 bits and generates a carry of at most 9663676406.
 123          */
 124         uint64_t t[17];
 125         uint64_t cc;
 126         int i;
 127
 128         t[ 0] = MUL31(a[0], b[0]);
 129         t[ 1] = MUL31(a[0], b[1])
 130                 + MUL31(a[1], b[0]);
 131         t[ 2] = MUL31(a[0], b[2])
 132                 + MUL31(a[1], b[1])
 133                 + MUL31(a[2], b[0]);
 134         t[ 3] = MUL31(a[0], b[3])
 135                 + MUL31(a[1], b[2])
 136                 + MUL31(a[2], b[1])
 137                 + MUL31(a[3], b[0]);
 138         t[ 4] = MUL31(a[0], b[4])
 139                 + MUL31(a[1], b[3])
 140                 + MUL31(a[2], b[2])
 141                 + MUL31(a[3], b[1])
 142                 + MUL31(a[4], b[0]);
 143         t[ 5] = MUL31(a[0], b[5])
 144                 + MUL31(a[1], b[4])
 145                 + MUL31(a[2], b[3])
 146                 + MUL31(a[3], b[2])
 147                 + MUL31(a[4], b[1])
 148                 + MUL31(a[5], b[0]);
 149         t[ 6] = MUL31(a[0], b[6])
 150                 + MUL31(a[1], b[5])
 151                 + MUL31(a[2], b[4])
 152                 + MUL31(a[3], b[3])
 153                 + MUL31(a[4], b[2])
 154                 + MUL31(a[5], b[1])
 155                 + MUL31(a[6], b[0]);
 156         t[ 7] = MUL31(a[0], b[7])
 157                 + MUL31(a[1], b[6])
 158                 + MUL31(a[2], b[5])
 159                 + MUL31(a[3], b[4])
 160                 + MUL31(a[4], b[3])
 161                 + MUL31(a[5], b[2])
 162                 + MUL31(a[6], b[1])
 163                 + MUL31(a[7], b[0]);
 164         t[ 8] = MUL31(a[0], b[8])
 165                 + MUL31(a[1], b[7])
 166                 + MUL31(a[2], b[6])
 167                 + MUL31(a[3], b[5])
 168                 + MUL31(a[4], b[4])
 169                 + MUL31(a[5], b[3])
 170                 + MUL31(a[6], b[2])
 171                 + MUL31(a[7], b[1])
 172                 + MUL31(a[8], b[0]);
 173         t[ 9] = MUL31(a[1], b[8])
 174                 + MUL31(a[2], b[7])
 175                 + MUL31(a[3], b[6])
 176                 + MUL31(a[4], b[5])
 177                 + MUL31(a[5], b[4])
 178                 + MUL31(a[6], b[3])
 179                 + MUL31(a[7], b[2])
 180                 + MUL31(a[8], b[1]);
 181         t[10] = MUL31(a[2], b[8])
 182                 + MUL31(a[3], b[7])
 183                 + MUL31(a[4], b[6])
 184                 + MUL31(a[5], b[5])
 185                 + MUL31(a[6], b[4])
 186                 + MUL31(a[7], b[3])
 187                 + MUL31(a[8], b[2]);
 188         t[11] = MUL31(a[3], b[8])
 189                 + MUL31(a[4], b[7])
 190                 + MUL31(a[5], b[6])
 191                 + MUL31(a[6], b[5])
 192                 + MUL31(a[7], b[4])
 193                 + MUL31(a[8], b[3]);
 194         t[12] = MUL31(a[4], b[8])
 195                 + MUL31(a[5], b[7])
 196                 + MUL31(a[6], b[6])
 197                 + MUL31(a[7], b[5])
 198                 + MUL31(a[8], b[4]);
 199         t[13] = MUL31(a[5], b[8])
 200                 + MUL31(a[6], b[7])
 201                 + MUL31(a[7], b[6])
 202                 + MUL31(a[8], b[5]);
 203         t[14] = MUL31(a[6], b[8])
 204                 + MUL31(a[7], b[7])
 205                 + MUL31(a[8], b[6]);
 206         t[15] = MUL31(a[7], b[8])
 207                 + MUL31(a[8], b[7]);
 208         t[16] = MUL31(a[8], b[8]);
 209
 210         /*
 211          * Propagate carries.
 212          */
 213         cc = 0;
 214         for (i = 0; i < 17; i ++) {
 215                 uint64_t w;
 216
 217                 w = t[i] + cc;
 218                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 219                 cc = w >> 30;
 220         }
 221         d[17] = (uint32_t)cc;
 222 }
 223
 224 /*
 225  * Square a 270-bit integer, represented as an array of nine 30-bit words.
 226  * Result uses 18 words of 30 bits each.
 227  */
 228 static void
 229 square9(uint32_t *d, const uint32_t *a)
 230 {
 231         uint64_t t[17];
 232         uint64_t cc;
 233         int i;
 234
 235         t[ 0] = MUL31(a[0], a[0]);
 236         t[ 1] = ((MUL31(a[0], a[1])) << 1);
 237         t[ 2] = MUL31(a[1], a[1])
 238                 + ((MUL31(a[0], a[2])) << 1);
 239         t[ 3] = ((MUL31(a[0], a[3])
 240                 + MUL31(a[1], a[2])) << 1);
 241         t[ 4] = MUL31(a[2], a[2])
 242                 + ((MUL31(a[0], a[4])
 243                 + MUL31(a[1], a[3])) << 1);
 244         t[ 5] = ((MUL31(a[0], a[5])
 245                 + MUL31(a[1], a[4])
 246                 + MUL31(a[2], a[3])) << 1);
 247         t[ 6] = MUL31(a[3], a[3])
 248                 + ((MUL31(a[0], a[6])
 249                 + MUL31(a[1], a[5])
 250                 + MUL31(a[2], a[4])) << 1);
 251         t[ 7] = ((MUL31(a[0], a[7])
 252                 + MUL31(a[1], a[6])
 253                 + MUL31(a[2], a[5])
 254                 + MUL31(a[3], a[4])) << 1);
 255         t[ 8] = MUL31(a[4], a[4])
 256                 + ((MUL31(a[0], a[8])
 257                 + MUL31(a[1], a[7])
 258                 + MUL31(a[2], a[6])
 259                 + MUL31(a[3], a[5])) << 1);
 260         t[ 9] = ((MUL31(a[1], a[8])
 261                 + MUL31(a[2], a[7])
 262                 + MUL31(a[3], a[6])
 263                 + MUL31(a[4], a[5])) << 1);
 264         t[10] = MUL31(a[5], a[5])
 265                 + ((MUL31(a[2], a[8])
 266                 + MUL31(a[3], a[7])
 267                 + MUL31(a[4], a[6])) << 1);
 268         t[11] = ((MUL31(a[3], a[8])
 269                 + MUL31(a[4], a[7])
 270                 + MUL31(a[5], a[6])) << 1);
 271         t[12] = MUL31(a[6], a[6])
 272                 + ((MUL31(a[4], a[8])
 273                 + MUL31(a[5], a[7])) << 1);
 274         t[13] = ((MUL31(a[5], a[8])
 275                 + MUL31(a[6], a[7])) << 1);
 276         t[14] = MUL31(a[7], a[7])
 277                 + ((MUL31(a[6], a[8])) << 1);
 278         t[15] = ((MUL31(a[7], a[8])) << 1);
 279         t[16] = MUL31(a[8], a[8]);
 280
 281         /*
 282          * Propagate carries.
 283          */
 284         cc = 0;
 285         for (i = 0; i < 17; i ++) {
 286                 uint64_t w;
 287
 288                 w = t[i] + cc;
 289                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 290                 cc = w >> 30;
 291         }
 292         d[17] = (uint32_t)cc;
 293 }
 294
 295 /*
 296  * Base field modulus for P-256.
 297  */
 298 static const uint32_t F256[] = {
 299
 300         0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
 301         0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
 302 };
 303
 304 /*
 305  * The 'b' curve equation coefficient for P-256.
 306  */
 307 static const uint32_t P256_B[] = {
 308
 309         0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
 310         0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
 311 };
 312
 313 /*
 314  * Addition in the field. Source operands shall fit on 257 bits; output
 315  * will be lower than twice the modulus.
 316  */
 317 static void
 318 add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 319 {
 320         uint32_t w, cc;
 321         int i;
 322
 323         cc = 0;
 324         for (i = 0; i < 9; i ++) {
 325                 w = a[i] + b[i] + cc;
 326                 d[i] = w & 0x3FFFFFFF;
 327                 cc = w >> 30;
 328         }
 329         w >>= 16;
 330         d[8] &= 0xFFFF;
 331         d[3] -= w << 6;
 332         d[6] -= w << 12;
 333         d[7] += w << 14;
 334         cc = w;
 335         for (i = 0; i < 9; i ++) {
 336                 w = d[i] + cc;
 337                 d[i] = w & 0x3FFFFFFF;
 338                 cc = ARSH(w, 30);
 339         }
 340 }
 341
 342 /*
 343  * Subtraction in the field. Source operands shall be smaller than twice
 344  * the modulus; the result will fulfil the same property.
 345  */
 346 static void
 347 sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 348 {
 349         uint32_t w, cc;
 350         int i;
 351
 352         /*
 353          * We really compute a - b + 2*p to make sure that the result is
 354          * positive.
 355          */
 356         w = a[0] - b[0] - 0x00002;
 357         d[0] = w & 0x3FFFFFFF;
 358         w = a[1] - b[1] + ARSH(w, 30);
 359         d[1] = w & 0x3FFFFFFF;
 360         w = a[2] - b[2] + ARSH(w, 30);
 361         d[2] = w & 0x3FFFFFFF;
 362         w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
 363         d[3] = w & 0x3FFFFFFF;
 364         w = a[4] - b[4] + ARSH(w, 30);
 365         d[4] = w & 0x3FFFFFFF;
 366         w = a[5] - b[5] + ARSH(w, 30);
 367         d[5] = w & 0x3FFFFFFF;
 368         w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
 369         d[6] = w & 0x3FFFFFFF;
 370         w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
 371         d[7] = w & 0x3FFFFFFF;
 372         w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
 373         d[8] = w & 0xFFFF;
 374         w >>= 16;
 375         d[8] &= 0xFFFF;
 376         d[3] -= w << 6;
 377         d[6] -= w << 12;
 378         d[7] += w << 14;
 379         cc = w;
 380         for (i = 0; i < 9; i ++) {
 381                 w = d[i] + cc;
 382                 d[i] = w & 0x3FFFFFFF;
 383                 cc = ARSH(w, 30);
 384         }
 385 }
 386
 387 /*
 388  * Compute a multiplication in F256. Source operands shall be less than
 389  * twice the modulus.
 390  */
 391 static void
 392 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 393 {
 394         uint32_t t[18];
 395         uint64_t s[18];
 396         uint64_t cc, x;
 397         uint32_t z;
 398         int i;
 399
 400         mul9(t, a, b);
 401
 402         /*
 403          * Modular reduction: each high word in added/subtracted where
 404          * necessary.
 405          *
 406          * The modulus is:
 407          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
 408          * Therefore:
 409          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
 410          *
 411          * For a word x at bit offset n (n >= 256), we have:
 412          *    x*2^n = x*2^(n-32) - x*2^(n-64)
 413          *            - x*2^(n - 160) + x*2^(n-256) mod p
 414          *
 415          * Thus, we can nullify the high word if we reinject it at some
 416          * proper emplacements.
 417          *
 418          * We use 64-bit intermediate words to allow for carries to
 419          * accumulate easily, before performing the final propagation.
 420          */
 421         for (i = 0; i < 18; i ++) {
 422                 s[i] = t[i];
 423         }
 424
 425         for (i = 17; i >= 9; i --) {
 426                 uint64_t x;
 427
 428                 x = s[i];
 429                 s[i - 1] += ARSHW(x, 2);
 430                 s[i - 2] += (x << 28) & 0x3FFFFFFF;
 431                 s[i - 2] -= ARSHW(x, 4);
 432                 s[i - 3] -= (x << 26) & 0x3FFFFFFF;
 433                 s[i - 5] -= ARSHW(x, 10);
 434                 s[i - 6] -= (x << 20) & 0x3FFFFFFF;
 435                 s[i - 8] += ARSHW(x, 16);
 436                 s[i - 9] += (x << 14) & 0x3FFFFFFF;
 437         }
 438
 439         /*
 440          * Carry propagation must be signed. Moreover, we may have overdone
 441          * it a bit, and obtain a negative result.
 442          *
 443          * The loop above ran 9 times; each time, each word was augmented
 444          * by at most one extra word (in absolute value). Thus, the top
 445          * word must in fine fit in 39 bits, so the carry below will fit
 446          * on 9 bits.
 447          */
 448         cc = 0;
 449         for (i = 0; i < 9; i ++) {
 450                 x = s[i] + cc;
 451                 d[i] = (uint32_t)x & 0x3FFFFFFF;
 452                 cc = ARSHW(x, 30);
 453         }
 454
 455         /*
 456          * All nine words fit on 30 bits, but there may be an extra
 457          * carry for a few bits (at most 9), and that carry may be
 458          * negative. Moreover, we want the result to fit on 257 bits.
 459          * The two lines below ensure that the word in d[] has length
 460          * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
 461          * significant length of cc is less than 24 bits, so we will be
 462          * able to switch to 32-bit operations.
 463          */
 464         cc = ARSHW(x, 16);
 465         d[8] &= 0xFFFF;
 466
 467         /*
 468          * Subtract cc*p.
 469          */
 470         z = (uint32_t)cc;
 471         d[3] -= z << 6;
 472         d[6] -= (z << 12) & 0x3FFFFFFF;
 473         d[7] -= ARSH(z, 18);
 474         d[7] += (z << 14) & 0x3FFFFFFF;
 475         d[8] += ARSH(z, 16);
 476         for (i = 0; i < 9; i ++) {
 477                 uint32_t w;
 478
 479                 w = d[i] + z;
 480                 d[i] = w & 0x3FFFFFFF;
 481                 z = ARSH(w, 30);
 482         }
 483 }
 484
 485 /*
 486  * Compute a square in F256. Source operand shall be less than
 487  * twice the modulus.
 488  */
 489 static void
 490 square_f256(uint32_t *d, const uint32_t *a)
 491 {
 492         uint32_t t[18];
 493         uint64_t s[18];
 494         uint64_t cc, x;
 495         uint32_t z;
 496         int i;
 497
 498         square9(t, a);
 499
 500         /*
 501          * Modular reduction: each high word in added/subtracted where
 502          * necessary.
 503          *
 504          * The modulus is:
 505          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
 506          * Therefore:
 507          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
 508          *
 509          * For a word x at bit offset n (n >= 256), we have:
 510          *    x*2^n = x*2^(n-32) - x*2^(n-64)
 511          *            - x*2^(n - 160) + x*2^(n-256) mod p
 512          *
 513          * Thus, we can nullify the high word if we reinject it at some
 514          * proper emplacements.
 515          *
 516          * We use 64-bit intermediate words to allow for carries to
 517          * accumulate easily, before performing the final propagation.
 518          */
 519         for (i = 0; i < 18; i ++) {
 520                 s[i] = t[i];
 521         }
 522
 523         for (i = 17; i >= 9; i --) {
 524                 uint64_t x;
 525
 526                 x = s[i];
 527                 s[i - 1] += ARSHW(x, 2);
 528                 s[i - 2] += (x << 28) & 0x3FFFFFFF;
 529                 s[i - 2] -= ARSHW(x, 4);
 530                 s[i - 3] -= (x << 26) & 0x3FFFFFFF;
 531                 s[i - 5] -= ARSHW(x, 10);
 532                 s[i - 6] -= (x << 20) & 0x3FFFFFFF;
 533                 s[i - 8] += ARSHW(x, 16);
 534                 s[i - 9] += (x << 14) & 0x3FFFFFFF;
 535         }
 536
 537         /*
 538          * Carry propagation must be signed. Moreover, we may have overdone
 539          * it a bit, and obtain a negative result.
 540          *
 541          * The loop above ran 9 times; each time, each word was augmented
 542          * by at most one extra word (in absolute value). Thus, the top
 543          * word must in fine fit in 39 bits, so the carry below will fit
 544          * on 9 bits.
 545          */
 546         cc = 0;
 547         for (i = 0; i < 9; i ++) {
 548                 x = s[i] + cc;
 549                 d[i] = (uint32_t)x & 0x3FFFFFFF;
 550                 cc = ARSHW(x, 30);
 551         }
 552
 553         /*
 554          * All nine words fit on 30 bits, but there may be an extra
 555          * carry for a few bits (at most 9), and that carry may be
 556          * negative. Moreover, we want the result to fit on 257 bits.
 557          * The two lines below ensure that the word in d[] has length
 558          * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
 559          * significant length of cc is less than 24 bits, so we will be
 560          * able to switch to 32-bit operations.
 561          */
 562         cc = ARSHW(x, 16);
 563         d[8] &= 0xFFFF;
 564
 565         /*
 566          * Subtract cc*p.
 567          */
 568         z = (uint32_t)cc;
 569         d[3] -= z << 6;
 570         d[6] -= (z << 12) & 0x3FFFFFFF;
 571         d[7] -= ARSH(z, 18);
 572         d[7] += (z << 14) & 0x3FFFFFFF;
 573         d[8] += ARSH(z, 16);
 574         for (i = 0; i < 9; i ++) {
 575                 uint32_t w;
 576
 577                 w = d[i] + z;
 578                 d[i] = w & 0x3FFFFFFF;
 579                 z = ARSH(w, 30);
 580         }
 581 }
 582
 583 /*
 584  * Perform a "final reduction" in field F256 (field for curve P-256).
 585  * The source value must be less than twice the modulus. If the value
 586  * is not lower than the modulus, then the modulus is subtracted and
 587  * this function returns 1; otherwise, it leaves it untouched and it
 588  * returns 0.
 589  */
 590 static uint32_t
 591 reduce_final_f256(uint32_t *d)
 592 {
 593         uint32_t t[9];
 594         uint32_t cc;
 595         int i;
 596
 597         cc = 0;
 598         for (i = 0; i < 9; i ++) {
 599                 uint32_t w;
 600
 601                 w = d[i] - F256[i] - cc;
 602                 cc = w >> 31;
 603                 t[i] = w & 0x3FFFFFFF;
 604         }
 605         cc ^= 1;
 606         CCOPY(cc, d, t, sizeof t);
 607         return cc;
 608 }
 609
 610 /*
 611  * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
 612  * are such that:
 613  *   X = x / z^2
 614  *   Y = y / z^3
 615  * For the point at infinity, z = 0.
 616  * Each point thus admits many possible representations.
 617  *
 618  * Coordinates are represented in arrays of 32-bit integers, each holding
 619  * 30 bits of data. Values may also be slightly greater than the modulus,
 620  * but they will always be lower than twice the modulus.
 621  */
 622 typedef struct {
 623         uint32_t x[9];
 624         uint32_t y[9];
 625         uint32_t z[9];
 626 } p256_jacobian;
 627
 628 /*
 629  * Convert a point to affine coordinates:
 630  *  - If the point is the point at infinity, then all three coordinates
 631  *    are set to 0.
 632  *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
 633  *    coordinates are the 'X' and 'Y' affine coordinates.
 634  * The coordinates are guaranteed to be lower than the modulus.
 635  */
 636 static void
 637 p256_to_affine(p256_jacobian *P)
 638 {
 639         uint32_t t1[9], t2[9];
 640         int i;
 641
 642         /*
 643          * Invert z with a modular exponentiation: the modulus is
 644          * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
 645          * p-2. Exponent bit pattern (from high to low) is:
 646          *  - 32 bits of value 1
 647          *  - 31 bits of value 0
 648          *  - 1 bit of value 1
 649          *  - 96 bits of value 0
 650          *  - 94 bits of value 1
 651          *  - 1 bit of value 0
 652          *  - 1 bit of value 1
 653          * Thus, we precompute z^(2^31-1) to speed things up.
 654          *
 655          * If z = 0 (point at infinity) then the modular exponentiation
 656          * will yield 0, which leads to the expected result (all three
 657          * coordinates set to 0).
 658          */
 659
 660         /*
 661          * A simple square-and-multiply for z^(2^31-1). We could save about
 662          * two dozen multiplications here with an addition chain, but
 663          * this would require a bit more code, and extra stack buffers.
 664          */
 665         memcpy(t1, P->z, sizeof P->z);
 666         for (i = 0; i < 30; i ++) {
 667                 square_f256(t1, t1);
 668                 mul_f256(t1, t1, P->z);
 669         }
 670
 671         /*
 672          * Square-and-multiply. Apart from the squarings, we have a few
 673          * multiplications to set bits to 1; we multiply by the original z
 674          * for setting 1 bit, and by t1 for setting 31 bits.
 675          */
 676         memcpy(t2, P->z, sizeof P->z);
 677         for (i = 1; i < 256; i ++) {
 678                 square_f256(t2, t2);
 679                 switch (i) {
 680                 case 31:
 681                 case 190:
 682                 case 221:
 683                 case 252:
 684                         mul_f256(t2, t2, t1);
 685                         break;
 686                 case 63:
 687                 case 253:
 688                 case 255:
 689                         mul_f256(t2, t2, P->z);
 690                         break;
 691                 }
 692         }
 693
 694         /*
 695          * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
 696          */
 697         mul_f256(t1, t2, t2);
 698         mul_f256(P->x, t1, P->x);
 699         mul_f256(t1, t1, t2);
 700         mul_f256(P->y, t1, P->y);
 701         reduce_final_f256(P->x);
 702         reduce_final_f256(P->y);
 703
 704         /*
 705          * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
 706          * this will set z to 1.
 707          */
 708         mul_f256(P->z, P->z, t2);
 709         reduce_final_f256(P->z);
 710 }
 711
 712 /*
 713  * Double a point in P-256. This function works for all valid points,
 714  * including the point at infinity.
 715  */
 716 static void
 717 p256_double(p256_jacobian *Q)
 718 {
 719         /*
 720          * Doubling formulas are:
 721          *
 722          *   s = 4*x*y^2
 723          *   m = 3*(x + z^2)*(x - z^2)
 724          *   x' = m^2 - 2*s
 725          *   y' = m*(s - x') - 8*y^4
 726          *   z' = 2*y*z
 727          *
 728          * These formulas work for all points, including points of order 2
 729          * and points at infinity:
 730          *   - If y = 0 then z' = 0. But there is no such point in P-256
 731          *     anyway.
 732          *   - If z = 0 then z' = 0.
 733          */
 734         uint32_t t1[9], t2[9], t3[9], t4[9];
 735
 736         /*
 737          * Compute z^2 in t1.
 738          */
 739         square_f256(t1, Q->z);
 740
 741         /*
 742          * Compute x-z^2 in t2 and x+z^2 in t1.
 743          */
 744         add_f256(t2, Q->x, t1);
 745         sub_f256(t1, Q->x, t1);
 746
 747         /*
 748          * Compute 3*(x+z^2)*(x-z^2) in t1.
 749          */
 750         mul_f256(t3, t1, t2);
 751         add_f256(t1, t3, t3);
 752         add_f256(t1, t3, t1);
 753
 754         /*
 755          * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
 756          */
 757         square_f256(t3, Q->y);
 758         add_f256(t3, t3, t3);
 759         mul_f256(t2, Q->x, t3);
 760         add_f256(t2, t2, t2);
 761
 762         /*
 763          * Compute x' = m^2 - 2*s.
 764          */
 765         square_f256(Q->x, t1);
 766         sub_f256(Q->x, Q->x, t2);
 767         sub_f256(Q->x, Q->x, t2);
 768
 769         /*
 770          * Compute z' = 2*y*z.
 771          */
 772         mul_f256(t4, Q->y, Q->z);
 773         add_f256(Q->z, t4, t4);
 774
 775         /*
 776          * Compute y' = m*(s - x') - 8*y^4. Note that we already have
 777          * 2*y^2 in t3.
 778          */
 779         sub_f256(t2, t2, Q->x);
 780         mul_f256(Q->y, t1, t2);
 781         square_f256(t4, t3);
 782         add_f256(t4, t4, t4);
 783         sub_f256(Q->y, Q->y, t4);
 784 }
 785
 786 /*
 787  * Add point P2 to point P1.
 788  *
 789  * This function computes the wrong result in the following cases:
 790  *
 791  *   - If P1 == 0 but P2 != 0
 792  *   - If P1 != 0 but P2 == 0
 793  *   - If P1 == P2
 794  *
 795  * In all three cases, P1 is set to the point at infinity.
 796  *
 797  * Returned value is 0 if one of the following occurs:
 798  *
 799  *   - P1 and P2 have the same Y coordinate
 800  *   - P1 == 0 and P2 == 0
 801  *   - The Y coordinate of one of the points is 0 and the other point is
 802  *     the point at infinity.
 803  *
 804  * The third case cannot actually happen with valid points, since a point
 805  * with Y == 0 is a point of order 2, and there is no point of order 2 on
 806  * curve P-256.
 807  *
 808  * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
 809  * can apply the following:
 810  *
 811  *   - If the result is not the point at infinity, then it is correct.
 812  *   - Otherwise, if the returned value is 1, then this is a case of
 813  *     P1+P2 == 0, so the result is indeed the point at infinity.
 814  *   - Otherwise, P1 == P2, so a "double" operation should have been
 815  *     performed.
 816  */
 817 static uint32_t
 818 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
 819 {
 820         /*
 821          * Addtions formulas are:
 822          *
 823          *   u1 = x1 * z2^2
 824          *   u2 = x2 * z1^2
 825          *   s1 = y1 * z2^3
 826          *   s2 = y2 * z1^3
 827          *   h = u2 - u1
 828          *   r = s2 - s1
 829          *   x3 = r^2 - h^3 - 2 * u1 * h^2
 830          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
 831          *   z3 = h * z1 * z2
 832          */
 833         uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
 834         uint32_t ret;
 835         int i;
 836
 837         /*
 838          * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
 839          */
 840         square_f256(t3, P2->z);
 841         mul_f256(t1, P1->x, t3);
 842         mul_f256(t4, P2->z, t3);
 843         mul_f256(t3, P1->y, t4);
 844
 845         /*
 846          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
 847          */
 848         square_f256(t4, P1->z);
 849         mul_f256(t2, P2->x, t4);
 850         mul_f256(t5, P1->z, t4);
 851         mul_f256(t4, P2->y, t5);
 852
 853         /*
 854          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
 855          * We need to test whether r is zero, so we will do some extra
 856          * reduce.
 857          */
 858         sub_f256(t2, t2, t1);
 859         sub_f256(t4, t4, t3);
 860         reduce_final_f256(t4);
 861         ret = 0;
 862         for (i = 0; i < 9; i ++) {
 863                 ret |= t4[i];
 864         }
 865         ret = (ret | -ret) >> 31;
 866
 867         /*
 868          * Compute u1*h^2 (in t6) and h^3 (in t5);
 869          */
 870         square_f256(t7, t2);
 871         mul_f256(t6, t1, t7);
 872         mul_f256(t5, t7, t2);
 873
 874         /*
 875          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
 876          */
 877         square_f256(P1->x, t4);
 878         sub_f256(P1->x, P1->x, t5);
 879         sub_f256(P1->x, P1->x, t6);
 880         sub_f256(P1->x, P1->x, t6);
 881
 882         /*
 883          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
 884          */
 885         sub_f256(t6, t6, P1->x);
 886         mul_f256(P1->y, t4, t6);
 887         mul_f256(t1, t5, t3);
 888         sub_f256(P1->y, P1->y, t1);
 889
 890         /*
 891          * Compute z3 = h*z1*z2.
 892          */
 893         mul_f256(t1, P1->z, P2->z);
 894         mul_f256(P1->z, t1, t2);
 895
 896         return ret;
 897 }
 898
 899 /*
 900  * Add point P2 to point P1. This is a specialised function for the
 901  * case when P2 is a non-zero point in affine coordinate.
 902  *
 903  * This function computes the wrong result in the following cases:
 904  *
 905  *   - If P1 == 0
 906  *   - If P1 == P2
 907  *
 908  * In both cases, P1 is set to the point at infinity.
 909  *
 910  * Returned value is 0 if one of the following occurs:
 911  *
 912  *   - P1 and P2 have the same Y coordinate
 913  *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
 914  *
 915  * The second case cannot actually happen with valid points, since a point
 916  * with Y == 0 is a point of order 2, and there is no point of order 2 on
 917  * curve P-256.
 918  *
 919  * Therefore, assuming that P1 != 0 on input, then the caller
 920  * can apply the following:
 921  *
 922  *   - If the result is not the point at infinity, then it is correct.
 923  *   - Otherwise, if the returned value is 1, then this is a case of
 924  *     P1+P2 == 0, so the result is indeed the point at infinity.
 925  *   - Otherwise, P1 == P2, so a "double" operation should have been
 926  *     performed.
 927  */
 928 static uint32_t
 929 p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
 930 {
 931         /*
 932          * Addtions formulas are:
 933          *
 934          *   u1 = x1
 935          *   u2 = x2 * z1^2
 936          *   s1 = y1
 937          *   s2 = y2 * z1^3
 938          *   h = u2 - u1
 939          *   r = s2 - s1
 940          *   x3 = r^2 - h^3 - 2 * u1 * h^2
 941          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
 942          *   z3 = h * z1
 943          */
 944         uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
 945         uint32_t ret;
 946         int i;
 947
 948         /*
 949          * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
 950          */
 951         memcpy(t1, P1->x, sizeof t1);
 952         memcpy(t3, P1->y, sizeof t3);
 953
 954         /*
 955          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
 956          */
 957         square_f256(t4, P1->z);
 958         mul_f256(t2, P2->x, t4);
 959         mul_f256(t5, P1->z, t4);
 960         mul_f256(t4, P2->y, t5);
 961
 962         /*
 963          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
 964          * We need to test whether r is zero, so we will do some extra
 965          * reduce.
 966          */
 967         sub_f256(t2, t2, t1);
 968         sub_f256(t4, t4, t3);
 969         reduce_final_f256(t4);
 970         ret = 0;
 971         for (i = 0; i < 9; i ++) {
 972                 ret |= t4[i];
 973         }
 974         ret = (ret | -ret) >> 31;
 975
 976         /*
 977          * Compute u1*h^2 (in t6) and h^3 (in t5);
 978          */
 979         square_f256(t7, t2);
 980         mul_f256(t6, t1, t7);
 981         mul_f256(t5, t7, t2);
 982
 983         /*
 984          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
 985          */
 986         square_f256(P1->x, t4);
 987         sub_f256(P1->x, P1->x, t5);
 988         sub_f256(P1->x, P1->x, t6);
 989         sub_f256(P1->x, P1->x, t6);
 990
 991         /*
 992          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
 993          */
 994         sub_f256(t6, t6, P1->x);
 995         mul_f256(P1->y, t4, t6);
 996         mul_f256(t1, t5, t3);
 997         sub_f256(P1->y, P1->y, t1);
 998
 999         /*
1000          * Compute z3 = h*z1*z2.
1001          */
1002         mul_f256(P1->z, P1->z, t2);
1003
1004         return ret;
1005 }
1006
1007 /*
1008  * Decode a P-256 point. This function does not support the point at
1009  * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1010  */
1011 static uint32_t
1012 p256_decode(p256_jacobian *P, const void *src, size_t len)
1013 {
1014         const unsigned char *buf;
1015         uint32_t tx[9], ty[9], t1[9], t2[9];
1016         uint32_t bad;
1017         int i;
1018
1019         if (len != 65) {
1020                 return 0;
1021         }
1022         buf = src;
1023
1024         /*
1025          * First byte must be 0x04 (uncompressed format). We could support
1026          * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1027          * least significant bit of the Y coordinate), but it is explicitly
1028          * forbidden by RFC 5480 (section 2.2).
1029          */
1030         bad = NEQ(buf[0], 0x04);
1031
1032         /*
1033          * Decode the coordinates, and check that they are both lower
1034          * than the modulus.
1035          */
1036         tx[8] = be8_to_le30(tx, buf + 1, 32);
1037         ty[8] = be8_to_le30(ty, buf + 33, 32);
1038         bad |= reduce_final_f256(tx);
1039         bad |= reduce_final_f256(ty);
1040
1041         /*
1042          * Check curve equation.
1043          */
1044         square_f256(t1, tx);
1045         mul_f256(t1, tx, t1);
1046         square_f256(t2, ty);
1047         sub_f256(t1, t1, tx);
1048         sub_f256(t1, t1, tx);
1049         sub_f256(t1, t1, tx);
1050         add_f256(t1, t1, P256_B);
1051         sub_f256(t1, t1, t2);
1052         reduce_final_f256(t1);
1053         for (i = 0; i < 9; i ++) {
1054                 bad |= t1[i];
1055         }
1056
1057         /*
1058          * Copy coordinates to the point structure.
1059          */
1060         memcpy(P->x, tx, sizeof tx);
1061         memcpy(P->y, ty, sizeof ty);
1062         memset(P->z, 0, sizeof P->z);
1063         P->z[0] = 1;
1064         return NEQ(bad, 0) ^ 1;
1065 }
1066
1067 /*
1068  * Encode a point into a buffer. This function assumes that the point is
1069  * valid, in affine coordinates, and not the point at infinity.
1070  */
1071 static void
1072 p256_encode(void *dst, const p256_jacobian *P)
1073 {
1074         unsigned char *buf;
1075
1076         buf = dst;
1077         buf[0] = 0x04;
1078         le30_to_be8(buf + 1, 32, P->x);
1079         le30_to_be8(buf + 33, 32, P->y);
1080 }
1081
1082 /*
1083  * Multiply a curve point by an integer. The integer is assumed to be
1084  * lower than the curve order, and the base point must not be the point
1085  * at infinity.
1086  */
1087 static void
1088 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1089 {
1090         /*
1091          * qz is a flag that is initially 1, and remains equal to 1
1092          * as long as the point is the point at infinity.
1093          *
1094          * We use a 2-bit window to handle multiplier bits by pairs.
1095          * The precomputed window really is the points P2 and P3.
1096          */
1097         uint32_t qz;
1098         p256_jacobian P2, P3, Q, T, U;
1099
1100         /*
1101          * Compute window values.
1102          */
1103         P2 = *P;
1104         p256_double(&P2);
1105         P3 = *P;
1106         p256_add(&P3, &P2);
1107
1108         /*
1109          * We start with Q = 0. We process multiplier bits 2 by 2.
1110          */
1111         memset(&Q, 0, sizeof Q);
1112         qz = 1;
1113         while (xlen -- > 0) {
1114                 int k;
1115
1116                 for (k = 6; k >= 0; k -= 2) {
1117                         uint32_t bits;
1118                         uint32_t bnz;
1119
1120                         p256_double(&Q);
1121                         p256_double(&Q);
1122                         T = *P;
1123                         U = Q;
1124                         bits = (*x >> k) & (uint32_t)3;
1125                         bnz = NEQ(bits, 0);
1126                         CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1127                         CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1128                         p256_add(&U, &T);
1129                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
1130                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1131                         qz &= ~bnz;
1132                 }
1133                 x ++;
1134         }
1135         *P = Q;
1136 }
1137
1138 /*
1139  * Precomputed window: k*G points, where G is the curve generator, and k
1140  * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1141  * the point are encoded as 9 words of 30 bits each (little-endian
1142  * order).
1143  */
1144 static const uint32_t Gwin[15][18] = {
1145
1146         { 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
1147           0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
1148           0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
1149           0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
1150           0x10B8BF86, 0x00004FE3 },
1151
1152         { 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
1153           0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
1154           0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
1155           0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
1156           0x154436E3, 0x00000777 },
1157
1158         { 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
1159           0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
1160           0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
1161           0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
1162           0x19031266, 0x00008734 },
1163
1164         { 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
1165           0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
1166           0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
1167           0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
1168           0x15D69318, 0x0000E0F1 },
1169
1170         { 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
1171           0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
1172           0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
1173           0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
1174           0x1F6A2412, 0x0000E0C1 },
1175
1176         { 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
1177           0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
1178           0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
1179           0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
1180           0x041D0C8D, 0x0000E85C },
1181
1182         { 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
1183           0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
1184           0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
1185           0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
1186           0x076F780C, 0x000073EB },
1187
1188         { 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
1189           0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
1190           0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
1191           0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
1192           0x332F647A, 0x0000AD5A },
1193
1194         { 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
1195           0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
1196           0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
1197           0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
1198           0x11325CB2, 0x00002A27 },
1199
1200         { 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
1201           0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
1202           0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
1203           0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
1204           0x18A88A6A, 0x00008786 },
1205
1206         { 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
1207           0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
1208           0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
1209           0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
1210           0x0826B331, 0x00009099 },
1211
1212         { 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
1213           0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
1214           0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
1215           0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
1216           0x2D1AA70E, 0x00000770 },
1217
1218         { 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
1219           0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
1220           0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
1221           0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
1222           0x163353AF, 0x000063BB },
1223
1224         { 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
1225           0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
1226           0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
1227           0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
1228           0x3C6ECA7D, 0x0000F599 },
1229
1230         { 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
1231           0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
1232           0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
1233           0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
1234           0x0FB8D64B, 0x0000B5B9 }
1235 };
1236
1237 /*
1238  * Lookup one of the Gwin[] values, by index. This is constant-time.
1239  */
1240 static void
1241 lookup_Gwin(p256_jacobian *T, uint32_t idx)
1242 {
1243         uint32_t xy[18];
1244         uint32_t k;
1245         size_t u;
1246
1247         memset(xy, 0, sizeof xy);
1248         for (k = 0; k < 15; k ++) {
1249                 uint32_t m;
1250
1251                 m = -EQ(idx, k + 1);
1252                 for (u = 0; u < 18; u ++) {
1253                         xy[u] |= m & Gwin[k][u];
1254                 }
1255         }
1256         memcpy(T->x, &xy[0], sizeof T->x);
1257         memcpy(T->y, &xy[9], sizeof T->y);
1258         memset(T->z, 0, sizeof T->z);
1259         T->z[0] = 1;
1260 }
1261
1262 /*
1263  * Multiply the generator by an integer. The integer is assumed non-zero
1264  * and lower than the curve order.
1265  */
1266 static void
1267 p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1268 {
1269         /*
1270          * qz is a flag that is initially 1, and remains equal to 1
1271          * as long as the point is the point at infinity.
1272          *
1273          * We use a 4-bit window to handle multiplier bits by groups
1274          * of 4. The precomputed window is constant static data, with
1275          * points in affine coordinates; we use a constant-time lookup.
1276          */
1277         p256_jacobian Q;
1278         uint32_t qz;
1279
1280         memset(&Q, 0, sizeof Q);
1281         qz = 1;
1282         while (xlen -- > 0) {
1283                 int k;
1284                 unsigned bx;
1285
1286                 bx = *x ++;
1287                 for (k = 0; k < 2; k ++) {
1288                         uint32_t bits;
1289                         uint32_t bnz;
1290                         p256_jacobian T, U;
1291
1292                         p256_double(&Q);
1293                         p256_double(&Q);
1294                         p256_double(&Q);
1295                         p256_double(&Q);
1296                         bits = (bx >> 4) & 0x0F;
1297                         bnz = NEQ(bits, 0);
1298                         lookup_Gwin(&T, bits);
1299                         U = Q;
1300                         p256_add_mixed(&U, &T);
1301                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
1302                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1303                         qz &= ~bnz;
1304                         bx <<= 4;
1305                 }
1306         }
1307         *P = Q;
1308 }
1309
1310 static const unsigned char P256_G[] = {
1311         0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1312         0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1313         0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1314         0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1315         0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1316         0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1317         0x68, 0x37, 0xBF, 0x51, 0xF5
1318 };
1319
1320 static const unsigned char P256_N[] = {
1321         0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1322         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1323         0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1324         0x25, 0x51
1325 };
1326
1327 static const unsigned char *
1328 api_generator(int curve, size_t *len)
1329 {
1330         (void)curve;
1331         *len = sizeof P256_G;
1332         return P256_G;
1333 }
1334
1335 static const unsigned char *
1336 api_order(int curve, size_t *len)
1337 {
1338         (void)curve;
1339         *len = sizeof P256_N;
1340         return P256_N;
1341 }
1342
1343 static size_t
1344 api_xoff(int curve, size_t *len)
1345 {
1346         (void)curve;
1347         *len = 32;
1348         return 1;
1349 }
1350
1351 static uint32_t
1352 api_mul(unsigned char *G, size_t Glen,
1353         const unsigned char *x, size_t xlen, int curve)
1354 {
1355         uint32_t r;
1356         p256_jacobian P;
1357
1358         (void)curve;
1359         r = p256_decode(&P, G, Glen);
1360         p256_mul(&P, x, xlen);
1361         if (Glen >= 65) {
1362                 p256_to_affine(&P);
1363                 p256_encode(G, &P);
1364         }
1365         return r;
1366 }
1367
1368 static size_t
1369 api_mulgen(unsigned char *R,
1370         const unsigned char *x, size_t xlen, int curve)
1371 {
1372         p256_jacobian P;
1373
1374         (void)curve;
1375         p256_mulgen(&P, x, xlen);
1376         p256_to_affine(&P);
1377         p256_encode(R, &P);
1378         return 65;
1379
1380         /*
1381         const unsigned char *G;
1382         size_t Glen;
1383
1384         G = api_generator(curve, &Glen);
1385         memcpy(R, G, Glen);
1386         api_mul(R, Glen, x, xlen, curve);
1387         return Glen;
1388         */
1389 }
1390
1391 static uint32_t
1392 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1393         const unsigned char *x, size_t xlen,
1394         const unsigned char *y, size_t ylen, int curve)
1395 {
1396         p256_jacobian P, Q;
1397         uint32_t r, t, z;
1398         int i;
1399
1400         (void)curve;
1401         r = p256_decode(&P, A, len);
1402         p256_mul(&P, x, xlen);
1403         if (B == NULL) {
1404                 p256_mulgen(&Q, y, ylen);
1405         } else {
1406                 r &= p256_decode(&Q, B, len);
1407                 p256_mul(&Q, y, ylen);
1408         }
1409
1410         /*
1411          * The final addition may fail in case both points are equal.
1412          */
1413         t = p256_add(&P, &Q);
1414         reduce_final_f256(P.z);
1415         z = 0;
1416         for (i = 0; i < 9; i ++) {
1417                 z |= P.z[i];
1418         }
1419         z = EQ(z, 0);
1420         p256_double(&Q);
1421
1422         /*
1423          * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1424          * have the following:
1425          *
1426          *   z = 0, t = 0   return P (normal addition)
1427          *   z = 0, t = 1   return P (normal addition)
1428          *   z = 1, t = 0   return Q (a 'double' case)
1429          *   z = 1, t = 1   report an error (P+Q = 0)
1430          */
1431         CCOPY(z & ~t, &P, &Q, sizeof Q);
1432         p256_to_affine(&P);
1433         p256_encode(A, &P);
1434         r &= ~(z & t);
1435         return r;
1436 }
1437
1438 /* see bearssl_ec.h */
1439 const br_ec_impl br_ec_p256_m31 = {
1440         (uint32_t)0x00800000,
1441         &api_generator,
1442         &api_order,
1443         &api_xoff,
1444         &api_mul,
1445         &api_mulgen,
1446         &api_muladd
1447 };