_ Git - BearSSL/blob - src/ec/ec_p256_m31.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
  29  * that right-shifting a signed negative integer copies the sign bit
  30  * (arithmetic right-shift). This is "implementation-defined behaviour",
  31  * i.e. it is not undefined, but it may differ between compilers. Each
  32  * compiler is supposed to document its behaviour in that respect. GCC
  33  * explicitly defines that an arithmetic right shift is used. We expect
  34  * all other compilers to do the same, because underlying CPU offer an
  35  * arithmetic right shift opcode that could not be used otherwise.
  36  */
  37 #if BR_NO_ARITH_SHIFT
  38 #define ARSH(x, n)    (((uint32_t)(x) >> (n)) \
  39                       | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
  40 #define ARSHW(x, n)   (((uint64_t)(x) >> (n)) \
  41                       | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
  42 #else
  43 #define ARSH(x, n)    ((*(int32_t *)&(x)) >> (n))
  44 #define ARSHW(x, n)   ((*(int64_t *)&(x)) >> (n))
  45 #endif
  46
  47 /*
  48  * Convert an integer from unsigned big-endian encoding to a sequence of
  49  * 30-bit words in little-endian order. The final "partial" word is
  50  * returned.
  51  */
  52 static uint32_t
  53 be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
  54 {
  55         uint32_t acc;
  56         int acc_len;
  57
  58         acc = 0;
  59         acc_len = 0;
  60         while (len -- > 0) {
  61                 uint32_t b;
  62
  63                 b = src[len];
  64                 if (acc_len < 22) {
  65                         acc |= b << acc_len;
  66                         acc_len += 8;
  67                 } else {
  68                         *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
  69                         acc = b >> (30 - acc_len);
  70                         acc_len -= 22;
  71                 }
  72         }
  73         return acc;
  74 }
  75
  76 /*
  77  * Convert an integer (30-bit words, little-endian) to unsigned
  78  * big-endian encoding. The total encoding length is provided; all
  79  * the destination bytes will be filled.
  80  */
  81 static void
  82 le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
  83 {
  84         uint32_t acc;
  85         int acc_len;
  86
  87         acc = 0;
  88         acc_len = 0;
  89         while (len -- > 0) {
  90                 if (acc_len < 8) {
  91                         uint32_t w;
  92
  93                         w = *src ++;
  94                         dst[len] = (unsigned char)(acc | (w << acc_len));
  95                         acc = w >> (8 - acc_len);
  96                         acc_len += 22;
  97                 } else {
  98                         dst[len] = (unsigned char)acc;
  99                         acc >>= 8;
 100                         acc_len -= 8;
 101                 }
 102         }
 103 }
 104
 105 /*
 106  * Multiply two integers. Source integers are represented as arrays of
 107  * nine 30-bit words, for values up to 2^270-1. Result is encoded over
 108  * 18 words of 30 bits each.
 109  */
 110 static void
 111 mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
 112 {
 113         /*
 114          * Maximum intermediate result is no more than
 115          * 10376293531797946367, which fits in 64 bits. Reason:
 116          *
 117          *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
 118          *   10376293531797946367 < 9663676407 * 2^30
 119          *
 120          * Thus, adding together 9 products of 30-bit integers, with
 121          * a carry of at most 9663676406, yields an integer that fits
 122          * on 64 bits and generates a carry of at most 9663676406.
 123          */
 124         uint64_t t[17];
 125         uint64_t cc;
 126         int i;
 127
 128         t[ 0] = MUL31(a[0], b[0]);
 129         t[ 1] = MUL31(a[0], b[1])
 130                 + MUL31(a[1], b[0]);
 131         t[ 2] = MUL31(a[0], b[2])
 132                 + MUL31(a[1], b[1])
 133                 + MUL31(a[2], b[0]);
 134         t[ 3] = MUL31(a[0], b[3])
 135                 + MUL31(a[1], b[2])
 136                 + MUL31(a[2], b[1])
 137                 + MUL31(a[3], b[0]);
 138         t[ 4] = MUL31(a[0], b[4])
 139                 + MUL31(a[1], b[3])
 140                 + MUL31(a[2], b[2])
 141                 + MUL31(a[3], b[1])
 142                 + MUL31(a[4], b[0]);
 143         t[ 5] = MUL31(a[0], b[5])
 144                 + MUL31(a[1], b[4])
 145                 + MUL31(a[2], b[3])
 146                 + MUL31(a[3], b[2])
 147                 + MUL31(a[4], b[1])
 148                 + MUL31(a[5], b[0]);
 149         t[ 6] = MUL31(a[0], b[6])
 150                 + MUL31(a[1], b[5])
 151                 + MUL31(a[2], b[4])
 152                 + MUL31(a[3], b[3])
 153                 + MUL31(a[4], b[2])
 154                 + MUL31(a[5], b[1])
 155                 + MUL31(a[6], b[0]);
 156         t[ 7] = MUL31(a[0], b[7])
 157                 + MUL31(a[1], b[6])
 158                 + MUL31(a[2], b[5])
 159                 + MUL31(a[3], b[4])
 160                 + MUL31(a[4], b[3])
 161                 + MUL31(a[5], b[2])
 162                 + MUL31(a[6], b[1])
 163                 + MUL31(a[7], b[0]);
 164         t[ 8] = MUL31(a[0], b[8])
 165                 + MUL31(a[1], b[7])
 166                 + MUL31(a[2], b[6])
 167                 + MUL31(a[3], b[5])
 168                 + MUL31(a[4], b[4])
 169                 + MUL31(a[5], b[3])
 170                 + MUL31(a[6], b[2])
 171                 + MUL31(a[7], b[1])
 172                 + MUL31(a[8], b[0]);
 173         t[ 9] = MUL31(a[1], b[8])
 174                 + MUL31(a[2], b[7])
 175                 + MUL31(a[3], b[6])
 176                 + MUL31(a[4], b[5])
 177                 + MUL31(a[5], b[4])
 178                 + MUL31(a[6], b[3])
 179                 + MUL31(a[7], b[2])
 180                 + MUL31(a[8], b[1]);
 181         t[10] = MUL31(a[2], b[8])
 182                 + MUL31(a[3], b[7])
 183                 + MUL31(a[4], b[6])
 184                 + MUL31(a[5], b[5])
 185                 + MUL31(a[6], b[4])
 186                 + MUL31(a[7], b[3])
 187                 + MUL31(a[8], b[2]);
 188         t[11] = MUL31(a[3], b[8])
 189                 + MUL31(a[4], b[7])
 190                 + MUL31(a[5], b[6])
 191                 + MUL31(a[6], b[5])
 192                 + MUL31(a[7], b[4])
 193                 + MUL31(a[8], b[3]);
 194         t[12] = MUL31(a[4], b[8])
 195                 + MUL31(a[5], b[7])
 196                 + MUL31(a[6], b[6])
 197                 + MUL31(a[7], b[5])
 198                 + MUL31(a[8], b[4]);
 199         t[13] = MUL31(a[5], b[8])
 200                 + MUL31(a[6], b[7])
 201                 + MUL31(a[7], b[6])
 202                 + MUL31(a[8], b[5]);
 203         t[14] = MUL31(a[6], b[8])
 204                 + MUL31(a[7], b[7])
 205                 + MUL31(a[8], b[6]);
 206         t[15] = MUL31(a[7], b[8])
 207                 + MUL31(a[8], b[7]);
 208         t[16] = MUL31(a[8], b[8]);
 209
 210         /*
 211          * Propagate carries.
 212          */
 213         cc = 0;
 214         for (i = 0; i < 17; i ++) {
 215                 uint64_t w;
 216
 217                 w = t[i] + cc;
 218                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 219                 cc = w >> 30;
 220         }
 221         d[17] = (uint32_t)cc;
 222 }
 223
 224 /*
 225  * Square a 270-bit integer, represented as an array of nine 30-bit words.
 226  * Result uses 18 words of 30 bits each.
 227  */
 228 static void
 229 square9(uint32_t *d, const uint32_t *a)
 230 {
 231         uint64_t t[17];
 232         uint64_t cc;
 233         int i;
 234
 235         t[ 0] = MUL31(a[0], a[0]);
 236         t[ 1] = ((MUL31(a[0], a[1])) << 1);
 237         t[ 2] = MUL31(a[1], a[1])
 238                 + ((MUL31(a[0], a[2])) << 1);
 239         t[ 3] = ((MUL31(a[0], a[3])
 240                 + MUL31(a[1], a[2])) << 1);
 241         t[ 4] = MUL31(a[2], a[2])
 242                 + ((MUL31(a[0], a[4])
 243                 + MUL31(a[1], a[3])) << 1);
 244         t[ 5] = ((MUL31(a[0], a[5])
 245                 + MUL31(a[1], a[4])
 246                 + MUL31(a[2], a[3])) << 1);
 247         t[ 6] = MUL31(a[3], a[3])
 248                 + ((MUL31(a[0], a[6])
 249                 + MUL31(a[1], a[5])
 250                 + MUL31(a[2], a[4])) << 1);
 251         t[ 7] = ((MUL31(a[0], a[7])
 252                 + MUL31(a[1], a[6])
 253                 + MUL31(a[2], a[5])
 254                 + MUL31(a[3], a[4])) << 1);
 255         t[ 8] = MUL31(a[4], a[4])
 256                 + ((MUL31(a[0], a[8])
 257                 + MUL31(a[1], a[7])
 258                 + MUL31(a[2], a[6])
 259                 + MUL31(a[3], a[5])) << 1);
 260         t[ 9] = ((MUL31(a[1], a[8])
 261                 + MUL31(a[2], a[7])
 262                 + MUL31(a[3], a[6])
 263                 + MUL31(a[4], a[5])) << 1);
 264         t[10] = MUL31(a[5], a[5])
 265                 + ((MUL31(a[2], a[8])
 266                 + MUL31(a[3], a[7])
 267                 + MUL31(a[4], a[6])) << 1);
 268         t[11] = ((MUL31(a[3], a[8])
 269                 + MUL31(a[4], a[7])
 270                 + MUL31(a[5], a[6])) << 1);
 271         t[12] = MUL31(a[6], a[6])
 272                 + ((MUL31(a[4], a[8])
 273                 + MUL31(a[5], a[7])) << 1);
 274         t[13] = ((MUL31(a[5], a[8])
 275                 + MUL31(a[6], a[7])) << 1);
 276         t[14] = MUL31(a[7], a[7])
 277                 + ((MUL31(a[6], a[8])) << 1);
 278         t[15] = ((MUL31(a[7], a[8])) << 1);
 279         t[16] = MUL31(a[8], a[8]);
 280
 281         /*
 282          * Propagate carries.
 283          */
 284         cc = 0;
 285         for (i = 0; i < 17; i ++) {
 286                 uint64_t w;
 287
 288                 w = t[i] + cc;
 289                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 290                 cc = w >> 30;
 291         }
 292         d[17] = (uint32_t)cc;
 293 }
 294
 295 /*
 296  * Base field modulus for P-256.
 297  */
 298 static const uint32_t F256[] = {
 299
 300         0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
 301         0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
 302 };
 303
 304 /*
 305  * The 'b' curve equation coefficient for P-256.
 306  */
 307 static const uint32_t P256_B[] = {
 308
 309         0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
 310         0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
 311 };
 312
 313 /*
 314  * Addition in the field. Source operands shall fit on 257 bits; output
 315  * will be lower than twice the modulus.
 316  */
 317 static void
 318 add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 319 {
 320         uint32_t w, cc;
 321         int i;
 322
 323         cc = 0;
 324         for (i = 0; i < 9; i ++) {
 325                 w = a[i] + b[i] + cc;
 326                 d[i] = w & 0x3FFFFFFF;
 327                 cc = w >> 30;
 328         }
 329         w >>= 16;
 330         d[8] &= 0xFFFF;
 331         d[3] -= w << 6;
 332         d[6] -= w << 12;
 333         d[7] += w << 14;
 334         cc = w;
 335         for (i = 0; i < 9; i ++) {
 336                 w = d[i] + cc;
 337                 d[i] = w & 0x3FFFFFFF;
 338                 cc = ARSH(w, 30);
 339         }
 340 }
 341
 342 /*
 343  * Subtraction in the field. Source operands shall be smaller than twice
 344  * the modulus; the result will fulfil the same property.
 345  */
 346 static void
 347 sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 348 {
 349         uint32_t w, cc;
 350         int i;
 351
 352         /*
 353          * We really compute a - b + 2*p to make sure that the result is
 354          * positive.
 355          */
 356         w = a[0] - b[0] - 0x00002;
 357         d[0] = w & 0x3FFFFFFF;
 358         w = a[1] - b[1] + ARSH(w, 30);
 359         d[1] = w & 0x3FFFFFFF;
 360         w = a[2] - b[2] + ARSH(w, 30);
 361         d[2] = w & 0x3FFFFFFF;
 362         w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
 363         d[3] = w & 0x3FFFFFFF;
 364         w = a[4] - b[4] + ARSH(w, 30);
 365         d[4] = w & 0x3FFFFFFF;
 366         w = a[5] - b[5] + ARSH(w, 30);
 367         d[5] = w & 0x3FFFFFFF;
 368         w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
 369         d[6] = w & 0x3FFFFFFF;
 370         w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
 371         d[7] = w & 0x3FFFFFFF;
 372         w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
 373         d[8] = w & 0xFFFF;
 374         w >>= 16;
 375         d[8] &= 0xFFFF;
 376         d[3] -= w << 6;
 377         d[6] -= w << 12;
 378         d[7] += w << 14;
 379         cc = w;
 380         for (i = 0; i < 9; i ++) {
 381                 w = d[i] + cc;
 382                 d[i] = w & 0x3FFFFFFF;
 383                 cc = ARSH(w, 30);
 384         }
 385 }
 386
 387 /*
 388  * Compute a multiplication in F256. Source operands shall be less than
 389  * twice the modulus.
 390  */
 391 static void
 392 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 393 {
 394         uint32_t t[18];
 395         uint64_t s[18];
 396         uint64_t cc, x;
 397         uint32_t z, c;
 398         int i;
 399
 400         mul9(t, a, b);
 401
 402         /*
 403          * Modular reduction: each high word in added/subtracted where
 404          * necessary.
 405          *
 406          * The modulus is:
 407          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
 408          * Therefore:
 409          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
 410          *
 411          * For a word x at bit offset n (n >= 256), we have:
 412          *    x*2^n = x*2^(n-32) - x*2^(n-64)
 413          *            - x*2^(n - 160) + x*2^(n-256) mod p
 414          *
 415          * Thus, we can nullify the high word if we reinject it at some
 416          * proper emplacements.
 417          *
 418          * We use 64-bit intermediate words to allow for carries to
 419          * accumulate easily, before performing the final propagation.
 420          */
 421         for (i = 0; i < 18; i ++) {
 422                 s[i] = t[i];
 423         }
 424
 425         for (i = 17; i >= 9; i --) {
 426                 uint64_t y;
 427
 428                 y = s[i];
 429                 s[i - 1] += ARSHW(y, 2);
 430                 s[i - 2] += (y << 28) & 0x3FFFFFFF;
 431                 s[i - 2] -= ARSHW(y, 4);
 432                 s[i - 3] -= (y << 26) & 0x3FFFFFFF;
 433                 s[i - 5] -= ARSHW(y, 10);
 434                 s[i - 6] -= (y << 20) & 0x3FFFFFFF;
 435                 s[i - 8] += ARSHW(y, 16);
 436                 s[i - 9] += (y << 14) & 0x3FFFFFFF;
 437         }
 438
 439         /*
 440          * Carry propagation must be signed. Moreover, we may have overdone
 441          * it a bit, and obtain a negative result.
 442          *
 443          * The loop above ran 9 times; each time, each word was augmented
 444          * by at most one extra word (in absolute value). Thus, the top
 445          * word must in fine fit in 39 bits, so the carry below will fit
 446          * on 9 bits.
 447          */
 448         cc = 0;
 449         for (i = 0; i < 9; i ++) {
 450                 x = s[i] + cc;
 451                 d[i] = (uint32_t)x & 0x3FFFFFFF;
 452                 cc = ARSHW(x, 30);
 453         }
 454
 455         /*
 456          * All nine words fit on 30 bits, but there may be an extra
 457          * carry for a few bits (at most 9), and that carry may be
 458          * negative. Moreover, we want the result to fit on 257 bits.
 459          * The two lines below ensure that the word in d[] has length
 460          * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
 461          * significant length of cc is less than 24 bits, so we will be
 462          * able to switch to 32-bit operations.
 463          */
 464         cc = ARSHW(x, 16);
 465         d[8] &= 0xFFFF;
 466
 467         /*
 468          * One extra round of reduction, for cc*2^256, which means
 469          * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
 470          * value. If cc is negative, then it may happen (rarely, but
 471          * not neglectibly so) that the result would be negative. In
 472          * order to avoid that, if cc is negative, then we add the
 473          * modulus once. Note that if cc is negative, then propagating
 474          * that carry must yield a value lower than the modulus, so
 475          * adding the modulus once will keep the final result under
 476          * twice the modulus.
 477          */
 478         z = (uint32_t)cc;
 479         d[3] -= z << 6;
 480         d[6] -= (z << 12) & 0x3FFFFFFF;
 481         d[7] -= ARSH(z, 18);
 482         d[7] += (z << 14) & 0x3FFFFFFF;
 483         d[8] += ARSH(z, 16);
 484         c = z >> 31;
 485         d[0] -= c;
 486         d[3] += c << 6;
 487         d[6] += c << 12;
 488         d[7] -= c << 14;
 489         d[8] += c << 16;
 490         for (i = 0; i < 9; i ++) {
 491                 uint32_t w;
 492
 493                 w = d[i] + z;
 494                 d[i] = w & 0x3FFFFFFF;
 495                 z = ARSH(w, 30);
 496         }
 497 }
 498
 499 /*
 500  * Compute a square in F256. Source operand shall be less than
 501  * twice the modulus.
 502  */
 503 static void
 504 square_f256(uint32_t *d, const uint32_t *a)
 505 {
 506         uint32_t t[18];
 507         uint64_t s[18];
 508         uint64_t cc, x;
 509         uint32_t z, c;
 510         int i;
 511
 512         square9(t, a);
 513
 514         /*
 515          * Modular reduction: each high word in added/subtracted where
 516          * necessary.
 517          *
 518          * The modulus is:
 519          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
 520          * Therefore:
 521          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
 522          *
 523          * For a word x at bit offset n (n >= 256), we have:
 524          *    x*2^n = x*2^(n-32) - x*2^(n-64)
 525          *            - x*2^(n - 160) + x*2^(n-256) mod p
 526          *
 527          * Thus, we can nullify the high word if we reinject it at some
 528          * proper emplacements.
 529          *
 530          * We use 64-bit intermediate words to allow for carries to
 531          * accumulate easily, before performing the final propagation.
 532          */
 533         for (i = 0; i < 18; i ++) {
 534                 s[i] = t[i];
 535         }
 536
 537         for (i = 17; i >= 9; i --) {
 538                 uint64_t y;
 539
 540                 y = s[i];
 541                 s[i - 1] += ARSHW(y, 2);
 542                 s[i - 2] += (y << 28) & 0x3FFFFFFF;
 543                 s[i - 2] -= ARSHW(y, 4);
 544                 s[i - 3] -= (y << 26) & 0x3FFFFFFF;
 545                 s[i - 5] -= ARSHW(y, 10);
 546                 s[i - 6] -= (y << 20) & 0x3FFFFFFF;
 547                 s[i - 8] += ARSHW(y, 16);
 548                 s[i - 9] += (y << 14) & 0x3FFFFFFF;
 549         }
 550
 551         /*
 552          * Carry propagation must be signed. Moreover, we may have overdone
 553          * it a bit, and obtain a negative result.
 554          *
 555          * The loop above ran 9 times; each time, each word was augmented
 556          * by at most one extra word (in absolute value). Thus, the top
 557          * word must in fine fit in 39 bits, so the carry below will fit
 558          * on 9 bits.
 559          */
 560         cc = 0;
 561         for (i = 0; i < 9; i ++) {
 562                 x = s[i] + cc;
 563                 d[i] = (uint32_t)x & 0x3FFFFFFF;
 564                 cc = ARSHW(x, 30);
 565         }
 566
 567         /*
 568          * All nine words fit on 30 bits, but there may be an extra
 569          * carry for a few bits (at most 9), and that carry may be
 570          * negative. Moreover, we want the result to fit on 257 bits.
 571          * The two lines below ensure that the word in d[] has length
 572          * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
 573          * significant length of cc is less than 24 bits, so we will be
 574          * able to switch to 32-bit operations.
 575          */
 576         cc = ARSHW(x, 16);
 577         d[8] &= 0xFFFF;
 578
 579         /*
 580          * One extra round of reduction, for cc*2^256, which means
 581          * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
 582          * value. If cc is negative, then it may happen (rarely, but
 583          * not neglectibly so) that the result would be negative. In
 584          * order to avoid that, if cc is negative, then we add the
 585          * modulus once. Note that if cc is negative, then propagating
 586          * that carry must yield a value lower than the modulus, so
 587          * adding the modulus once will keep the final result under
 588          * twice the modulus.
 589          */
 590         z = (uint32_t)cc;
 591         d[3] -= z << 6;
 592         d[6] -= (z << 12) & 0x3FFFFFFF;
 593         d[7] -= ARSH(z, 18);
 594         d[7] += (z << 14) & 0x3FFFFFFF;
 595         d[8] += ARSH(z, 16);
 596         c = z >> 31;
 597         d[0] -= c;
 598         d[3] += c << 6;
 599         d[6] += c << 12;
 600         d[7] -= c << 14;
 601         d[8] += c << 16;
 602         for (i = 0; i < 9; i ++) {
 603                 uint32_t w;
 604
 605                 w = d[i] + z;
 606                 d[i] = w & 0x3FFFFFFF;
 607                 z = ARSH(w, 30);
 608         }
 609 }
 610
 611 /*
 612  * Perform a "final reduction" in field F256 (field for curve P-256).
 613  * The source value must be less than twice the modulus. If the value
 614  * is not lower than the modulus, then the modulus is subtracted and
 615  * this function returns 1; otherwise, it leaves it untouched and it
 616  * returns 0.
 617  */
 618 static uint32_t
 619 reduce_final_f256(uint32_t *d)
 620 {
 621         uint32_t t[9];
 622         uint32_t cc;
 623         int i;
 624
 625         cc = 0;
 626         for (i = 0; i < 9; i ++) {
 627                 uint32_t w;
 628
 629                 w = d[i] - F256[i] - cc;
 630                 cc = w >> 31;
 631                 t[i] = w & 0x3FFFFFFF;
 632         }
 633         cc ^= 1;
 634         CCOPY(cc, d, t, sizeof t);
 635         return cc;
 636 }
 637
 638 /*
 639  * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
 640  * are such that:
 641  *   X = x / z^2
 642  *   Y = y / z^3
 643  * For the point at infinity, z = 0.
 644  * Each point thus admits many possible representations.
 645  *
 646  * Coordinates are represented in arrays of 32-bit integers, each holding
 647  * 30 bits of data. Values may also be slightly greater than the modulus,
 648  * but they will always be lower than twice the modulus.
 649  */
 650 typedef struct {
 651         uint32_t x[9];
 652         uint32_t y[9];
 653         uint32_t z[9];
 654 } p256_jacobian;
 655
 656 /*
 657  * Convert a point to affine coordinates:
 658  *  - If the point is the point at infinity, then all three coordinates
 659  *    are set to 0.
 660  *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
 661  *    coordinates are the 'X' and 'Y' affine coordinates.
 662  * The coordinates are guaranteed to be lower than the modulus.
 663  */
 664 static void
 665 p256_to_affine(p256_jacobian *P)
 666 {
 667         uint32_t t1[9], t2[9];
 668         int i;
 669
 670         /*
 671          * Invert z with a modular exponentiation: the modulus is
 672          * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
 673          * p-2. Exponent bit pattern (from high to low) is:
 674          *  - 32 bits of value 1
 675          *  - 31 bits of value 0
 676          *  - 1 bit of value 1
 677          *  - 96 bits of value 0
 678          *  - 94 bits of value 1
 679          *  - 1 bit of value 0
 680          *  - 1 bit of value 1
 681          * Thus, we precompute z^(2^31-1) to speed things up.
 682          *
 683          * If z = 0 (point at infinity) then the modular exponentiation
 684          * will yield 0, which leads to the expected result (all three
 685          * coordinates set to 0).
 686          */
 687
 688         /*
 689          * A simple square-and-multiply for z^(2^31-1). We could save about
 690          * two dozen multiplications here with an addition chain, but
 691          * this would require a bit more code, and extra stack buffers.
 692          */
 693         memcpy(t1, P->z, sizeof P->z);
 694         for (i = 0; i < 30; i ++) {
 695                 square_f256(t1, t1);
 696                 mul_f256(t1, t1, P->z);
 697         }
 698
 699         /*
 700          * Square-and-multiply. Apart from the squarings, we have a few
 701          * multiplications to set bits to 1; we multiply by the original z
 702          * for setting 1 bit, and by t1 for setting 31 bits.
 703          */
 704         memcpy(t2, P->z, sizeof P->z);
 705         for (i = 1; i < 256; i ++) {
 706                 square_f256(t2, t2);
 707                 switch (i) {
 708                 case 31:
 709                 case 190:
 710                 case 221:
 711                 case 252:
 712                         mul_f256(t2, t2, t1);
 713                         break;
 714                 case 63:
 715                 case 253:
 716                 case 255:
 717                         mul_f256(t2, t2, P->z);
 718                         break;
 719                 }
 720         }
 721
 722         /*
 723          * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
 724          */
 725         mul_f256(t1, t2, t2);
 726         mul_f256(P->x, t1, P->x);
 727         mul_f256(t1, t1, t2);
 728         mul_f256(P->y, t1, P->y);
 729         reduce_final_f256(P->x);
 730         reduce_final_f256(P->y);
 731
 732         /*
 733          * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
 734          * this will set z to 1.
 735          */
 736         mul_f256(P->z, P->z, t2);
 737         reduce_final_f256(P->z);
 738 }
 739
 740 /*
 741  * Double a point in P-256. This function works for all valid points,
 742  * including the point at infinity.
 743  */
 744 static void
 745 p256_double(p256_jacobian *Q)
 746 {
 747         /*
 748          * Doubling formulas are:
 749          *
 750          *   s = 4*x*y^2
 751          *   m = 3*(x + z^2)*(x - z^2)
 752          *   x' = m^2 - 2*s
 753          *   y' = m*(s - x') - 8*y^4
 754          *   z' = 2*y*z
 755          *
 756          * These formulas work for all points, including points of order 2
 757          * and points at infinity:
 758          *   - If y = 0 then z' = 0. But there is no such point in P-256
 759          *     anyway.
 760          *   - If z = 0 then z' = 0.
 761          */
 762         uint32_t t1[9], t2[9], t3[9], t4[9];
 763
 764         /*
 765          * Compute z^2 in t1.
 766          */
 767         square_f256(t1, Q->z);
 768
 769         /*
 770          * Compute x-z^2 in t2 and x+z^2 in t1.
 771          */
 772         add_f256(t2, Q->x, t1);
 773         sub_f256(t1, Q->x, t1);
 774
 775         /*
 776          * Compute 3*(x+z^2)*(x-z^2) in t1.
 777          */
 778         mul_f256(t3, t1, t2);
 779         add_f256(t1, t3, t3);
 780         add_f256(t1, t3, t1);
 781
 782         /*
 783          * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
 784          */
 785         square_f256(t3, Q->y);
 786         add_f256(t3, t3, t3);
 787         mul_f256(t2, Q->x, t3);
 788         add_f256(t2, t2, t2);
 789
 790         /*
 791          * Compute x' = m^2 - 2*s.
 792          */
 793         square_f256(Q->x, t1);
 794         sub_f256(Q->x, Q->x, t2);
 795         sub_f256(Q->x, Q->x, t2);
 796
 797         /*
 798          * Compute z' = 2*y*z.
 799          */
 800         mul_f256(t4, Q->y, Q->z);
 801         add_f256(Q->z, t4, t4);
 802
 803         /*
 804          * Compute y' = m*(s - x') - 8*y^4. Note that we already have
 805          * 2*y^2 in t3.
 806          */
 807         sub_f256(t2, t2, Q->x);
 808         mul_f256(Q->y, t1, t2);
 809         square_f256(t4, t3);
 810         add_f256(t4, t4, t4);
 811         sub_f256(Q->y, Q->y, t4);
 812 }
 813
 814 /*
 815  * Add point P2 to point P1.
 816  *
 817  * This function computes the wrong result in the following cases:
 818  *
 819  *   - If P1 == 0 but P2 != 0
 820  *   - If P1 != 0 but P2 == 0
 821  *   - If P1 == P2
 822  *
 823  * In all three cases, P1 is set to the point at infinity.
 824  *
 825  * Returned value is 0 if one of the following occurs:
 826  *
 827  *   - P1 and P2 have the same Y coordinate
 828  *   - P1 == 0 and P2 == 0
 829  *   - The Y coordinate of one of the points is 0 and the other point is
 830  *     the point at infinity.
 831  *
 832  * The third case cannot actually happen with valid points, since a point
 833  * with Y == 0 is a point of order 2, and there is no point of order 2 on
 834  * curve P-256.
 835  *
 836  * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
 837  * can apply the following:
 838  *
 839  *   - If the result is not the point at infinity, then it is correct.
 840  *   - Otherwise, if the returned value is 1, then this is a case of
 841  *     P1+P2 == 0, so the result is indeed the point at infinity.
 842  *   - Otherwise, P1 == P2, so a "double" operation should have been
 843  *     performed.
 844  */
 845 static uint32_t
 846 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
 847 {
 848         /*
 849          * Addtions formulas are:
 850          *
 851          *   u1 = x1 * z2^2
 852          *   u2 = x2 * z1^2
 853          *   s1 = y1 * z2^3
 854          *   s2 = y2 * z1^3
 855          *   h = u2 - u1
 856          *   r = s2 - s1
 857          *   x3 = r^2 - h^3 - 2 * u1 * h^2
 858          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
 859          *   z3 = h * z1 * z2
 860          */
 861         uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
 862         uint32_t ret;
 863         int i;
 864
 865         /*
 866          * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
 867          */
 868         square_f256(t3, P2->z);
 869         mul_f256(t1, P1->x, t3);
 870         mul_f256(t4, P2->z, t3);
 871         mul_f256(t3, P1->y, t4);
 872
 873         /*
 874          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
 875          */
 876         square_f256(t4, P1->z);
 877         mul_f256(t2, P2->x, t4);
 878         mul_f256(t5, P1->z, t4);
 879         mul_f256(t4, P2->y, t5);
 880
 881         /*
 882          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
 883          * We need to test whether r is zero, so we will do some extra
 884          * reduce.
 885          */
 886         sub_f256(t2, t2, t1);
 887         sub_f256(t4, t4, t3);
 888         reduce_final_f256(t4);
 889         ret = 0;
 890         for (i = 0; i < 9; i ++) {
 891                 ret |= t4[i];
 892         }
 893         ret = (ret | -ret) >> 31;
 894
 895         /*
 896          * Compute u1*h^2 (in t6) and h^3 (in t5);
 897          */
 898         square_f256(t7, t2);
 899         mul_f256(t6, t1, t7);
 900         mul_f256(t5, t7, t2);
 901
 902         /*
 903          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
 904          */
 905         square_f256(P1->x, t4);
 906         sub_f256(P1->x, P1->x, t5);
 907         sub_f256(P1->x, P1->x, t6);
 908         sub_f256(P1->x, P1->x, t6);
 909
 910         /*
 911          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
 912          */
 913         sub_f256(t6, t6, P1->x);
 914         mul_f256(P1->y, t4, t6);
 915         mul_f256(t1, t5, t3);
 916         sub_f256(P1->y, P1->y, t1);
 917
 918         /*
 919          * Compute z3 = h*z1*z2.
 920          */
 921         mul_f256(t1, P1->z, P2->z);
 922         mul_f256(P1->z, t1, t2);
 923
 924         return ret;
 925 }
 926
 927 /*
 928  * Add point P2 to point P1. This is a specialised function for the
 929  * case when P2 is a non-zero point in affine coordinate.
 930  *
 931  * This function computes the wrong result in the following cases:
 932  *
 933  *   - If P1 == 0
 934  *   - If P1 == P2
 935  *
 936  * In both cases, P1 is set to the point at infinity.
 937  *
 938  * Returned value is 0 if one of the following occurs:
 939  *
 940  *   - P1 and P2 have the same Y coordinate
 941  *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
 942  *
 943  * The second case cannot actually happen with valid points, since a point
 944  * with Y == 0 is a point of order 2, and there is no point of order 2 on
 945  * curve P-256.
 946  *
 947  * Therefore, assuming that P1 != 0 on input, then the caller
 948  * can apply the following:
 949  *
 950  *   - If the result is not the point at infinity, then it is correct.
 951  *   - Otherwise, if the returned value is 1, then this is a case of
 952  *     P1+P2 == 0, so the result is indeed the point at infinity.
 953  *   - Otherwise, P1 == P2, so a "double" operation should have been
 954  *     performed.
 955  */
 956 static uint32_t
 957 p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
 958 {
 959         /*
 960          * Addtions formulas are:
 961          *
 962          *   u1 = x1
 963          *   u2 = x2 * z1^2
 964          *   s1 = y1
 965          *   s2 = y2 * z1^3
 966          *   h = u2 - u1
 967          *   r = s2 - s1
 968          *   x3 = r^2 - h^3 - 2 * u1 * h^2
 969          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
 970          *   z3 = h * z1
 971          */
 972         uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
 973         uint32_t ret;
 974         int i;
 975
 976         /*
 977          * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
 978          */
 979         memcpy(t1, P1->x, sizeof t1);
 980         memcpy(t3, P1->y, sizeof t3);
 981
 982         /*
 983          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
 984          */
 985         square_f256(t4, P1->z);
 986         mul_f256(t2, P2->x, t4);
 987         mul_f256(t5, P1->z, t4);
 988         mul_f256(t4, P2->y, t5);
 989
 990         /*
 991          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
 992          * We need to test whether r is zero, so we will do some extra
 993          * reduce.
 994          */
 995         sub_f256(t2, t2, t1);
 996         sub_f256(t4, t4, t3);
 997         reduce_final_f256(t4);
 998         ret = 0;
 999         for (i = 0; i < 9; i ++) {
1000                 ret |= t4[i];
1001         }
1002         ret = (ret | -ret) >> 31;
1003
1004         /*
1005          * Compute u1*h^2 (in t6) and h^3 (in t5);
1006          */
1007         square_f256(t7, t2);
1008         mul_f256(t6, t1, t7);
1009         mul_f256(t5, t7, t2);
1010
1011         /*
1012          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1013          */
1014         square_f256(P1->x, t4);
1015         sub_f256(P1->x, P1->x, t5);
1016         sub_f256(P1->x, P1->x, t6);
1017         sub_f256(P1->x, P1->x, t6);
1018
1019         /*
1020          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1021          */
1022         sub_f256(t6, t6, P1->x);
1023         mul_f256(P1->y, t4, t6);
1024         mul_f256(t1, t5, t3);
1025         sub_f256(P1->y, P1->y, t1);
1026
1027         /*
1028          * Compute z3 = h*z1*z2.
1029          */
1030         mul_f256(P1->z, P1->z, t2);
1031
1032         return ret;
1033 }
1034
1035 /*
1036  * Decode a P-256 point. This function does not support the point at
1037  * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1038  */
1039 static uint32_t
1040 p256_decode(p256_jacobian *P, const void *src, size_t len)
1041 {
1042         const unsigned char *buf;
1043         uint32_t tx[9], ty[9], t1[9], t2[9];
1044         uint32_t bad;
1045         int i;
1046
1047         if (len != 65) {
1048                 return 0;
1049         }
1050         buf = src;
1051
1052         /*
1053          * First byte must be 0x04 (uncompressed format). We could support
1054          * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1055          * least significant bit of the Y coordinate), but it is explicitly
1056          * forbidden by RFC 5480 (section 2.2).
1057          */
1058         bad = NEQ(buf[0], 0x04);
1059
1060         /*
1061          * Decode the coordinates, and check that they are both lower
1062          * than the modulus.
1063          */
1064         tx[8] = be8_to_le30(tx, buf + 1, 32);
1065         ty[8] = be8_to_le30(ty, buf + 33, 32);
1066         bad |= reduce_final_f256(tx);
1067         bad |= reduce_final_f256(ty);
1068
1069         /*
1070          * Check curve equation.
1071          */
1072         square_f256(t1, tx);
1073         mul_f256(t1, tx, t1);
1074         square_f256(t2, ty);
1075         sub_f256(t1, t1, tx);
1076         sub_f256(t1, t1, tx);
1077         sub_f256(t1, t1, tx);
1078         add_f256(t1, t1, P256_B);
1079         sub_f256(t1, t1, t2);
1080         reduce_final_f256(t1);
1081         for (i = 0; i < 9; i ++) {
1082                 bad |= t1[i];
1083         }
1084
1085         /*
1086          * Copy coordinates to the point structure.
1087          */
1088         memcpy(P->x, tx, sizeof tx);
1089         memcpy(P->y, ty, sizeof ty);
1090         memset(P->z, 0, sizeof P->z);
1091         P->z[0] = 1;
1092         return EQ(bad, 0);
1093 }
1094
1095 /*
1096  * Encode a point into a buffer. This function assumes that the point is
1097  * valid, in affine coordinates, and not the point at infinity.
1098  */
1099 static void
1100 p256_encode(void *dst, const p256_jacobian *P)
1101 {
1102         unsigned char *buf;
1103
1104         buf = dst;
1105         buf[0] = 0x04;
1106         le30_to_be8(buf + 1, 32, P->x);
1107         le30_to_be8(buf + 33, 32, P->y);
1108 }
1109
1110 /*
1111  * Multiply a curve point by an integer. The integer is assumed to be
1112  * lower than the curve order, and the base point must not be the point
1113  * at infinity.
1114  */
1115 static void
1116 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1117 {
1118         /*
1119          * qz is a flag that is initially 1, and remains equal to 1
1120          * as long as the point is the point at infinity.
1121          *
1122          * We use a 2-bit window to handle multiplier bits by pairs.
1123          * The precomputed window really is the points P2 and P3.
1124          */
1125         uint32_t qz;
1126         p256_jacobian P2, P3, Q, T, U;
1127
1128         /*
1129          * Compute window values.
1130          */
1131         P2 = *P;
1132         p256_double(&P2);
1133         P3 = *P;
1134         p256_add(&P3, &P2);
1135
1136         /*
1137          * We start with Q = 0. We process multiplier bits 2 by 2.
1138          */
1139         memset(&Q, 0, sizeof Q);
1140         qz = 1;
1141         while (xlen -- > 0) {
1142                 int k;
1143
1144                 for (k = 6; k >= 0; k -= 2) {
1145                         uint32_t bits;
1146                         uint32_t bnz;
1147
1148                         p256_double(&Q);
1149                         p256_double(&Q);
1150                         T = *P;
1151                         U = Q;
1152                         bits = (*x >> k) & (uint32_t)3;
1153                         bnz = NEQ(bits, 0);
1154                         CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1155                         CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1156                         p256_add(&U, &T);
1157                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
1158                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1159                         qz &= ~bnz;
1160                 }
1161                 x ++;
1162         }
1163         *P = Q;
1164 }
1165
1166 /*
1167  * Precomputed window: k*G points, where G is the curve generator, and k
1168  * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1169  * the point are encoded as 9 words of 30 bits each (little-endian
1170  * order).
1171  */
1172 static const uint32_t Gwin[15][18] = {
1173
1174         { 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
1175           0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
1176           0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
1177           0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
1178           0x10B8BF86, 0x00004FE3 },
1179
1180         { 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
1181           0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
1182           0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
1183           0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
1184           0x154436E3, 0x00000777 },
1185
1186         { 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
1187           0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
1188           0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
1189           0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
1190           0x19031266, 0x00008734 },
1191
1192         { 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
1193           0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
1194           0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
1195           0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
1196           0x15D69318, 0x0000E0F1 },
1197
1198         { 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
1199           0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
1200           0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
1201           0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
1202           0x1F6A2412, 0x0000E0C1 },
1203
1204         { 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
1205           0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
1206           0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
1207           0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
1208           0x041D0C8D, 0x0000E85C },
1209
1210         { 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
1211           0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
1212           0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
1213           0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
1214           0x076F780C, 0x000073EB },
1215
1216         { 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
1217           0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
1218           0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
1219           0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
1220           0x332F647A, 0x0000AD5A },
1221
1222         { 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
1223           0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
1224           0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
1225           0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
1226           0x11325CB2, 0x00002A27 },
1227
1228         { 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
1229           0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
1230           0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
1231           0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
1232           0x18A88A6A, 0x00008786 },
1233
1234         { 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
1235           0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
1236           0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
1237           0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
1238           0x0826B331, 0x00009099 },
1239
1240         { 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
1241           0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
1242           0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
1243           0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
1244           0x2D1AA70E, 0x00000770 },
1245
1246         { 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
1247           0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
1248           0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
1249           0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
1250           0x163353AF, 0x000063BB },
1251
1252         { 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
1253           0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
1254           0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
1255           0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
1256           0x3C6ECA7D, 0x0000F599 },
1257
1258         { 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
1259           0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
1260           0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
1261           0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
1262           0x0FB8D64B, 0x0000B5B9 }
1263 };
1264
1265 /*
1266  * Lookup one of the Gwin[] values, by index. This is constant-time.
1267  */
1268 static void
1269 lookup_Gwin(p256_jacobian *T, uint32_t idx)
1270 {
1271         uint32_t xy[18];
1272         uint32_t k;
1273         size_t u;
1274
1275         memset(xy, 0, sizeof xy);
1276         for (k = 0; k < 15; k ++) {
1277                 uint32_t m;
1278
1279                 m = -EQ(idx, k + 1);
1280                 for (u = 0; u < 18; u ++) {
1281                         xy[u] |= m & Gwin[k][u];
1282                 }
1283         }
1284         memcpy(T->x, &xy[0], sizeof T->x);
1285         memcpy(T->y, &xy[9], sizeof T->y);
1286         memset(T->z, 0, sizeof T->z);
1287         T->z[0] = 1;
1288 }
1289
1290 /*
1291  * Multiply the generator by an integer. The integer is assumed non-zero
1292  * and lower than the curve order.
1293  */
1294 static void
1295 p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1296 {
1297         /*
1298          * qz is a flag that is initially 1, and remains equal to 1
1299          * as long as the point is the point at infinity.
1300          *
1301          * We use a 4-bit window to handle multiplier bits by groups
1302          * of 4. The precomputed window is constant static data, with
1303          * points in affine coordinates; we use a constant-time lookup.
1304          */
1305         p256_jacobian Q;
1306         uint32_t qz;
1307
1308         memset(&Q, 0, sizeof Q);
1309         qz = 1;
1310         while (xlen -- > 0) {
1311                 int k;
1312                 unsigned bx;
1313
1314                 bx = *x ++;
1315                 for (k = 0; k < 2; k ++) {
1316                         uint32_t bits;
1317                         uint32_t bnz;
1318                         p256_jacobian T, U;
1319
1320                         p256_double(&Q);
1321                         p256_double(&Q);
1322                         p256_double(&Q);
1323                         p256_double(&Q);
1324                         bits = (bx >> 4) & 0x0F;
1325                         bnz = NEQ(bits, 0);
1326                         lookup_Gwin(&T, bits);
1327                         U = Q;
1328                         p256_add_mixed(&U, &T);
1329                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
1330                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1331                         qz &= ~bnz;
1332                         bx <<= 4;
1333                 }
1334         }
1335         *P = Q;
1336 }
1337
1338 static const unsigned char P256_G[] = {
1339         0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1340         0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1341         0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1342         0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1343         0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1344         0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1345         0x68, 0x37, 0xBF, 0x51, 0xF5
1346 };
1347
1348 static const unsigned char P256_N[] = {
1349         0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1350         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1351         0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1352         0x25, 0x51
1353 };
1354
1355 static const unsigned char *
1356 api_generator(int curve, size_t *len)
1357 {
1358         (void)curve;
1359         *len = sizeof P256_G;
1360         return P256_G;
1361 }
1362
1363 static const unsigned char *
1364 api_order(int curve, size_t *len)
1365 {
1366         (void)curve;
1367         *len = sizeof P256_N;
1368         return P256_N;
1369 }
1370
1371 static size_t
1372 api_xoff(int curve, size_t *len)
1373 {
1374         (void)curve;
1375         *len = 32;
1376         return 1;
1377 }
1378
1379 static uint32_t
1380 api_mul(unsigned char *G, size_t Glen,
1381         const unsigned char *x, size_t xlen, int curve)
1382 {
1383         uint32_t r;
1384         p256_jacobian P;
1385
1386         (void)curve;
1387         r = p256_decode(&P, G, Glen);
1388         p256_mul(&P, x, xlen);
1389         if (Glen >= 65) {
1390                 p256_to_affine(&P);
1391                 p256_encode(G, &P);
1392         }
1393         return r;
1394 }
1395
1396 static size_t
1397 api_mulgen(unsigned char *R,
1398         const unsigned char *x, size_t xlen, int curve)
1399 {
1400         p256_jacobian P;
1401
1402         (void)curve;
1403         p256_mulgen(&P, x, xlen);
1404         p256_to_affine(&P);
1405         p256_encode(R, &P);
1406         return 65;
1407
1408         /*
1409         const unsigned char *G;
1410         size_t Glen;
1411
1412         G = api_generator(curve, &Glen);
1413         memcpy(R, G, Glen);
1414         api_mul(R, Glen, x, xlen, curve);
1415         return Glen;
1416         */
1417 }
1418
1419 static uint32_t
1420 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1421         const unsigned char *x, size_t xlen,
1422         const unsigned char *y, size_t ylen, int curve)
1423 {
1424         p256_jacobian P, Q;
1425         uint32_t r, t, z;
1426         int i;
1427
1428         (void)curve;
1429         r = p256_decode(&P, A, len);
1430         p256_mul(&P, x, xlen);
1431         if (B == NULL) {
1432                 p256_mulgen(&Q, y, ylen);
1433         } else {
1434                 r &= p256_decode(&Q, B, len);
1435                 p256_mul(&Q, y, ylen);
1436         }
1437
1438         /*
1439          * The final addition may fail in case both points are equal.
1440          */
1441         t = p256_add(&P, &Q);
1442         reduce_final_f256(P.z);
1443         z = 0;
1444         for (i = 0; i < 9; i ++) {
1445                 z |= P.z[i];
1446         }
1447         z = EQ(z, 0);
1448         p256_double(&Q);
1449
1450         /*
1451          * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1452          * have the following:
1453          *
1454          *   z = 0, t = 0   return P (normal addition)
1455          *   z = 0, t = 1   return P (normal addition)
1456          *   z = 1, t = 0   return Q (a 'double' case)
1457          *   z = 1, t = 1   report an error (P+Q = 0)
1458          */
1459         CCOPY(z & ~t, &P, &Q, sizeof Q);
1460         p256_to_affine(&P);
1461         p256_encode(A, &P);
1462         r &= ~(z & t);
1463         return r;
1464 }
1465
1466 /* see bearssl_ec.h */
1467 const br_ec_impl br_ec_p256_m31 = {
1468         (uint32_t)0x00800000,
1469         &api_generator,
1470         &api_order,
1471         &api_xoff,
1472         &api_mul,
1473         &api_mulgen,
1474         &api_muladd
1475 };