_ Git - BearSSL/blob - src/ec/ec_p256_m15.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
  29  * that right-shifting a signed negative integer copies the sign bit
  30  * (arithmetic right-shift). This is "implementation-defined behaviour",
  31  * i.e. it is not undefined, but it may differ between compilers. Each
  32  * compiler is supposed to document its behaviour in that respect. GCC
  33  * explicitly defines that an arithmetic right shift is used. We expect
  34  * all other compilers to do the same, because underlying CPU offer an
  35  * arithmetic right shift opcode that could not be used otherwise.
  36  */
  37 #if BR_NO_ARITH_SHIFT
  38 #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
  39                     | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
  40 #else
  41 #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
  42 #endif
  43
  44 /*
  45  * Convert an integer from unsigned big-endian encoding to a sequence of
  46  * 13-bit words in little-endian order. The final "partial" word is
  47  * returned.
  48  */
  49 static uint32_t
  50 be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
  51 {
  52         uint32_t acc;
  53         int acc_len;
  54
  55         acc = 0;
  56         acc_len = 0;
  57         while (len -- > 0) {
  58                 acc |= (uint32_t)src[len] << acc_len;
  59                 acc_len += 8;
  60                 if (acc_len >= 13) {
  61                         *dst ++ = acc & 0x1FFF;
  62                         acc >>= 13;
  63                         acc_len -= 13;
  64                 }
  65         }
  66         return acc;
  67 }
  68
  69 /*
  70  * Convert an integer (13-bit words, little-endian) to unsigned
  71  * big-endian encoding. The total encoding length is provided; all
  72  * the destination bytes will be filled.
  73  */
  74 static void
  75 le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
  76 {
  77         uint32_t acc;
  78         int acc_len;
  79
  80         acc = 0;
  81         acc_len = 0;
  82         while (len -- > 0) {
  83                 if (acc_len < 8) {
  84                         acc |= (*src ++) << acc_len;
  85                         acc_len += 13;
  86                 }
  87                 dst[len] = (unsigned char)acc;
  88                 acc >>= 8;
  89                 acc_len -= 8;
  90         }
  91 }
  92
  93 /*
  94  * Normalise an array of words to a strict 13 bits per word. Returned
  95  * value is the resulting carry. The source (w) and destination (d)
  96  * arrays may be identical, but shall not overlap partially.
  97  */
  98 static inline uint32_t
  99 norm13(uint32_t *d, const uint32_t *w, size_t len)
 100 {
 101         size_t u;
 102         uint32_t cc;
 103
 104         cc = 0;
 105         for (u = 0; u < len; u ++) {
 106                 int32_t z;
 107
 108                 z = w[u] + cc;
 109                 d[u] = z & 0x1FFF;
 110                 cc = ARSH(z, 13);
 111         }
 112         return cc;
 113 }
 114
 115 /*
 116  * mul20() multiplies two 260-bit integers together. Each word must fit
 117  * on 13 bits; source operands use 20 words, destination operand
 118  * receives 40 words. All overlaps allowed.
 119  *
 120  * square20() computes the square of a 260-bit integer. Each word must
 121  * fit on 13 bits; source operand uses 20 words, destination operand
 122  * receives 40 words. All overlaps allowed.
 123  */
 124
 125 #if BR_SLOW_MUL15
 126
 127 static void
 128 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
 129 {
 130         /*
 131          * Two-level Karatsuba: turns a 20x20 multiplication into
 132          * nine 5x5 multiplications. We use 13-bit words but do not
 133          * propagate carries immediately, so words may expand:
 134          *
 135          *  - First Karatsuba decomposition turns the 20x20 mul on
 136          *    13-bit words into three 10x10 muls, two on 13-bit words
 137          *    and one on 14-bit words.
 138          *
 139          *  - Second Karatsuba decomposition further splits these into:
 140          *
 141          *     * four 5x5 muls on 13-bit words
 142          *     * four 5x5 muls on 14-bit words
 143          *     * one 5x5 mul on 15-bit words
 144          *
 145          * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
 146          * or 15-bit words, respectively.
 147          */
 148         uint32_t u[45], v[45], w[90];
 149         uint32_t cc;
 150         int i;
 151
 152 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
 153                 (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
 154                         + (s2w)[5 * (s2_off) + 0]; \
 155                 (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
 156                         + (s2w)[5 * (s2_off) + 1]; \
 157                 (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
 158                         + (s2w)[5 * (s2_off) + 2]; \
 159                 (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
 160                         + (s2w)[5 * (s2_off) + 3]; \
 161                 (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
 162                         + (s2w)[5 * (s2_off) + 4]; \
 163         } while (0)
 164
 165 #define ZADDT(dw, d_off, sw, s_off)   do { \
 166                 (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
 167                 (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
 168                 (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
 169                 (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
 170                 (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
 171         } while (0)
 172
 173 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
 174                 (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
 175                         + (s2w)[5 * (s2_off) + 0]; \
 176                 (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
 177                         + (s2w)[5 * (s2_off) + 1]; \
 178                 (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
 179                         + (s2w)[5 * (s2_off) + 2]; \
 180                 (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
 181                         + (s2w)[5 * (s2_off) + 3]; \
 182                 (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
 183                         + (s2w)[5 * (s2_off) + 4]; \
 184         } while (0)
 185
 186 #define CPR1(w, cprcc)   do { \
 187                 uint32_t cprz = (w) + cprcc; \
 188                 (w) = cprz & 0x1FFF; \
 189                 cprcc = cprz >> 13; \
 190         } while (0)
 191
 192 #define CPR(dw, d_off)   do { \
 193                 uint32_t cprcc; \
 194                 cprcc = 0; \
 195                 CPR1((dw)[(d_off) + 0], cprcc); \
 196                 CPR1((dw)[(d_off) + 1], cprcc); \
 197                 CPR1((dw)[(d_off) + 2], cprcc); \
 198                 CPR1((dw)[(d_off) + 3], cprcc); \
 199                 CPR1((dw)[(d_off) + 4], cprcc); \
 200                 CPR1((dw)[(d_off) + 5], cprcc); \
 201                 CPR1((dw)[(d_off) + 6], cprcc); \
 202                 CPR1((dw)[(d_off) + 7], cprcc); \
 203                 CPR1((dw)[(d_off) + 8], cprcc); \
 204                 (dw)[(d_off) + 9] = cprcc; \
 205         } while (0)
 206
 207         memcpy(u, a, 20 * sizeof *a);
 208         ZADD(u, 4, a, 0, a, 1);
 209         ZADD(u, 5, a, 2, a, 3);
 210         ZADD(u, 6, a, 0, a, 2);
 211         ZADD(u, 7, a, 1, a, 3);
 212         ZADD(u, 8, u, 6, u, 7);
 213
 214         memcpy(v, b, 20 * sizeof *b);
 215         ZADD(v, 4, b, 0, b, 1);
 216         ZADD(v, 5, b, 2, b, 3);
 217         ZADD(v, 6, b, 0, b, 2);
 218         ZADD(v, 7, b, 1, b, 3);
 219         ZADD(v, 8, v, 6, v, 7);
 220
 221         /*
 222          * Do the eight first 8x8 muls. Source words are at most 16382
 223          * each, so we can add product results together "as is" in 32-bit
 224          * words.
 225          */
 226         for (i = 0; i < 40; i += 5) {
 227                 w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
 228                 w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
 229                         + MUL15(u[i + 1], v[i + 0]);
 230                 w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
 231                         + MUL15(u[i + 1], v[i + 1])
 232                         + MUL15(u[i + 2], v[i + 0]);
 233                 w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
 234                         + MUL15(u[i + 1], v[i + 2])
 235                         + MUL15(u[i + 2], v[i + 1])
 236                         + MUL15(u[i + 3], v[i + 0]);
 237                 w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
 238                         + MUL15(u[i + 1], v[i + 3])
 239                         + MUL15(u[i + 2], v[i + 2])
 240                         + MUL15(u[i + 3], v[i + 1])
 241                         + MUL15(u[i + 4], v[i + 0]);
 242                 w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
 243                         + MUL15(u[i + 2], v[i + 3])
 244                         + MUL15(u[i + 3], v[i + 2])
 245                         + MUL15(u[i + 4], v[i + 1]);
 246                 w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
 247                         + MUL15(u[i + 3], v[i + 3])
 248                         + MUL15(u[i + 4], v[i + 2]);
 249                 w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
 250                         + MUL15(u[i + 4], v[i + 3]);
 251                 w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
 252                 w[(i << 1) + 9] = 0;
 253         }
 254
 255         /*
 256          * For the 9th multiplication, source words are up to 32764,
 257          * so we must do some carry propagation. If we add up to
 258          * 4 products and the carry is no more than 524224, then the
 259          * result fits in 32 bits, and the next carry will be no more
 260          * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
 261          *
 262          * We thus just skip one of the products in the middle word,
 263          * then do a carry propagation (this reduces words to 13 bits
 264          * each, except possibly the last, which may use up to 17 bits
 265          * or so), then add the missing product.
 266          */
 267         w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
 268         w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
 269                 + MUL15(u[40 + 1], v[40 + 0]);
 270         w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
 271                 + MUL15(u[40 + 1], v[40 + 1])
 272                 + MUL15(u[40 + 2], v[40 + 0]);
 273         w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
 274                 + MUL15(u[40 + 1], v[40 + 2])
 275                 + MUL15(u[40 + 2], v[40 + 1])
 276                 + MUL15(u[40 + 3], v[40 + 0]);
 277         w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
 278                 + MUL15(u[40 + 1], v[40 + 3])
 279                 + MUL15(u[40 + 2], v[40 + 2])
 280                 + MUL15(u[40 + 3], v[40 + 1]);
 281                 /* + MUL15(u[40 + 4], v[40 + 0]) */
 282         w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
 283                 + MUL15(u[40 + 2], v[40 + 3])
 284                 + MUL15(u[40 + 3], v[40 + 2])
 285                 + MUL15(u[40 + 4], v[40 + 1]);
 286         w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
 287                 + MUL15(u[40 + 3], v[40 + 3])
 288                 + MUL15(u[40 + 4], v[40 + 2]);
 289         w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
 290                 + MUL15(u[40 + 4], v[40 + 3]);
 291         w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
 292
 293         CPR(w, 80);
 294
 295         w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
 296
 297         /*
 298          * The products on 14-bit words in slots 6 and 7 yield values
 299          * up to 5*(16382^2) each, and we need to subtract two such
 300          * values from the higher word. We need the subtraction to fit
 301          * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
 302          * However, 10*(16382^2) does not fit. So we must perform a
 303          * bit of reduction here.
 304          */
 305         CPR(w, 60);
 306         CPR(w, 70);
 307
 308         /*
 309          * Recompose results.
 310          */
 311
 312         /* 0..1*0..1 into 0..3 */
 313         ZSUB2F(w, 8, w, 0, w, 2);
 314         ZSUB2F(w, 9, w, 1, w, 3);
 315         ZADDT(w, 1, w, 8);
 316         ZADDT(w, 2, w, 9);
 317
 318         /* 2..3*2..3 into 4..7 */
 319         ZSUB2F(w, 10, w, 4, w, 6);
 320         ZSUB2F(w, 11, w, 5, w, 7);
 321         ZADDT(w, 5, w, 10);
 322         ZADDT(w, 6, w, 11);
 323
 324         /* (0..1+2..3)*(0..1+2..3) into 12..15 */
 325         ZSUB2F(w, 16, w, 12, w, 14);
 326         ZSUB2F(w, 17, w, 13, w, 15);
 327         ZADDT(w, 13, w, 16);
 328         ZADDT(w, 14, w, 17);
 329
 330         /* first-level recomposition */
 331         ZSUB2F(w, 12, w, 0, w, 4);
 332         ZSUB2F(w, 13, w, 1, w, 5);
 333         ZSUB2F(w, 14, w, 2, w, 6);
 334         ZSUB2F(w, 15, w, 3, w, 7);
 335         ZADDT(w, 2, w, 12);
 336         ZADDT(w, 3, w, 13);
 337         ZADDT(w, 4, w, 14);
 338         ZADDT(w, 5, w, 15);
 339
 340         /*
 341          * Perform carry propagation to bring all words down to 13 bits.
 342          */
 343         cc = norm13(d, w, 40);
 344         d[39] += (cc << 13);
 345
 346 #undef ZADD
 347 #undef ZADDT
 348 #undef ZSUB2F
 349 #undef CPR1
 350 #undef CPR
 351 }
 352
 353 static inline void
 354 square20(uint32_t *d, const uint32_t *a)
 355 {
 356         mul20(d, a, a);
 357 }
 358
 359 #else
 360
 361 static void
 362 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
 363 {
 364         uint32_t t[39];
 365
 366         t[ 0] = MUL15(a[ 0], b[ 0]);
 367         t[ 1] = MUL15(a[ 0], b[ 1])
 368                 + MUL15(a[ 1], b[ 0]);
 369         t[ 2] = MUL15(a[ 0], b[ 2])
 370                 + MUL15(a[ 1], b[ 1])
 371                 + MUL15(a[ 2], b[ 0]);
 372         t[ 3] = MUL15(a[ 0], b[ 3])
 373                 + MUL15(a[ 1], b[ 2])
 374                 + MUL15(a[ 2], b[ 1])
 375                 + MUL15(a[ 3], b[ 0]);
 376         t[ 4] = MUL15(a[ 0], b[ 4])
 377                 + MUL15(a[ 1], b[ 3])
 378                 + MUL15(a[ 2], b[ 2])
 379                 + MUL15(a[ 3], b[ 1])
 380                 + MUL15(a[ 4], b[ 0]);
 381         t[ 5] = MUL15(a[ 0], b[ 5])
 382                 + MUL15(a[ 1], b[ 4])
 383                 + MUL15(a[ 2], b[ 3])
 384                 + MUL15(a[ 3], b[ 2])
 385                 + MUL15(a[ 4], b[ 1])
 386                 + MUL15(a[ 5], b[ 0]);
 387         t[ 6] = MUL15(a[ 0], b[ 6])
 388                 + MUL15(a[ 1], b[ 5])
 389                 + MUL15(a[ 2], b[ 4])
 390                 + MUL15(a[ 3], b[ 3])
 391                 + MUL15(a[ 4], b[ 2])
 392                 + MUL15(a[ 5], b[ 1])
 393                 + MUL15(a[ 6], b[ 0]);
 394         t[ 7] = MUL15(a[ 0], b[ 7])
 395                 + MUL15(a[ 1], b[ 6])
 396                 + MUL15(a[ 2], b[ 5])
 397                 + MUL15(a[ 3], b[ 4])
 398                 + MUL15(a[ 4], b[ 3])
 399                 + MUL15(a[ 5], b[ 2])
 400                 + MUL15(a[ 6], b[ 1])
 401                 + MUL15(a[ 7], b[ 0]);
 402         t[ 8] = MUL15(a[ 0], b[ 8])
 403                 + MUL15(a[ 1], b[ 7])
 404                 + MUL15(a[ 2], b[ 6])
 405                 + MUL15(a[ 3], b[ 5])
 406                 + MUL15(a[ 4], b[ 4])
 407                 + MUL15(a[ 5], b[ 3])
 408                 + MUL15(a[ 6], b[ 2])
 409                 + MUL15(a[ 7], b[ 1])
 410                 + MUL15(a[ 8], b[ 0]);
 411         t[ 9] = MUL15(a[ 0], b[ 9])
 412                 + MUL15(a[ 1], b[ 8])
 413                 + MUL15(a[ 2], b[ 7])
 414                 + MUL15(a[ 3], b[ 6])
 415                 + MUL15(a[ 4], b[ 5])
 416                 + MUL15(a[ 5], b[ 4])
 417                 + MUL15(a[ 6], b[ 3])
 418                 + MUL15(a[ 7], b[ 2])
 419                 + MUL15(a[ 8], b[ 1])
 420                 + MUL15(a[ 9], b[ 0]);
 421         t[10] = MUL15(a[ 0], b[10])
 422                 + MUL15(a[ 1], b[ 9])
 423                 + MUL15(a[ 2], b[ 8])
 424                 + MUL15(a[ 3], b[ 7])
 425                 + MUL15(a[ 4], b[ 6])
 426                 + MUL15(a[ 5], b[ 5])
 427                 + MUL15(a[ 6], b[ 4])
 428                 + MUL15(a[ 7], b[ 3])
 429                 + MUL15(a[ 8], b[ 2])
 430                 + MUL15(a[ 9], b[ 1])
 431                 + MUL15(a[10], b[ 0]);
 432         t[11] = MUL15(a[ 0], b[11])
 433                 + MUL15(a[ 1], b[10])
 434                 + MUL15(a[ 2], b[ 9])
 435                 + MUL15(a[ 3], b[ 8])
 436                 + MUL15(a[ 4], b[ 7])
 437                 + MUL15(a[ 5], b[ 6])
 438                 + MUL15(a[ 6], b[ 5])
 439                 + MUL15(a[ 7], b[ 4])
 440                 + MUL15(a[ 8], b[ 3])
 441                 + MUL15(a[ 9], b[ 2])
 442                 + MUL15(a[10], b[ 1])
 443                 + MUL15(a[11], b[ 0]);
 444         t[12] = MUL15(a[ 0], b[12])
 445                 + MUL15(a[ 1], b[11])
 446                 + MUL15(a[ 2], b[10])
 447                 + MUL15(a[ 3], b[ 9])
 448                 + MUL15(a[ 4], b[ 8])
 449                 + MUL15(a[ 5], b[ 7])
 450                 + MUL15(a[ 6], b[ 6])
 451                 + MUL15(a[ 7], b[ 5])
 452                 + MUL15(a[ 8], b[ 4])
 453                 + MUL15(a[ 9], b[ 3])
 454                 + MUL15(a[10], b[ 2])
 455                 + MUL15(a[11], b[ 1])
 456                 + MUL15(a[12], b[ 0]);
 457         t[13] = MUL15(a[ 0], b[13])
 458                 + MUL15(a[ 1], b[12])
 459                 + MUL15(a[ 2], b[11])
 460                 + MUL15(a[ 3], b[10])
 461                 + MUL15(a[ 4], b[ 9])
 462                 + MUL15(a[ 5], b[ 8])
 463                 + MUL15(a[ 6], b[ 7])
 464                 + MUL15(a[ 7], b[ 6])
 465                 + MUL15(a[ 8], b[ 5])
 466                 + MUL15(a[ 9], b[ 4])
 467                 + MUL15(a[10], b[ 3])
 468                 + MUL15(a[11], b[ 2])
 469                 + MUL15(a[12], b[ 1])
 470                 + MUL15(a[13], b[ 0]);
 471         t[14] = MUL15(a[ 0], b[14])
 472                 + MUL15(a[ 1], b[13])
 473                 + MUL15(a[ 2], b[12])
 474                 + MUL15(a[ 3], b[11])
 475                 + MUL15(a[ 4], b[10])
 476                 + MUL15(a[ 5], b[ 9])
 477                 + MUL15(a[ 6], b[ 8])
 478                 + MUL15(a[ 7], b[ 7])
 479                 + MUL15(a[ 8], b[ 6])
 480                 + MUL15(a[ 9], b[ 5])
 481                 + MUL15(a[10], b[ 4])
 482                 + MUL15(a[11], b[ 3])
 483                 + MUL15(a[12], b[ 2])
 484                 + MUL15(a[13], b[ 1])
 485                 + MUL15(a[14], b[ 0]);
 486         t[15] = MUL15(a[ 0], b[15])
 487                 + MUL15(a[ 1], b[14])
 488                 + MUL15(a[ 2], b[13])
 489                 + MUL15(a[ 3], b[12])
 490                 + MUL15(a[ 4], b[11])
 491                 + MUL15(a[ 5], b[10])
 492                 + MUL15(a[ 6], b[ 9])
 493                 + MUL15(a[ 7], b[ 8])
 494                 + MUL15(a[ 8], b[ 7])
 495                 + MUL15(a[ 9], b[ 6])
 496                 + MUL15(a[10], b[ 5])
 497                 + MUL15(a[11], b[ 4])
 498                 + MUL15(a[12], b[ 3])
 499                 + MUL15(a[13], b[ 2])
 500                 + MUL15(a[14], b[ 1])
 501                 + MUL15(a[15], b[ 0]);
 502         t[16] = MUL15(a[ 0], b[16])
 503                 + MUL15(a[ 1], b[15])
 504                 + MUL15(a[ 2], b[14])
 505                 + MUL15(a[ 3], b[13])
 506                 + MUL15(a[ 4], b[12])
 507                 + MUL15(a[ 5], b[11])
 508                 + MUL15(a[ 6], b[10])
 509                 + MUL15(a[ 7], b[ 9])
 510                 + MUL15(a[ 8], b[ 8])
 511                 + MUL15(a[ 9], b[ 7])
 512                 + MUL15(a[10], b[ 6])
 513                 + MUL15(a[11], b[ 5])
 514                 + MUL15(a[12], b[ 4])
 515                 + MUL15(a[13], b[ 3])
 516                 + MUL15(a[14], b[ 2])
 517                 + MUL15(a[15], b[ 1])
 518                 + MUL15(a[16], b[ 0]);
 519         t[17] = MUL15(a[ 0], b[17])
 520                 + MUL15(a[ 1], b[16])
 521                 + MUL15(a[ 2], b[15])
 522                 + MUL15(a[ 3], b[14])
 523                 + MUL15(a[ 4], b[13])
 524                 + MUL15(a[ 5], b[12])
 525                 + MUL15(a[ 6], b[11])
 526                 + MUL15(a[ 7], b[10])
 527                 + MUL15(a[ 8], b[ 9])
 528                 + MUL15(a[ 9], b[ 8])
 529                 + MUL15(a[10], b[ 7])
 530                 + MUL15(a[11], b[ 6])
 531                 + MUL15(a[12], b[ 5])
 532                 + MUL15(a[13], b[ 4])
 533                 + MUL15(a[14], b[ 3])
 534                 + MUL15(a[15], b[ 2])
 535                 + MUL15(a[16], b[ 1])
 536                 + MUL15(a[17], b[ 0]);
 537         t[18] = MUL15(a[ 0], b[18])
 538                 + MUL15(a[ 1], b[17])
 539                 + MUL15(a[ 2], b[16])
 540                 + MUL15(a[ 3], b[15])
 541                 + MUL15(a[ 4], b[14])
 542                 + MUL15(a[ 5], b[13])
 543                 + MUL15(a[ 6], b[12])
 544                 + MUL15(a[ 7], b[11])
 545                 + MUL15(a[ 8], b[10])
 546                 + MUL15(a[ 9], b[ 9])
 547                 + MUL15(a[10], b[ 8])
 548                 + MUL15(a[11], b[ 7])
 549                 + MUL15(a[12], b[ 6])
 550                 + MUL15(a[13], b[ 5])
 551                 + MUL15(a[14], b[ 4])
 552                 + MUL15(a[15], b[ 3])
 553                 + MUL15(a[16], b[ 2])
 554                 + MUL15(a[17], b[ 1])
 555                 + MUL15(a[18], b[ 0]);
 556         t[19] = MUL15(a[ 0], b[19])
 557                 + MUL15(a[ 1], b[18])
 558                 + MUL15(a[ 2], b[17])
 559                 + MUL15(a[ 3], b[16])
 560                 + MUL15(a[ 4], b[15])
 561                 + MUL15(a[ 5], b[14])
 562                 + MUL15(a[ 6], b[13])
 563                 + MUL15(a[ 7], b[12])
 564                 + MUL15(a[ 8], b[11])
 565                 + MUL15(a[ 9], b[10])
 566                 + MUL15(a[10], b[ 9])
 567                 + MUL15(a[11], b[ 8])
 568                 + MUL15(a[12], b[ 7])
 569                 + MUL15(a[13], b[ 6])
 570                 + MUL15(a[14], b[ 5])
 571                 + MUL15(a[15], b[ 4])
 572                 + MUL15(a[16], b[ 3])
 573                 + MUL15(a[17], b[ 2])
 574                 + MUL15(a[18], b[ 1])
 575                 + MUL15(a[19], b[ 0]);
 576         t[20] = MUL15(a[ 1], b[19])
 577                 + MUL15(a[ 2], b[18])
 578                 + MUL15(a[ 3], b[17])
 579                 + MUL15(a[ 4], b[16])
 580                 + MUL15(a[ 5], b[15])
 581                 + MUL15(a[ 6], b[14])
 582                 + MUL15(a[ 7], b[13])
 583                 + MUL15(a[ 8], b[12])
 584                 + MUL15(a[ 9], b[11])
 585                 + MUL15(a[10], b[10])
 586                 + MUL15(a[11], b[ 9])
 587                 + MUL15(a[12], b[ 8])
 588                 + MUL15(a[13], b[ 7])
 589                 + MUL15(a[14], b[ 6])
 590                 + MUL15(a[15], b[ 5])
 591                 + MUL15(a[16], b[ 4])
 592                 + MUL15(a[17], b[ 3])
 593                 + MUL15(a[18], b[ 2])
 594                 + MUL15(a[19], b[ 1]);
 595         t[21] = MUL15(a[ 2], b[19])
 596                 + MUL15(a[ 3], b[18])
 597                 + MUL15(a[ 4], b[17])
 598                 + MUL15(a[ 5], b[16])
 599                 + MUL15(a[ 6], b[15])
 600                 + MUL15(a[ 7], b[14])
 601                 + MUL15(a[ 8], b[13])
 602                 + MUL15(a[ 9], b[12])
 603                 + MUL15(a[10], b[11])
 604                 + MUL15(a[11], b[10])
 605                 + MUL15(a[12], b[ 9])
 606                 + MUL15(a[13], b[ 8])
 607                 + MUL15(a[14], b[ 7])
 608                 + MUL15(a[15], b[ 6])
 609                 + MUL15(a[16], b[ 5])
 610                 + MUL15(a[17], b[ 4])
 611                 + MUL15(a[18], b[ 3])
 612                 + MUL15(a[19], b[ 2]);
 613         t[22] = MUL15(a[ 3], b[19])
 614                 + MUL15(a[ 4], b[18])
 615                 + MUL15(a[ 5], b[17])
 616                 + MUL15(a[ 6], b[16])
 617                 + MUL15(a[ 7], b[15])
 618                 + MUL15(a[ 8], b[14])
 619                 + MUL15(a[ 9], b[13])
 620                 + MUL15(a[10], b[12])
 621                 + MUL15(a[11], b[11])
 622                 + MUL15(a[12], b[10])
 623                 + MUL15(a[13], b[ 9])
 624                 + MUL15(a[14], b[ 8])
 625                 + MUL15(a[15], b[ 7])
 626                 + MUL15(a[16], b[ 6])
 627                 + MUL15(a[17], b[ 5])
 628                 + MUL15(a[18], b[ 4])
 629                 + MUL15(a[19], b[ 3]);
 630         t[23] = MUL15(a[ 4], b[19])
 631                 + MUL15(a[ 5], b[18])
 632                 + MUL15(a[ 6], b[17])
 633                 + MUL15(a[ 7], b[16])
 634                 + MUL15(a[ 8], b[15])
 635                 + MUL15(a[ 9], b[14])
 636                 + MUL15(a[10], b[13])
 637                 + MUL15(a[11], b[12])
 638                 + MUL15(a[12], b[11])
 639                 + MUL15(a[13], b[10])
 640                 + MUL15(a[14], b[ 9])
 641                 + MUL15(a[15], b[ 8])
 642                 + MUL15(a[16], b[ 7])
 643                 + MUL15(a[17], b[ 6])
 644                 + MUL15(a[18], b[ 5])
 645                 + MUL15(a[19], b[ 4]);
 646         t[24] = MUL15(a[ 5], b[19])
 647                 + MUL15(a[ 6], b[18])
 648                 + MUL15(a[ 7], b[17])
 649                 + MUL15(a[ 8], b[16])
 650                 + MUL15(a[ 9], b[15])
 651                 + MUL15(a[10], b[14])
 652                 + MUL15(a[11], b[13])
 653                 + MUL15(a[12], b[12])
 654                 + MUL15(a[13], b[11])
 655                 + MUL15(a[14], b[10])
 656                 + MUL15(a[15], b[ 9])
 657                 + MUL15(a[16], b[ 8])
 658                 + MUL15(a[17], b[ 7])
 659                 + MUL15(a[18], b[ 6])
 660                 + MUL15(a[19], b[ 5]);
 661         t[25] = MUL15(a[ 6], b[19])
 662                 + MUL15(a[ 7], b[18])
 663                 + MUL15(a[ 8], b[17])
 664                 + MUL15(a[ 9], b[16])
 665                 + MUL15(a[10], b[15])
 666                 + MUL15(a[11], b[14])
 667                 + MUL15(a[12], b[13])
 668                 + MUL15(a[13], b[12])
 669                 + MUL15(a[14], b[11])
 670                 + MUL15(a[15], b[10])
 671                 + MUL15(a[16], b[ 9])
 672                 + MUL15(a[17], b[ 8])
 673                 + MUL15(a[18], b[ 7])
 674                 + MUL15(a[19], b[ 6]);
 675         t[26] = MUL15(a[ 7], b[19])
 676                 + MUL15(a[ 8], b[18])
 677                 + MUL15(a[ 9], b[17])
 678                 + MUL15(a[10], b[16])
 679                 + MUL15(a[11], b[15])
 680                 + MUL15(a[12], b[14])
 681                 + MUL15(a[13], b[13])
 682                 + MUL15(a[14], b[12])
 683                 + MUL15(a[15], b[11])
 684                 + MUL15(a[16], b[10])
 685                 + MUL15(a[17], b[ 9])
 686                 + MUL15(a[18], b[ 8])
 687                 + MUL15(a[19], b[ 7]);
 688         t[27] = MUL15(a[ 8], b[19])
 689                 + MUL15(a[ 9], b[18])
 690                 + MUL15(a[10], b[17])
 691                 + MUL15(a[11], b[16])
 692                 + MUL15(a[12], b[15])
 693                 + MUL15(a[13], b[14])
 694                 + MUL15(a[14], b[13])
 695                 + MUL15(a[15], b[12])
 696                 + MUL15(a[16], b[11])
 697                 + MUL15(a[17], b[10])
 698                 + MUL15(a[18], b[ 9])
 699                 + MUL15(a[19], b[ 8]);
 700         t[28] = MUL15(a[ 9], b[19])
 701                 + MUL15(a[10], b[18])
 702                 + MUL15(a[11], b[17])
 703                 + MUL15(a[12], b[16])
 704                 + MUL15(a[13], b[15])
 705                 + MUL15(a[14], b[14])
 706                 + MUL15(a[15], b[13])
 707                 + MUL15(a[16], b[12])
 708                 + MUL15(a[17], b[11])
 709                 + MUL15(a[18], b[10])
 710                 + MUL15(a[19], b[ 9]);
 711         t[29] = MUL15(a[10], b[19])
 712                 + MUL15(a[11], b[18])
 713                 + MUL15(a[12], b[17])
 714                 + MUL15(a[13], b[16])
 715                 + MUL15(a[14], b[15])
 716                 + MUL15(a[15], b[14])
 717                 + MUL15(a[16], b[13])
 718                 + MUL15(a[17], b[12])
 719                 + MUL15(a[18], b[11])
 720                 + MUL15(a[19], b[10]);
 721         t[30] = MUL15(a[11], b[19])
 722                 + MUL15(a[12], b[18])
 723                 + MUL15(a[13], b[17])
 724                 + MUL15(a[14], b[16])
 725                 + MUL15(a[15], b[15])
 726                 + MUL15(a[16], b[14])
 727                 + MUL15(a[17], b[13])
 728                 + MUL15(a[18], b[12])
 729                 + MUL15(a[19], b[11]);
 730         t[31] = MUL15(a[12], b[19])
 731                 + MUL15(a[13], b[18])
 732                 + MUL15(a[14], b[17])
 733                 + MUL15(a[15], b[16])
 734                 + MUL15(a[16], b[15])
 735                 + MUL15(a[17], b[14])
 736                 + MUL15(a[18], b[13])
 737                 + MUL15(a[19], b[12]);
 738         t[32] = MUL15(a[13], b[19])
 739                 + MUL15(a[14], b[18])
 740                 + MUL15(a[15], b[17])
 741                 + MUL15(a[16], b[16])
 742                 + MUL15(a[17], b[15])
 743                 + MUL15(a[18], b[14])
 744                 + MUL15(a[19], b[13]);
 745         t[33] = MUL15(a[14], b[19])
 746                 + MUL15(a[15], b[18])
 747                 + MUL15(a[16], b[17])
 748                 + MUL15(a[17], b[16])
 749                 + MUL15(a[18], b[15])
 750                 + MUL15(a[19], b[14]);
 751         t[34] = MUL15(a[15], b[19])
 752                 + MUL15(a[16], b[18])
 753                 + MUL15(a[17], b[17])
 754                 + MUL15(a[18], b[16])
 755                 + MUL15(a[19], b[15]);
 756         t[35] = MUL15(a[16], b[19])
 757                 + MUL15(a[17], b[18])
 758                 + MUL15(a[18], b[17])
 759                 + MUL15(a[19], b[16]);
 760         t[36] = MUL15(a[17], b[19])
 761                 + MUL15(a[18], b[18])
 762                 + MUL15(a[19], b[17]);
 763         t[37] = MUL15(a[18], b[19])
 764                 + MUL15(a[19], b[18]);
 765         t[38] = MUL15(a[19], b[19]);
 766         d[39] = norm13(d, t, 39);
 767 }
 768
 769 static void
 770 square20(uint32_t *d, const uint32_t *a)
 771 {
 772         uint32_t t[39];
 773
 774         t[ 0] = MUL15(a[ 0], a[ 0]);
 775         t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
 776         t[ 2] = MUL15(a[ 1], a[ 1])
 777                 + ((MUL15(a[ 0], a[ 2])) << 1);
 778         t[ 3] = ((MUL15(a[ 0], a[ 3])
 779                 + MUL15(a[ 1], a[ 2])) << 1);
 780         t[ 4] = MUL15(a[ 2], a[ 2])
 781                 + ((MUL15(a[ 0], a[ 4])
 782                 + MUL15(a[ 1], a[ 3])) << 1);
 783         t[ 5] = ((MUL15(a[ 0], a[ 5])
 784                 + MUL15(a[ 1], a[ 4])
 785                 + MUL15(a[ 2], a[ 3])) << 1);
 786         t[ 6] = MUL15(a[ 3], a[ 3])
 787                 + ((MUL15(a[ 0], a[ 6])
 788                 + MUL15(a[ 1], a[ 5])
 789                 + MUL15(a[ 2], a[ 4])) << 1);
 790         t[ 7] = ((MUL15(a[ 0], a[ 7])
 791                 + MUL15(a[ 1], a[ 6])
 792                 + MUL15(a[ 2], a[ 5])
 793                 + MUL15(a[ 3], a[ 4])) << 1);
 794         t[ 8] = MUL15(a[ 4], a[ 4])
 795                 + ((MUL15(a[ 0], a[ 8])
 796                 + MUL15(a[ 1], a[ 7])
 797                 + MUL15(a[ 2], a[ 6])
 798                 + MUL15(a[ 3], a[ 5])) << 1);
 799         t[ 9] = ((MUL15(a[ 0], a[ 9])
 800                 + MUL15(a[ 1], a[ 8])
 801                 + MUL15(a[ 2], a[ 7])
 802                 + MUL15(a[ 3], a[ 6])
 803                 + MUL15(a[ 4], a[ 5])) << 1);
 804         t[10] = MUL15(a[ 5], a[ 5])
 805                 + ((MUL15(a[ 0], a[10])
 806                 + MUL15(a[ 1], a[ 9])
 807                 + MUL15(a[ 2], a[ 8])
 808                 + MUL15(a[ 3], a[ 7])
 809                 + MUL15(a[ 4], a[ 6])) << 1);
 810         t[11] = ((MUL15(a[ 0], a[11])
 811                 + MUL15(a[ 1], a[10])
 812                 + MUL15(a[ 2], a[ 9])
 813                 + MUL15(a[ 3], a[ 8])
 814                 + MUL15(a[ 4], a[ 7])
 815                 + MUL15(a[ 5], a[ 6])) << 1);
 816         t[12] = MUL15(a[ 6], a[ 6])
 817                 + ((MUL15(a[ 0], a[12])
 818                 + MUL15(a[ 1], a[11])
 819                 + MUL15(a[ 2], a[10])
 820                 + MUL15(a[ 3], a[ 9])
 821                 + MUL15(a[ 4], a[ 8])
 822                 + MUL15(a[ 5], a[ 7])) << 1);
 823         t[13] = ((MUL15(a[ 0], a[13])
 824                 + MUL15(a[ 1], a[12])
 825                 + MUL15(a[ 2], a[11])
 826                 + MUL15(a[ 3], a[10])
 827                 + MUL15(a[ 4], a[ 9])
 828                 + MUL15(a[ 5], a[ 8])
 829                 + MUL15(a[ 6], a[ 7])) << 1);
 830         t[14] = MUL15(a[ 7], a[ 7])
 831                 + ((MUL15(a[ 0], a[14])
 832                 + MUL15(a[ 1], a[13])
 833                 + MUL15(a[ 2], a[12])
 834                 + MUL15(a[ 3], a[11])
 835                 + MUL15(a[ 4], a[10])
 836                 + MUL15(a[ 5], a[ 9])
 837                 + MUL15(a[ 6], a[ 8])) << 1);
 838         t[15] = ((MUL15(a[ 0], a[15])
 839                 + MUL15(a[ 1], a[14])
 840                 + MUL15(a[ 2], a[13])
 841                 + MUL15(a[ 3], a[12])
 842                 + MUL15(a[ 4], a[11])
 843                 + MUL15(a[ 5], a[10])
 844                 + MUL15(a[ 6], a[ 9])
 845                 + MUL15(a[ 7], a[ 8])) << 1);
 846         t[16] = MUL15(a[ 8], a[ 8])
 847                 + ((MUL15(a[ 0], a[16])
 848                 + MUL15(a[ 1], a[15])
 849                 + MUL15(a[ 2], a[14])
 850                 + MUL15(a[ 3], a[13])
 851                 + MUL15(a[ 4], a[12])
 852                 + MUL15(a[ 5], a[11])
 853                 + MUL15(a[ 6], a[10])
 854                 + MUL15(a[ 7], a[ 9])) << 1);
 855         t[17] = ((MUL15(a[ 0], a[17])
 856                 + MUL15(a[ 1], a[16])
 857                 + MUL15(a[ 2], a[15])
 858                 + MUL15(a[ 3], a[14])
 859                 + MUL15(a[ 4], a[13])
 860                 + MUL15(a[ 5], a[12])
 861                 + MUL15(a[ 6], a[11])
 862                 + MUL15(a[ 7], a[10])
 863                 + MUL15(a[ 8], a[ 9])) << 1);
 864         t[18] = MUL15(a[ 9], a[ 9])
 865                 + ((MUL15(a[ 0], a[18])
 866                 + MUL15(a[ 1], a[17])
 867                 + MUL15(a[ 2], a[16])
 868                 + MUL15(a[ 3], a[15])
 869                 + MUL15(a[ 4], a[14])
 870                 + MUL15(a[ 5], a[13])
 871                 + MUL15(a[ 6], a[12])
 872                 + MUL15(a[ 7], a[11])
 873                 + MUL15(a[ 8], a[10])) << 1);
 874         t[19] = ((MUL15(a[ 0], a[19])
 875                 + MUL15(a[ 1], a[18])
 876                 + MUL15(a[ 2], a[17])
 877                 + MUL15(a[ 3], a[16])
 878                 + MUL15(a[ 4], a[15])
 879                 + MUL15(a[ 5], a[14])
 880                 + MUL15(a[ 6], a[13])
 881                 + MUL15(a[ 7], a[12])
 882                 + MUL15(a[ 8], a[11])
 883                 + MUL15(a[ 9], a[10])) << 1);
 884         t[20] = MUL15(a[10], a[10])
 885                 + ((MUL15(a[ 1], a[19])
 886                 + MUL15(a[ 2], a[18])
 887                 + MUL15(a[ 3], a[17])
 888                 + MUL15(a[ 4], a[16])
 889                 + MUL15(a[ 5], a[15])
 890                 + MUL15(a[ 6], a[14])
 891                 + MUL15(a[ 7], a[13])
 892                 + MUL15(a[ 8], a[12])
 893                 + MUL15(a[ 9], a[11])) << 1);
 894         t[21] = ((MUL15(a[ 2], a[19])
 895                 + MUL15(a[ 3], a[18])
 896                 + MUL15(a[ 4], a[17])
 897                 + MUL15(a[ 5], a[16])
 898                 + MUL15(a[ 6], a[15])
 899                 + MUL15(a[ 7], a[14])
 900                 + MUL15(a[ 8], a[13])
 901                 + MUL15(a[ 9], a[12])
 902                 + MUL15(a[10], a[11])) << 1);
 903         t[22] = MUL15(a[11], a[11])
 904                 + ((MUL15(a[ 3], a[19])
 905                 + MUL15(a[ 4], a[18])
 906                 + MUL15(a[ 5], a[17])
 907                 + MUL15(a[ 6], a[16])
 908                 + MUL15(a[ 7], a[15])
 909                 + MUL15(a[ 8], a[14])
 910                 + MUL15(a[ 9], a[13])
 911                 + MUL15(a[10], a[12])) << 1);
 912         t[23] = ((MUL15(a[ 4], a[19])
 913                 + MUL15(a[ 5], a[18])
 914                 + MUL15(a[ 6], a[17])
 915                 + MUL15(a[ 7], a[16])
 916                 + MUL15(a[ 8], a[15])
 917                 + MUL15(a[ 9], a[14])
 918                 + MUL15(a[10], a[13])
 919                 + MUL15(a[11], a[12])) << 1);
 920         t[24] = MUL15(a[12], a[12])
 921                 + ((MUL15(a[ 5], a[19])
 922                 + MUL15(a[ 6], a[18])
 923                 + MUL15(a[ 7], a[17])
 924                 + MUL15(a[ 8], a[16])
 925                 + MUL15(a[ 9], a[15])
 926                 + MUL15(a[10], a[14])
 927                 + MUL15(a[11], a[13])) << 1);
 928         t[25] = ((MUL15(a[ 6], a[19])
 929                 + MUL15(a[ 7], a[18])
 930                 + MUL15(a[ 8], a[17])
 931                 + MUL15(a[ 9], a[16])
 932                 + MUL15(a[10], a[15])
 933                 + MUL15(a[11], a[14])
 934                 + MUL15(a[12], a[13])) << 1);
 935         t[26] = MUL15(a[13], a[13])
 936                 + ((MUL15(a[ 7], a[19])
 937                 + MUL15(a[ 8], a[18])
 938                 + MUL15(a[ 9], a[17])
 939                 + MUL15(a[10], a[16])
 940                 + MUL15(a[11], a[15])
 941                 + MUL15(a[12], a[14])) << 1);
 942         t[27] = ((MUL15(a[ 8], a[19])
 943                 + MUL15(a[ 9], a[18])
 944                 + MUL15(a[10], a[17])
 945                 + MUL15(a[11], a[16])
 946                 + MUL15(a[12], a[15])
 947                 + MUL15(a[13], a[14])) << 1);
 948         t[28] = MUL15(a[14], a[14])
 949                 + ((MUL15(a[ 9], a[19])
 950                 + MUL15(a[10], a[18])
 951                 + MUL15(a[11], a[17])
 952                 + MUL15(a[12], a[16])
 953                 + MUL15(a[13], a[15])) << 1);
 954         t[29] = ((MUL15(a[10], a[19])
 955                 + MUL15(a[11], a[18])
 956                 + MUL15(a[12], a[17])
 957                 + MUL15(a[13], a[16])
 958                 + MUL15(a[14], a[15])) << 1);
 959         t[30] = MUL15(a[15], a[15])
 960                 + ((MUL15(a[11], a[19])
 961                 + MUL15(a[12], a[18])
 962                 + MUL15(a[13], a[17])
 963                 + MUL15(a[14], a[16])) << 1);
 964         t[31] = ((MUL15(a[12], a[19])
 965                 + MUL15(a[13], a[18])
 966                 + MUL15(a[14], a[17])
 967                 + MUL15(a[15], a[16])) << 1);
 968         t[32] = MUL15(a[16], a[16])
 969                 + ((MUL15(a[13], a[19])
 970                 + MUL15(a[14], a[18])
 971                 + MUL15(a[15], a[17])) << 1);
 972         t[33] = ((MUL15(a[14], a[19])
 973                 + MUL15(a[15], a[18])
 974                 + MUL15(a[16], a[17])) << 1);
 975         t[34] = MUL15(a[17], a[17])
 976                 + ((MUL15(a[15], a[19])
 977                 + MUL15(a[16], a[18])) << 1);
 978         t[35] = ((MUL15(a[16], a[19])
 979                 + MUL15(a[17], a[18])) << 1);
 980         t[36] = MUL15(a[18], a[18])
 981                 + ((MUL15(a[17], a[19])) << 1);
 982         t[37] = ((MUL15(a[18], a[19])) << 1);
 983         t[38] = MUL15(a[19], a[19]);
 984         d[39] = norm13(d, t, 39);
 985 }
 986
 987 #endif
 988
 989 /*
 990  * Modulus for field F256 (field for point coordinates in curve P-256).
 991  */
 992 static const uint32_t F256[] = {
 993         0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
 994         0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
 995         0x0000, 0x1FF8, 0x1FFF, 0x01FF
 996 };
 997
 998 /*
 999  * The 'b' curve equation coefficient for P-256.
1000  */
1001 static const uint32_t P256_B[] = {
1002         0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
1003         0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
1004         0x0A3A, 0x0EC5, 0x118D, 0x00B5
1005 };
1006
1007 /*
1008  * Perform a "short reduction" in field F256 (field for curve P-256).
1009  * The source value should be less than 262 bits; on output, it will
1010  * be at most 257 bits, and less than twice the modulus.
1011  */
1012 static void
1013 reduce_f256(uint32_t *d)
1014 {
1015         uint32_t x;
1016
1017         x = d[19] >> 9;
1018         d[19] &= 0x01FF;
1019         d[17] += x << 3;
1020         d[14] -= x << 10;
1021         d[7] -= x << 5;
1022         d[0] += x;
1023         norm13(d, d, 20);
1024 }
1025
1026 /*
1027  * Perform a "final reduction" in field F256 (field for curve P-256).
1028  * The source value must be less than twice the modulus. If the value
1029  * is not lower than the modulus, then the modulus is subtracted and
1030  * this function returns 1; otherwise, it leaves it untouched and it
1031  * returns 0.
1032  */
1033 static uint32_t
1034 reduce_final_f256(uint32_t *d)
1035 {
1036         uint32_t t[20];
1037         uint32_t cc;
1038         int i;
1039
1040         memcpy(t, d, sizeof t);
1041         cc = 0;
1042         for (i = 0; i < 20; i ++) {
1043                 uint32_t w;
1044
1045                 w = t[i] - F256[i] - cc;
1046                 cc = w >> 31;
1047                 t[i] = w & 0x1FFF;
1048         }
1049         cc ^= 1;
1050         CCOPY(cc, d, t, sizeof t);
1051         return cc;
1052 }
1053
1054 /*
1055  * Perform a multiplication of two integers modulo
1056  * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
1057  * of 20 words, each containing 13 bits of data, in little-endian order.
1058  * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
1059  * on output, value fits on 257 bits and is lower than twice the modulus.
1060  */
1061 static void
1062 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
1063 {
1064         uint32_t t[40], cc;
1065         int i;
1066
1067         /*
1068          * Compute raw multiplication. All result words fit in 13 bits
1069          * each.
1070          */
1071         mul20(t, a, b);
1072
1073         /*
1074          * Modular reduction: each high word in added/subtracted where
1075          * necessary.
1076          *
1077          * The modulus is:
1078          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1079          * Therefore:
1080          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1081          *
1082          * For a word x at bit offset n (n >= 256), we have:
1083          *    x*2^n = x*2^(n-32) - x*2^(n-64)
1084          *            - x*2^(n - 160) + x*2^(n-256) mod p
1085          *
1086          * Thus, we can nullify the high word if we reinject it at some
1087          * proper emplacements.
1088          */
1089         for (i = 39; i >= 20; i --) {
1090                 uint32_t x;
1091
1092                 x = t[i];
1093                 t[i - 2] += ARSH(x, 6);
1094                 t[i - 3] += (x << 7) & 0x1FFF;
1095                 t[i - 4] -= ARSH(x, 12);
1096                 t[i - 5] -= (x << 1) & 0x1FFF;
1097                 t[i - 12] -= ARSH(x, 4);
1098                 t[i - 13] -= (x << 9) & 0x1FFF;
1099                 t[i - 19] += ARSH(x, 9);
1100                 t[i - 20] += (x << 4) & 0x1FFF;
1101         }
1102
1103         /*
1104          * Propagate carries. This is a signed propagation, and the
1105          * result may be negative. The loop above may enlarge values,
1106          * but not two much: worst case is the chain involving t[i - 3],
1107          * in which a value may be added to itself up to 7 times. Since
1108          * starting values are 13-bit each, all words fit on 20 bits
1109          * (21 to account for the sign bit).
1110          */
1111         cc = norm13(t, t, 20);
1112
1113         /*
1114          * Perform modular reduction again for the bits beyond 256 (the carry
1115          * and the bits 256..259). Since the largest shift below is by 10
1116          * bits, and the values fit on 21 bits, values fit in 32-bit words,
1117          * thereby allowing injecting full word values.
1118          */
1119         cc = (cc << 4) | (t[19] >> 9);
1120         t[19] &= 0x01FF;
1121         t[17] += cc << 3;
1122         t[14] -= cc << 10;
1123         t[7] -= cc << 5;
1124         t[0] += cc;
1125         norm13(d, t, 20);
1126 }
1127
1128 /*
1129  * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
1130  * P-256). Operand is an array of 20 words, each containing 13 bits of
1131  * data, in little-endian order. On input, upper word may be up to 13
1132  * bits (hence value up to 2^260-1); on output, value fits on 257 bits
1133  * and is lower than twice the modulus.
1134  */
1135 static void
1136 square_f256(uint32_t *d, const uint32_t *a)
1137 {
1138         uint32_t t[40], cc;
1139         int i;
1140
1141         /*
1142          * Compute raw square. All result words fit in 13 bits each.
1143          */
1144         square20(t, a);
1145
1146         /*
1147          * Modular reduction: each high word in added/subtracted where
1148          * necessary.
1149          *
1150          * The modulus is:
1151          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1152          * Therefore:
1153          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1154          *
1155          * For a word x at bit offset n (n >= 256), we have:
1156          *    x*2^n = x*2^(n-32) - x*2^(n-64)
1157          *            - x*2^(n - 160) + x*2^(n-256) mod p
1158          *
1159          * Thus, we can nullify the high word if we reinject it at some
1160          * proper emplacements.
1161          */
1162         for (i = 39; i >= 20; i --) {
1163                 uint32_t x;
1164
1165                 x = t[i];
1166                 t[i - 2] += ARSH(x, 6);
1167                 t[i - 3] += (x << 7) & 0x1FFF;
1168                 t[i - 4] -= ARSH(x, 12);
1169                 t[i - 5] -= (x << 1) & 0x1FFF;
1170                 t[i - 12] -= ARSH(x, 4);
1171                 t[i - 13] -= (x << 9) & 0x1FFF;
1172                 t[i - 19] += ARSH(x, 9);
1173                 t[i - 20] += (x << 4) & 0x1FFF;
1174         }
1175
1176         /*
1177          * Propagate carries. This is a signed propagation, and the
1178          * result may be negative. The loop above may enlarge values,
1179          * but not two much: worst case is the chain involving t[i - 3],
1180          * in which a value may be added to itself up to 7 times. Since
1181          * starting values are 13-bit each, all words fit on 20 bits
1182          * (21 to account for the sign bit).
1183          */
1184         cc = norm13(t, t, 20);
1185
1186         /*
1187          * Perform modular reduction again for the bits beyond 256 (the carry
1188          * and the bits 256..259). Since the largest shift below is by 10
1189          * bits, and the values fit on 21 bits, values fit in 32-bit words,
1190          * thereby allowing injecting full word values.
1191          */
1192         cc = (cc << 4) | (t[19] >> 9);
1193         t[19] &= 0x01FF;
1194         t[17] += cc << 3;
1195         t[14] -= cc << 10;
1196         t[7] -= cc << 5;
1197         t[0] += cc;
1198         norm13(d, t, 20);
1199 }
1200
1201 /*
1202  * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
1203  * are such that:
1204  *   X = x / z^2
1205  *   Y = y / z^3
1206  * For the point at infinity, z = 0.
1207  * Each point thus admits many possible representations.
1208  *
1209  * Coordinates are represented in arrays of 32-bit integers, each holding
1210  * 13 bits of data. Values may also be slightly greater than the modulus,
1211  * but they will always be lower than twice the modulus.
1212  */
1213 typedef struct {
1214         uint32_t x[20];
1215         uint32_t y[20];
1216         uint32_t z[20];
1217 } p256_jacobian;
1218
1219 /*
1220  * Convert a point to affine coordinates:
1221  *  - If the point is the point at infinity, then all three coordinates
1222  *    are set to 0.
1223  *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
1224  *    coordinates are the 'X' and 'Y' affine coordinates.
1225  * The coordinates are guaranteed to be lower than the modulus.
1226  */
1227 static void
1228 p256_to_affine(p256_jacobian *P)
1229 {
1230         uint32_t t1[20], t2[20];
1231         int i;
1232
1233         /*
1234          * Invert z with a modular exponentiation: the modulus is
1235          * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
1236          * p-2. Exponent bit pattern (from high to low) is:
1237          *  - 32 bits of value 1
1238          *  - 31 bits of value 0
1239          *  - 1 bit of value 1
1240          *  - 96 bits of value 0
1241          *  - 94 bits of value 1
1242          *  - 1 bit of value 0
1243          *  - 1 bit of value 1
1244          * Thus, we precompute z^(2^31-1) to speed things up.
1245          *
1246          * If z = 0 (point at infinity) then the modular exponentiation
1247          * will yield 0, which leads to the expected result (all three
1248          * coordinates set to 0).
1249          */
1250
1251         /*
1252          * A simple square-and-multiply for z^(2^31-1). We could save about
1253          * two dozen multiplications here with an addition chain, but
1254          * this would require a bit more code, and extra stack buffers.
1255          */
1256         memcpy(t1, P->z, sizeof P->z);
1257         for (i = 0; i < 30; i ++) {
1258                 square_f256(t1, t1);
1259                 mul_f256(t1, t1, P->z);
1260         }
1261
1262         /*
1263          * Square-and-multiply. Apart from the squarings, we have a few
1264          * multiplications to set bits to 1; we multiply by the original z
1265          * for setting 1 bit, and by t1 for setting 31 bits.
1266          */
1267         memcpy(t2, P->z, sizeof P->z);
1268         for (i = 1; i < 256; i ++) {
1269                 square_f256(t2, t2);
1270                 switch (i) {
1271                 case 31:
1272                 case 190:
1273                 case 221:
1274                 case 252:
1275                         mul_f256(t2, t2, t1);
1276                         break;
1277                 case 63:
1278                 case 253:
1279                 case 255:
1280                         mul_f256(t2, t2, P->z);
1281                         break;
1282                 }
1283         }
1284
1285         /*
1286          * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
1287          */
1288         mul_f256(t1, t2, t2);
1289         mul_f256(P->x, t1, P->x);
1290         mul_f256(t1, t1, t2);
1291         mul_f256(P->y, t1, P->y);
1292         reduce_final_f256(P->x);
1293         reduce_final_f256(P->y);
1294
1295         /*
1296          * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
1297          * this will set z to 1.
1298          */
1299         mul_f256(P->z, P->z, t2);
1300         reduce_final_f256(P->z);
1301 }
1302
1303 /*
1304  * Double a point in P-256. This function works for all valid points,
1305  * including the point at infinity.
1306  */
1307 static void
1308 p256_double(p256_jacobian *Q)
1309 {
1310         /*
1311          * Doubling formulas are:
1312          *
1313          *   s = 4*x*y^2
1314          *   m = 3*(x + z^2)*(x - z^2)
1315          *   x' = m^2 - 2*s
1316          *   y' = m*(s - x') - 8*y^4
1317          *   z' = 2*y*z
1318          *
1319          * These formulas work for all points, including points of order 2
1320          * and points at infinity:
1321          *   - If y = 0 then z' = 0. But there is no such point in P-256
1322          *     anyway.
1323          *   - If z = 0 then z' = 0.
1324          */
1325         uint32_t t1[20], t2[20], t3[20], t4[20];
1326         int i;
1327
1328         /*
1329          * Compute z^2 in t1.
1330          */
1331         square_f256(t1, Q->z);
1332
1333         /*
1334          * Compute x-z^2 in t2 and x+z^2 in t1.
1335          */
1336         for (i = 0; i < 20; i ++) {
1337                 t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
1338                 t1[i] += Q->x[i];
1339         }
1340         norm13(t1, t1, 20);
1341         norm13(t2, t2, 20);
1342
1343         /*
1344          * Compute 3*(x+z^2)*(x-z^2) in t1.
1345          */
1346         mul_f256(t3, t1, t2);
1347         for (i = 0; i < 20; i ++) {
1348                 t1[i] = MUL15(3, t3[i]);
1349         }
1350         norm13(t1, t1, 20);
1351
1352         /*
1353          * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
1354          */
1355         square_f256(t3, Q->y);
1356         for (i = 0; i < 20; i ++) {
1357                 t3[i] <<= 1;
1358         }
1359         norm13(t3, t3, 20);
1360         mul_f256(t2, Q->x, t3);
1361         for (i = 0; i < 20; i ++) {
1362                 t2[i] <<= 1;
1363         }
1364         norm13(t2, t2, 20);
1365         reduce_f256(t2);
1366
1367         /*
1368          * Compute x' = m^2 - 2*s.
1369          */
1370         square_f256(Q->x, t1);
1371         for (i = 0; i < 20; i ++) {
1372                 Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
1373         }
1374         norm13(Q->x, Q->x, 20);
1375         reduce_f256(Q->x);
1376
1377         /*
1378          * Compute z' = 2*y*z.
1379          */
1380         mul_f256(t4, Q->y, Q->z);
1381         for (i = 0; i < 20; i ++) {
1382                 Q->z[i] = t4[i] << 1;
1383         }
1384         norm13(Q->z, Q->z, 20);
1385         reduce_f256(Q->z);
1386
1387         /*
1388          * Compute y' = m*(s - x') - 8*y^4. Note that we already have
1389          * 2*y^2 in t3.
1390          */
1391         for (i = 0; i < 20; i ++) {
1392                 t2[i] += (F256[i] << 1) - Q->x[i];
1393         }
1394         norm13(t2, t2, 20);
1395         mul_f256(Q->y, t1, t2);
1396         square_f256(t4, t3);
1397         for (i = 0; i < 20; i ++) {
1398                 Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
1399         }
1400         norm13(Q->y, Q->y, 20);
1401         reduce_f256(Q->y);
1402 }
1403
1404 /*
1405  * Add point P2 to point P1.
1406  *
1407  * This function computes the wrong result in the following cases:
1408  *
1409  *   - If P1 == 0 but P2 != 0
1410  *   - If P1 != 0 but P2 == 0
1411  *   - If P1 == P2
1412  *
1413  * In all three cases, P1 is set to the point at infinity.
1414  *
1415  * Returned value is 0 if one of the following occurs:
1416  *
1417  *   - P1 and P2 have the same Y coordinate
1418  *   - P1 == 0 and P2 == 0
1419  *   - The Y coordinate of one of the points is 0 and the other point is
1420  *     the point at infinity.
1421  *
1422  * The third case cannot actually happen with valid points, since a point
1423  * with Y == 0 is a point of order 2, and there is no point of order 2 on
1424  * curve P-256.
1425  *
1426  * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
1427  * can apply the following:
1428  *
1429  *   - If the result is not the point at infinity, then it is correct.
1430  *   - Otherwise, if the returned value is 1, then this is a case of
1431  *     P1+P2 == 0, so the result is indeed the point at infinity.
1432  *   - Otherwise, P1 == P2, so a "double" operation should have been
1433  *     performed.
1434  */
1435 static uint32_t
1436 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
1437 {
1438         /*
1439          * Addtions formulas are:
1440          *
1441          *   u1 = x1 * z2^2
1442          *   u2 = x2 * z1^2
1443          *   s1 = y1 * z2^3
1444          *   s2 = y2 * z1^3
1445          *   h = u2 - u1
1446          *   r = s2 - s1
1447          *   x3 = r^2 - h^3 - 2 * u1 * h^2
1448          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1449          *   z3 = h * z1 * z2
1450          */
1451         uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1452         uint32_t ret;
1453         int i;
1454
1455         /*
1456          * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
1457          */
1458         square_f256(t3, P2->z);
1459         mul_f256(t1, P1->x, t3);
1460         mul_f256(t4, P2->z, t3);
1461         mul_f256(t3, P1->y, t4);
1462
1463         /*
1464          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1465          */
1466         square_f256(t4, P1->z);
1467         mul_f256(t2, P2->x, t4);
1468         mul_f256(t5, P1->z, t4);
1469         mul_f256(t4, P2->y, t5);
1470
1471         /*
1472          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1473          * We need to test whether r is zero, so we will do some extra
1474          * reduce.
1475          */
1476         for (i = 0; i < 20; i ++) {
1477                 t2[i] += (F256[i] << 1) - t1[i];
1478                 t4[i] += (F256[i] << 1) - t3[i];
1479         }
1480         norm13(t2, t2, 20);
1481         norm13(t4, t4, 20);
1482         reduce_f256(t4);
1483         reduce_final_f256(t4);
1484         ret = 0;
1485         for (i = 0; i < 20; i ++) {
1486                 ret |= t4[i];
1487         }
1488         ret = (ret | -ret) >> 31;
1489
1490         /*
1491          * Compute u1*h^2 (in t6) and h^3 (in t5);
1492          */
1493         square_f256(t7, t2);
1494         mul_f256(t6, t1, t7);
1495         mul_f256(t5, t7, t2);
1496
1497         /*
1498          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1499          */
1500         square_f256(P1->x, t4);
1501         for (i = 0; i < 20; i ++) {
1502                 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1503         }
1504         norm13(P1->x, P1->x, 20);
1505         reduce_f256(P1->x);
1506
1507         /*
1508          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1509          */
1510         for (i = 0; i < 20; i ++) {
1511                 t6[i] += (F256[i] << 1) - P1->x[i];
1512         }
1513         norm13(t6, t6, 20);
1514         mul_f256(P1->y, t4, t6);
1515         mul_f256(t1, t5, t3);
1516         for (i = 0; i < 20; i ++) {
1517                 P1->y[i] += (F256[i] << 1) - t1[i];
1518         }
1519         norm13(P1->y, P1->y, 20);
1520         reduce_f256(P1->y);
1521
1522         /*
1523          * Compute z3 = h*z1*z2.
1524          */
1525         mul_f256(t1, P1->z, P2->z);
1526         mul_f256(P1->z, t1, t2);
1527
1528         return ret;
1529 }
1530
1531 /*
1532  * Add point P2 to point P1. This is a specialised function for the
1533  * case when P2 is a non-zero point in affine coordinate.
1534  *
1535  * This function computes the wrong result in the following cases:
1536  *
1537  *   - If P1 == 0
1538  *   - If P1 == P2
1539  *
1540  * In both cases, P1 is set to the point at infinity.
1541  *
1542  * Returned value is 0 if one of the following occurs:
1543  *
1544  *   - P1 and P2 have the same Y coordinate
1545  *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
1546  *
1547  * The second case cannot actually happen with valid points, since a point
1548  * with Y == 0 is a point of order 2, and there is no point of order 2 on
1549  * curve P-256.
1550  *
1551  * Therefore, assuming that P1 != 0 on input, then the caller
1552  * can apply the following:
1553  *
1554  *   - If the result is not the point at infinity, then it is correct.
1555  *   - Otherwise, if the returned value is 1, then this is a case of
1556  *     P1+P2 == 0, so the result is indeed the point at infinity.
1557  *   - Otherwise, P1 == P2, so a "double" operation should have been
1558  *     performed.
1559  */
1560 static uint32_t
1561 p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
1562 {
1563         /*
1564          * Addtions formulas are:
1565          *
1566          *   u1 = x1
1567          *   u2 = x2 * z1^2
1568          *   s1 = y1
1569          *   s2 = y2 * z1^3
1570          *   h = u2 - u1
1571          *   r = s2 - s1
1572          *   x3 = r^2 - h^3 - 2 * u1 * h^2
1573          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1574          *   z3 = h * z1
1575          */
1576         uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1577         uint32_t ret;
1578         int i;
1579
1580         /*
1581          * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
1582          */
1583         memcpy(t1, P1->x, sizeof t1);
1584         memcpy(t3, P1->y, sizeof t3);
1585
1586         /*
1587          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1588          */
1589         square_f256(t4, P1->z);
1590         mul_f256(t2, P2->x, t4);
1591         mul_f256(t5, P1->z, t4);
1592         mul_f256(t4, P2->y, t5);
1593
1594         /*
1595          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1596          * We need to test whether r is zero, so we will do some extra
1597          * reduce.
1598          */
1599         for (i = 0; i < 20; i ++) {
1600                 t2[i] += (F256[i] << 1) - t1[i];
1601                 t4[i] += (F256[i] << 1) - t3[i];
1602         }
1603         norm13(t2, t2, 20);
1604         norm13(t4, t4, 20);
1605         reduce_f256(t4);
1606         reduce_final_f256(t4);
1607         ret = 0;
1608         for (i = 0; i < 20; i ++) {
1609                 ret |= t4[i];
1610         }
1611         ret = (ret | -ret) >> 31;
1612
1613         /*
1614          * Compute u1*h^2 (in t6) and h^3 (in t5);
1615          */
1616         square_f256(t7, t2);
1617         mul_f256(t6, t1, t7);
1618         mul_f256(t5, t7, t2);
1619
1620         /*
1621          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1622          */
1623         square_f256(P1->x, t4);
1624         for (i = 0; i < 20; i ++) {
1625                 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1626         }
1627         norm13(P1->x, P1->x, 20);
1628         reduce_f256(P1->x);
1629
1630         /*
1631          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1632          */
1633         for (i = 0; i < 20; i ++) {
1634                 t6[i] += (F256[i] << 1) - P1->x[i];
1635         }
1636         norm13(t6, t6, 20);
1637         mul_f256(P1->y, t4, t6);
1638         mul_f256(t1, t5, t3);
1639         for (i = 0; i < 20; i ++) {
1640                 P1->y[i] += (F256[i] << 1) - t1[i];
1641         }
1642         norm13(P1->y, P1->y, 20);
1643         reduce_f256(P1->y);
1644
1645         /*
1646          * Compute z3 = h*z1*z2.
1647          */
1648         mul_f256(P1->z, P1->z, t2);
1649
1650         return ret;
1651 }
1652
1653 /*
1654  * Decode a P-256 point. This function does not support the point at
1655  * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1656  */
1657 static uint32_t
1658 p256_decode(p256_jacobian *P, const void *src, size_t len)
1659 {
1660         const unsigned char *buf;
1661         uint32_t tx[20], ty[20], t1[20], t2[20];
1662         uint32_t bad;
1663         int i;
1664
1665         if (len != 65) {
1666                 return 0;
1667         }
1668         buf = src;
1669
1670         /*
1671          * First byte must be 0x04 (uncompressed format). We could support
1672          * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1673          * least significant bit of the Y coordinate), but it is explicitly
1674          * forbidden by RFC 5480 (section 2.2).
1675          */
1676         bad = NEQ(buf[0], 0x04);
1677
1678         /*
1679          * Decode the coordinates, and check that they are both lower
1680          * than the modulus.
1681          */
1682         tx[19] = be8_to_le13(tx, buf + 1, 32);
1683         ty[19] = be8_to_le13(ty, buf + 33, 32);
1684         bad |= reduce_final_f256(tx);
1685         bad |= reduce_final_f256(ty);
1686
1687         /*
1688          * Check curve equation.
1689          */
1690         square_f256(t1, tx);
1691         mul_f256(t1, tx, t1);
1692         square_f256(t2, ty);
1693         for (i = 0; i < 20; i ++) {
1694                 t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
1695         }
1696         norm13(t1, t1, 20);
1697         reduce_f256(t1);
1698         reduce_final_f256(t1);
1699         for (i = 0; i < 20; i ++) {
1700                 bad |= t1[i];
1701         }
1702
1703         /*
1704          * Copy coordinates to the point structure.
1705          */
1706         memcpy(P->x, tx, sizeof tx);
1707         memcpy(P->y, ty, sizeof ty);
1708         memset(P->z, 0, sizeof P->z);
1709         P->z[0] = 1;
1710         return NEQ(bad, 0) ^ 1;
1711 }
1712
1713 /*
1714  * Encode a point into a buffer. This function assumes that the point is
1715  * valid, in affine coordinates, and not the point at infinity.
1716  */
1717 static void
1718 p256_encode(void *dst, const p256_jacobian *P)
1719 {
1720         unsigned char *buf;
1721
1722         buf = dst;
1723         buf[0] = 0x04;
1724         le13_to_be8(buf + 1, 32, P->x);
1725         le13_to_be8(buf + 33, 32, P->y);
1726 }
1727
1728 /*
1729  * Multiply a curve point by an integer. The integer is assumed to be
1730  * lower than the curve order, and the base point must not be the point
1731  * at infinity.
1732  */
1733 static void
1734 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1735 {
1736         /*
1737          * qz is a flag that is initially 1, and remains equal to 1
1738          * as long as the point is the point at infinity.
1739          *
1740          * We use a 2-bit window to handle multiplier bits by pairs.
1741          * The precomputed window really is the points P2 and P3.
1742          */
1743         uint32_t qz;
1744         p256_jacobian P2, P3, Q, T, U;
1745
1746         /*
1747          * Compute window values.
1748          */
1749         P2 = *P;
1750         p256_double(&P2);
1751         P3 = *P;
1752         p256_add(&P3, &P2);
1753
1754         /*
1755          * We start with Q = 0. We process multiplier bits 2 by 2.
1756          */
1757         memset(&Q, 0, sizeof Q);
1758         qz = 1;
1759         while (xlen -- > 0) {
1760                 int k;
1761
1762                 for (k = 6; k >= 0; k -= 2) {
1763                         uint32_t bits;
1764                         uint32_t bnz;
1765
1766                         p256_double(&Q);
1767                         p256_double(&Q);
1768                         T = *P;
1769                         U = Q;
1770                         bits = (*x >> k) & (uint32_t)3;
1771                         bnz = NEQ(bits, 0);
1772                         CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1773                         CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1774                         p256_add(&U, &T);
1775                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
1776                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1777                         qz &= ~bnz;
1778                 }
1779                 x ++;
1780         }
1781         *P = Q;
1782 }
1783
1784 /*
1785  * Precomputed window: k*G points, where G is the curve generator, and k
1786  * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1787  * the point are encoded as 20 words of 13 bits each (little-endian
1788  * order); 13-bit words are then grouped 2-by-2 into 32-bit words
1789  * (little-endian order within each word).
1790  */
1791 static const uint32_t Gwin[15][20] = {
1792
1793         { 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC,
1794           0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2,
1795           0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D,
1796           0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785,
1797           0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 },
1798
1799         { 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8,
1800           0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29,
1801           0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748,
1802           0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36,
1803           0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 },
1804
1805         { 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6,
1806           0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323,
1807           0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8,
1808           0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB,
1809           0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 },
1810
1811         { 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156,
1812           0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80,
1813           0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6,
1814           0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D,
1815           0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 },
1816
1817         { 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F,
1818           0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E,
1819           0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F,
1820           0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49,
1821           0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F },
1822
1823         { 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8,
1824           0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B,
1825           0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3,
1826           0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE,
1827           0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 },
1828
1829         { 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F,
1830           0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896,
1831           0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0,
1832           0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400,
1833           0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 },
1834
1835         { 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A,
1836           0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01,
1837           0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83,
1838           0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8,
1839           0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 },
1840
1841         { 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7,
1842           0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E,
1843           0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293,
1844           0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832,
1845           0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 },
1846
1847         { 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120,
1848           0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964,
1849           0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91,
1850           0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304,
1851           0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 },
1852
1853         { 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541,
1854           0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418,
1855           0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A,
1856           0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2,
1857           0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 },
1858
1859         { 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0,
1860           0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918,
1861           0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3,
1862           0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55,
1863           0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D },
1864
1865         { 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB,
1866           0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986,
1867           0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB,
1868           0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C,
1869           0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 },
1870
1871         { 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A,
1872           0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9,
1873           0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8,
1874           0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0,
1875           0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C },
1876
1877         { 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8,
1878           0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7,
1879           0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783,
1880           0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9,
1881           0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F }
1882 };
1883
1884 /*
1885  * Lookup one of the Gwin[] values, by index. This is constant-time.
1886  */
1887 static void
1888 lookup_Gwin(p256_jacobian *T, uint32_t idx)
1889 {
1890         uint32_t xy[20];
1891         uint32_t k;
1892         size_t u;
1893
1894         memset(xy, 0, sizeof xy);
1895         for (k = 0; k < 15; k ++) {
1896                 uint32_t m;
1897
1898                 m = -EQ(idx, k + 1);
1899                 for (u = 0; u < 20; u ++) {
1900                         xy[u] |= m & Gwin[k][u];
1901                 }
1902         }
1903         for (u = 0; u < 10; u ++) {
1904                 T->x[(u << 1) + 0] = xy[u] & 0xFFFF;
1905                 T->x[(u << 1) + 1] = xy[u] >> 16;
1906                 T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF;
1907                 T->y[(u << 1) + 1] = xy[u + 10] >> 16;
1908         }
1909         memset(T->z, 0, sizeof T->z);
1910         T->z[0] = 1;
1911 }
1912
1913 /*
1914  * Multiply the generator by an integer. The integer is assumed non-zero
1915  * and lower than the curve order.
1916  */
1917 static void
1918 p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1919 {
1920         /*
1921          * qz is a flag that is initially 1, and remains equal to 1
1922          * as long as the point is the point at infinity.
1923          *
1924          * We use a 4-bit window to handle multiplier bits by groups
1925          * of 4. The precomputed window is constant static data, with
1926          * points in affine coordinates; we use a constant-time lookup.
1927          */
1928         p256_jacobian Q;
1929         uint32_t qz;
1930
1931         memset(&Q, 0, sizeof Q);
1932         qz = 1;
1933         while (xlen -- > 0) {
1934                 int k;
1935                 unsigned bx;
1936
1937                 bx = *x ++;
1938                 for (k = 0; k < 2; k ++) {
1939                         uint32_t bits;
1940                         uint32_t bnz;
1941                         p256_jacobian T, U;
1942
1943                         p256_double(&Q);
1944                         p256_double(&Q);
1945                         p256_double(&Q);
1946                         p256_double(&Q);
1947                         bits = (bx >> 4) & 0x0F;
1948                         bnz = NEQ(bits, 0);
1949                         lookup_Gwin(&T, bits);
1950                         U = Q;
1951                         p256_add_mixed(&U, &T);
1952                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
1953                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1954                         qz &= ~bnz;
1955                         bx <<= 4;
1956                 }
1957         }
1958         *P = Q;
1959 }
1960
1961 static const unsigned char P256_G[] = {
1962         0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1963         0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1964         0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1965         0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1966         0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1967         0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1968         0x68, 0x37, 0xBF, 0x51, 0xF5
1969 };
1970
1971 static const unsigned char P256_N[] = {
1972         0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1973         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1974         0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1975         0x25, 0x51
1976 };
1977
1978 static const unsigned char *
1979 api_generator(int curve, size_t *len)
1980 {
1981         (void)curve;
1982         *len = sizeof P256_G;
1983         return P256_G;
1984 }
1985
1986 static const unsigned char *
1987 api_order(int curve, size_t *len)
1988 {
1989         (void)curve;
1990         *len = sizeof P256_N;
1991         return P256_N;
1992 }
1993
1994 static size_t
1995 api_xoff(int curve, size_t *len)
1996 {
1997         (void)curve;
1998         *len = 32;
1999         return 1;
2000 }
2001
2002 static uint32_t
2003 api_mul(unsigned char *G, size_t Glen,
2004         const unsigned char *x, size_t xlen, int curve)
2005 {
2006         uint32_t r;
2007         p256_jacobian P;
2008
2009         (void)curve;
2010         r = p256_decode(&P, G, Glen);
2011         p256_mul(&P, x, xlen);
2012         if (Glen >= 65) {
2013                 p256_to_affine(&P);
2014                 p256_encode(G, &P);
2015         }
2016         return r;
2017 }
2018
2019 static size_t
2020 api_mulgen(unsigned char *R,
2021         const unsigned char *x, size_t xlen, int curve)
2022 {
2023         p256_jacobian P;
2024
2025         (void)curve;
2026         p256_mulgen(&P, x, xlen);
2027         p256_to_affine(&P);
2028         p256_encode(R, &P);
2029         return 65;
2030
2031         /*
2032         const unsigned char *G;
2033         size_t Glen;
2034
2035         G = api_generator(curve, &Glen);
2036         memcpy(R, G, Glen);
2037         api_mul(R, Glen, x, xlen, curve);
2038         return Glen;
2039         */
2040 }
2041
2042 static uint32_t
2043 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
2044         const unsigned char *x, size_t xlen,
2045         const unsigned char *y, size_t ylen, int curve)
2046 {
2047         p256_jacobian P, Q;
2048         uint32_t r, t, z;
2049         int i;
2050
2051         (void)curve;
2052         r = p256_decode(&P, A, len);
2053         p256_mul(&P, x, xlen);
2054         if (B == NULL) {
2055                 p256_mulgen(&Q, y, ylen);
2056         } else {
2057                 r &= p256_decode(&Q, B, len);
2058                 p256_mul(&Q, y, ylen);
2059         }
2060
2061         /*
2062          * The final addition may fail in case both points are equal.
2063          */
2064         t = p256_add(&P, &Q);
2065         reduce_final_f256(P.z);
2066         z = 0;
2067         for (i = 0; i < 20; i ++) {
2068                 z |= P.z[i];
2069         }
2070         z = EQ(z, 0);
2071         p256_double(&Q);
2072
2073         /*
2074          * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
2075          * have the following:
2076          *
2077          *   z = 0, t = 0   return P (normal addition)
2078          *   z = 0, t = 1   return P (normal addition)
2079          *   z = 1, t = 0   return Q (a 'double' case)
2080          *   z = 1, t = 1   report an error (P+Q = 0)
2081          */
2082         CCOPY(z & ~t, &P, &Q, sizeof Q);
2083         p256_to_affine(&P);
2084         p256_encode(A, &P);
2085         r &= ~(z & t);
2086         return r;
2087 }
2088
2089 /* see bearssl_ec.h */
2090 const br_ec_impl br_ec_p256_m15 = {
2091         (uint32_t)0x00800000,
2092         &api_generator,
2093         &api_order,
2094         &api_xoff,
2095         &api_mul,
2096         &api_mulgen,
2097         &api_muladd
2098 };