_ Git - BearSSL/blob - src/ec/ec_p256_m15.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
  29  * that right-shifting a signed negative integer copies the sign bit
  30  * (arithmetic right-shift). This is "implementation-defined behaviour",
  31  * i.e. it is not undefined, but it may differ between compilers. Each
  32  * compiler is supposed to document its behaviour in that respect. GCC
  33  * explicitly defines that an arithmetic right shift is used. We expect
  34  * all other compilers to do the same, because underlying CPU offer an
  35  * arithmetic right shift opcode that could not be used otherwise.
  36  */
  37 #if BR_NO_ARITH_SHIFT
  38 #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
  39                     | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
  40 #else
  41 #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
  42 #endif
  43
  44 /*
  45  * Convert an integer from unsigned big-endian encoding to a sequence of
  46  * 13-bit words in little-endian order. The final "partial" word is
  47  * returned.
  48  */
  49 static uint32_t
  50 be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
  51 {
  52         uint32_t acc;
  53         int acc_len;
  54
  55         acc = 0;
  56         acc_len = 0;
  57         while (len -- > 0) {
  58                 acc |= (uint32_t)src[len] << acc_len;
  59                 acc_len += 8;
  60                 if (acc_len >= 13) {
  61                         *dst ++ = acc & 0x1FFF;
  62                         acc >>= 13;
  63                         acc_len -= 13;
  64                 }
  65         }
  66         return acc;
  67 }
  68
  69 /*
  70  * Convert an integer (13-bit words, little-endian) to unsigned
  71  * big-endian encoding. The total encoding length is provided; all
  72  * the destination bytes will be filled.
  73  */
  74 static void
  75 le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
  76 {
  77         uint32_t acc;
  78         int acc_len;
  79
  80         acc = 0;
  81         acc_len = 0;
  82         while (len -- > 0) {
  83                 if (acc_len < 8) {
  84                         acc |= (*src ++) << acc_len;
  85                         acc_len += 13;
  86                 }
  87                 dst[len] = (unsigned char)acc;
  88                 acc >>= 8;
  89                 acc_len -= 8;
  90         }
  91 }
  92
  93 /*
  94  * Normalise an array of words to a strict 13 bits per word. Returned
  95  * value is the resulting carry. The source (w) and destination (d)
  96  * arrays may be identical, but shall not overlap partially.
  97  */
  98 static inline uint32_t
  99 norm13(uint32_t *d, const uint32_t *w, size_t len)
 100 {
 101         size_t u;
 102         uint32_t cc;
 103
 104         cc = 0;
 105         for (u = 0; u < len; u ++) {
 106                 int32_t z;
 107
 108                 z = w[u] + cc;
 109                 d[u] = z & 0x1FFF;
 110                 cc = ARSH(z, 13);
 111         }
 112         return cc;
 113 }
 114
 115 /*
 116  * mul20() multiplies two 260-bit integers together. Each word must fit
 117  * on 13 bits; source operands use 20 words, destination operand
 118  * receives 40 words. All overlaps allowed.
 119  *
 120  * square20() computes the square of a 260-bit integer. Each word must
 121  * fit on 13 bits; source operand uses 20 words, destination operand
 122  * receives 40 words. All overlaps allowed.
 123  */
 124
 125 #if BR_SLOW_MUL15
 126
 127 static void
 128 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
 129 {
 130         /*
 131          * Two-level Karatsuba: turns a 20x20 multiplication into
 132          * nine 5x5 multiplications. We use 13-bit words but do not
 133          * propagate carries immediately, so words may expand:
 134          *
 135          *  - First Karatsuba decomposition turns the 20x20 mul on
 136          *    13-bit words into three 10x10 muls, two on 13-bit words
 137          *    and one on 14-bit words.
 138          *
 139          *  - Second Karatsuba decomposition further splits these into:
 140          *
 141          *     * four 5x5 muls on 13-bit words
 142          *     * four 5x5 muls on 14-bit words
 143          *     * one 5x5 mul on 15-bit words
 144          *
 145          * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
 146          * or 15-bit words, respectively.
 147          */
 148         uint32_t u[45], v[45], w[90];
 149         uint32_t cc;
 150         int i;
 151
 152 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
 153                 (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
 154                         + (s2w)[5 * (s2_off) + 0]; \
 155                 (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
 156                         + (s2w)[5 * (s2_off) + 1]; \
 157                 (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
 158                         + (s2w)[5 * (s2_off) + 2]; \
 159                 (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
 160                         + (s2w)[5 * (s2_off) + 3]; \
 161                 (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
 162                         + (s2w)[5 * (s2_off) + 4]; \
 163         } while (0)
 164
 165 #define ZADDT(dw, d_off, sw, s_off)   do { \
 166                 (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
 167                 (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
 168                 (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
 169                 (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
 170                 (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
 171         } while (0)
 172
 173 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
 174                 (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
 175                         + (s2w)[5 * (s2_off) + 0]; \
 176                 (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
 177                         + (s2w)[5 * (s2_off) + 1]; \
 178                 (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
 179                         + (s2w)[5 * (s2_off) + 2]; \
 180                 (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
 181                         + (s2w)[5 * (s2_off) + 3]; \
 182                 (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
 183                         + (s2w)[5 * (s2_off) + 4]; \
 184         } while (0)
 185
 186 #define CPR1(w, cprcc)   do { \
 187                 uint32_t cprz = (w) + cprcc; \
 188                 (w) = cprz & 0x1FFF; \
 189                 cprcc = cprz >> 13; \
 190         } while (0)
 191
 192 #define CPR(dw, d_off)   do { \
 193                 uint32_t cprcc; \
 194                 cprcc = 0; \
 195                 CPR1((dw)[(d_off) + 0], cprcc); \
 196                 CPR1((dw)[(d_off) + 1], cprcc); \
 197                 CPR1((dw)[(d_off) + 2], cprcc); \
 198                 CPR1((dw)[(d_off) + 3], cprcc); \
 199                 CPR1((dw)[(d_off) + 4], cprcc); \
 200                 CPR1((dw)[(d_off) + 5], cprcc); \
 201                 CPR1((dw)[(d_off) + 6], cprcc); \
 202                 CPR1((dw)[(d_off) + 7], cprcc); \
 203                 CPR1((dw)[(d_off) + 8], cprcc); \
 204                 (dw)[(d_off) + 9] = cprcc; \
 205         } while (0)
 206
 207         memcpy(u, a, 20 * sizeof *a);
 208         ZADD(u, 4, a, 0, a, 1);
 209         ZADD(u, 5, a, 2, a, 3);
 210         ZADD(u, 6, a, 0, a, 2);
 211         ZADD(u, 7, a, 1, a, 3);
 212         ZADD(u, 8, u, 6, u, 7);
 213
 214         memcpy(v, b, 20 * sizeof *b);
 215         ZADD(v, 4, b, 0, b, 1);
 216         ZADD(v, 5, b, 2, b, 3);
 217         ZADD(v, 6, b, 0, b, 2);
 218         ZADD(v, 7, b, 1, b, 3);
 219         ZADD(v, 8, v, 6, v, 7);
 220
 221         /*
 222          * Do the eight first 8x8 muls. Source words are at most 16382
 223          * each, so we can add product results together "as is" in 32-bit
 224          * words.
 225          */
 226         for (i = 0; i < 40; i += 5) {
 227                 w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
 228                 w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
 229                         + MUL15(u[i + 1], v[i + 0]);
 230                 w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
 231                         + MUL15(u[i + 1], v[i + 1])
 232                         + MUL15(u[i + 2], v[i + 0]);
 233                 w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
 234                         + MUL15(u[i + 1], v[i + 2])
 235                         + MUL15(u[i + 2], v[i + 1])
 236                         + MUL15(u[i + 3], v[i + 0]);
 237                 w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
 238                         + MUL15(u[i + 1], v[i + 3])
 239                         + MUL15(u[i + 2], v[i + 2])
 240                         + MUL15(u[i + 3], v[i + 1])
 241                         + MUL15(u[i + 4], v[i + 0]);
 242                 w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
 243                         + MUL15(u[i + 2], v[i + 3])
 244                         + MUL15(u[i + 3], v[i + 2])
 245                         + MUL15(u[i + 4], v[i + 1]);
 246                 w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
 247                         + MUL15(u[i + 3], v[i + 3])
 248                         + MUL15(u[i + 4], v[i + 2]);
 249                 w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
 250                         + MUL15(u[i + 4], v[i + 3]);
 251                 w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
 252                 w[(i << 1) + 9] = 0;
 253         }
 254
 255         /*
 256          * For the 9th multiplication, source words are up to 32764,
 257          * so we must do some carry propagation. If we add up to
 258          * 4 products and the carry is no more than 524224, then the
 259          * result fits in 32 bits, and the next carry will be no more
 260          * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
 261          *
 262          * We thus just skip one of the products in the middle word,
 263          * then do a carry propagation (this reduces words to 13 bits
 264          * each, except possibly the last, which may use up to 17 bits
 265          * or so), then add the missing product.
 266          */
 267         w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
 268         w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
 269                 + MUL15(u[40 + 1], v[40 + 0]);
 270         w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
 271                 + MUL15(u[40 + 1], v[40 + 1])
 272                 + MUL15(u[40 + 2], v[40 + 0]);
 273         w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
 274                 + MUL15(u[40 + 1], v[40 + 2])
 275                 + MUL15(u[40 + 2], v[40 + 1])
 276                 + MUL15(u[40 + 3], v[40 + 0]);
 277         w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
 278                 + MUL15(u[40 + 1], v[40 + 3])
 279                 + MUL15(u[40 + 2], v[40 + 2])
 280                 + MUL15(u[40 + 3], v[40 + 1]);
 281                 /* + MUL15(u[40 + 4], v[40 + 0]) */
 282         w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
 283                 + MUL15(u[40 + 2], v[40 + 3])
 284                 + MUL15(u[40 + 3], v[40 + 2])
 285                 + MUL15(u[40 + 4], v[40 + 1]);
 286         w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
 287                 + MUL15(u[40 + 3], v[40 + 3])
 288                 + MUL15(u[40 + 4], v[40 + 2]);
 289         w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
 290                 + MUL15(u[40 + 4], v[40 + 3]);
 291         w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
 292
 293         CPR(w, 80);
 294
 295         w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
 296
 297         /*
 298          * The products on 14-bit words in slots 6 and 7 yield values
 299          * up to 5*(16382^2) each, and we need to subtract two such
 300          * values from the higher word. We need the subtraction to fit
 301          * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
 302          * However, 10*(16382^2) does not fit. So we must perform a
 303          * bit of reduction here.
 304          */
 305         CPR(w, 60);
 306         CPR(w, 70);
 307
 308         /*
 309          * Recompose results.
 310          */
 311
 312         /* 0..1*0..1 into 0..3 */
 313         ZSUB2F(w, 8, w, 0, w, 2);
 314         ZSUB2F(w, 9, w, 1, w, 3);
 315         ZADDT(w, 1, w, 8);
 316         ZADDT(w, 2, w, 9);
 317
 318         /* 2..3*2..3 into 4..7 */
 319         ZSUB2F(w, 10, w, 4, w, 6);
 320         ZSUB2F(w, 11, w, 5, w, 7);
 321         ZADDT(w, 5, w, 10);
 322         ZADDT(w, 6, w, 11);
 323
 324         /* (0..1+2..3)*(0..1+2..3) into 12..15 */
 325         ZSUB2F(w, 16, w, 12, w, 14);
 326         ZSUB2F(w, 17, w, 13, w, 15);
 327         ZADDT(w, 13, w, 16);
 328         ZADDT(w, 14, w, 17);
 329
 330         /* first-level recomposition */
 331         ZSUB2F(w, 12, w, 0, w, 4);
 332         ZSUB2F(w, 13, w, 1, w, 5);
 333         ZSUB2F(w, 14, w, 2, w, 6);
 334         ZSUB2F(w, 15, w, 3, w, 7);
 335         ZADDT(w, 2, w, 12);
 336         ZADDT(w, 3, w, 13);
 337         ZADDT(w, 4, w, 14);
 338         ZADDT(w, 5, w, 15);
 339
 340         /*
 341          * Perform carry propagation to bring all words down to 13 bits.
 342          */
 343         cc = norm13(d, w, 40);
 344         d[39] += (cc << 13);
 345
 346 #undef ZADD
 347 #undef ZADDT
 348 #undef ZSUB2F
 349 #undef CPR1
 350 #undef CPR
 351 }
 352
 353 static inline void
 354 square20(uint32_t *d, const uint32_t *a)
 355 {
 356         mul20(d, a, a);
 357 }
 358
 359 #else
 360
 361 static void
 362 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
 363 {
 364         uint32_t t[39];
 365
 366         t[ 0] = MUL15(a[ 0], b[ 0]);
 367         t[ 1] = MUL15(a[ 0], b[ 1])
 368                 + MUL15(a[ 1], b[ 0]);
 369         t[ 2] = MUL15(a[ 0], b[ 2])
 370                 + MUL15(a[ 1], b[ 1])
 371                 + MUL15(a[ 2], b[ 0]);
 372         t[ 3] = MUL15(a[ 0], b[ 3])
 373                 + MUL15(a[ 1], b[ 2])
 374                 + MUL15(a[ 2], b[ 1])
 375                 + MUL15(a[ 3], b[ 0]);
 376         t[ 4] = MUL15(a[ 0], b[ 4])
 377                 + MUL15(a[ 1], b[ 3])
 378                 + MUL15(a[ 2], b[ 2])
 379                 + MUL15(a[ 3], b[ 1])
 380                 + MUL15(a[ 4], b[ 0]);
 381         t[ 5] = MUL15(a[ 0], b[ 5])
 382                 + MUL15(a[ 1], b[ 4])
 383                 + MUL15(a[ 2], b[ 3])
 384                 + MUL15(a[ 3], b[ 2])
 385                 + MUL15(a[ 4], b[ 1])
 386                 + MUL15(a[ 5], b[ 0]);
 387         t[ 6] = MUL15(a[ 0], b[ 6])
 388                 + MUL15(a[ 1], b[ 5])
 389                 + MUL15(a[ 2], b[ 4])
 390                 + MUL15(a[ 3], b[ 3])
 391                 + MUL15(a[ 4], b[ 2])
 392                 + MUL15(a[ 5], b[ 1])
 393                 + MUL15(a[ 6], b[ 0]);
 394         t[ 7] = MUL15(a[ 0], b[ 7])
 395                 + MUL15(a[ 1], b[ 6])
 396                 + MUL15(a[ 2], b[ 5])
 397                 + MUL15(a[ 3], b[ 4])
 398                 + MUL15(a[ 4], b[ 3])
 399                 + MUL15(a[ 5], b[ 2])
 400                 + MUL15(a[ 6], b[ 1])
 401                 + MUL15(a[ 7], b[ 0]);
 402         t[ 8] = MUL15(a[ 0], b[ 8])
 403                 + MUL15(a[ 1], b[ 7])
 404                 + MUL15(a[ 2], b[ 6])
 405                 + MUL15(a[ 3], b[ 5])
 406                 + MUL15(a[ 4], b[ 4])
 407                 + MUL15(a[ 5], b[ 3])
 408                 + MUL15(a[ 6], b[ 2])
 409                 + MUL15(a[ 7], b[ 1])
 410                 + MUL15(a[ 8], b[ 0]);
 411         t[ 9] = MUL15(a[ 0], b[ 9])
 412                 + MUL15(a[ 1], b[ 8])
 413                 + MUL15(a[ 2], b[ 7])
 414                 + MUL15(a[ 3], b[ 6])
 415                 + MUL15(a[ 4], b[ 5])
 416                 + MUL15(a[ 5], b[ 4])
 417                 + MUL15(a[ 6], b[ 3])
 418                 + MUL15(a[ 7], b[ 2])
 419                 + MUL15(a[ 8], b[ 1])
 420                 + MUL15(a[ 9], b[ 0]);
 421         t[10] = MUL15(a[ 0], b[10])
 422                 + MUL15(a[ 1], b[ 9])
 423                 + MUL15(a[ 2], b[ 8])
 424                 + MUL15(a[ 3], b[ 7])
 425                 + MUL15(a[ 4], b[ 6])
 426                 + MUL15(a[ 5], b[ 5])
 427                 + MUL15(a[ 6], b[ 4])
 428                 + MUL15(a[ 7], b[ 3])
 429                 + MUL15(a[ 8], b[ 2])
 430                 + MUL15(a[ 9], b[ 1])
 431                 + MUL15(a[10], b[ 0]);
 432         t[11] = MUL15(a[ 0], b[11])
 433                 + MUL15(a[ 1], b[10])
 434                 + MUL15(a[ 2], b[ 9])
 435                 + MUL15(a[ 3], b[ 8])
 436                 + MUL15(a[ 4], b[ 7])
 437                 + MUL15(a[ 5], b[ 6])
 438                 + MUL15(a[ 6], b[ 5])
 439                 + MUL15(a[ 7], b[ 4])
 440                 + MUL15(a[ 8], b[ 3])
 441                 + MUL15(a[ 9], b[ 2])
 442                 + MUL15(a[10], b[ 1])
 443                 + MUL15(a[11], b[ 0]);
 444         t[12] = MUL15(a[ 0], b[12])
 445                 + MUL15(a[ 1], b[11])
 446                 + MUL15(a[ 2], b[10])
 447                 + MUL15(a[ 3], b[ 9])
 448                 + MUL15(a[ 4], b[ 8])
 449                 + MUL15(a[ 5], b[ 7])
 450                 + MUL15(a[ 6], b[ 6])
 451                 + MUL15(a[ 7], b[ 5])
 452                 + MUL15(a[ 8], b[ 4])
 453                 + MUL15(a[ 9], b[ 3])
 454                 + MUL15(a[10], b[ 2])
 455                 + MUL15(a[11], b[ 1])
 456                 + MUL15(a[12], b[ 0]);
 457         t[13] = MUL15(a[ 0], b[13])
 458                 + MUL15(a[ 1], b[12])
 459                 + MUL15(a[ 2], b[11])
 460                 + MUL15(a[ 3], b[10])
 461                 + MUL15(a[ 4], b[ 9])
 462                 + MUL15(a[ 5], b[ 8])
 463                 + MUL15(a[ 6], b[ 7])
 464                 + MUL15(a[ 7], b[ 6])
 465                 + MUL15(a[ 8], b[ 5])
 466                 + MUL15(a[ 9], b[ 4])
 467                 + MUL15(a[10], b[ 3])
 468                 + MUL15(a[11], b[ 2])
 469                 + MUL15(a[12], b[ 1])
 470                 + MUL15(a[13], b[ 0]);
 471         t[14] = MUL15(a[ 0], b[14])
 472                 + MUL15(a[ 1], b[13])
 473                 + MUL15(a[ 2], b[12])
 474                 + MUL15(a[ 3], b[11])
 475                 + MUL15(a[ 4], b[10])
 476                 + MUL15(a[ 5], b[ 9])
 477                 + MUL15(a[ 6], b[ 8])
 478                 + MUL15(a[ 7], b[ 7])
 479                 + MUL15(a[ 8], b[ 6])
 480                 + MUL15(a[ 9], b[ 5])
 481                 + MUL15(a[10], b[ 4])
 482                 + MUL15(a[11], b[ 3])
 483                 + MUL15(a[12], b[ 2])
 484                 + MUL15(a[13], b[ 1])
 485                 + MUL15(a[14], b[ 0]);
 486         t[15] = MUL15(a[ 0], b[15])
 487                 + MUL15(a[ 1], b[14])
 488                 + MUL15(a[ 2], b[13])
 489                 + MUL15(a[ 3], b[12])
 490                 + MUL15(a[ 4], b[11])
 491                 + MUL15(a[ 5], b[10])
 492                 + MUL15(a[ 6], b[ 9])
 493                 + MUL15(a[ 7], b[ 8])
 494                 + MUL15(a[ 8], b[ 7])
 495                 + MUL15(a[ 9], b[ 6])
 496                 + MUL15(a[10], b[ 5])
 497                 + MUL15(a[11], b[ 4])
 498                 + MUL15(a[12], b[ 3])
 499                 + MUL15(a[13], b[ 2])
 500                 + MUL15(a[14], b[ 1])
 501                 + MUL15(a[15], b[ 0]);
 502         t[16] = MUL15(a[ 0], b[16])
 503                 + MUL15(a[ 1], b[15])
 504                 + MUL15(a[ 2], b[14])
 505                 + MUL15(a[ 3], b[13])
 506                 + MUL15(a[ 4], b[12])
 507                 + MUL15(a[ 5], b[11])
 508                 + MUL15(a[ 6], b[10])
 509                 + MUL15(a[ 7], b[ 9])
 510                 + MUL15(a[ 8], b[ 8])
 511                 + MUL15(a[ 9], b[ 7])
 512                 + MUL15(a[10], b[ 6])
 513                 + MUL15(a[11], b[ 5])
 514                 + MUL15(a[12], b[ 4])
 515                 + MUL15(a[13], b[ 3])
 516                 + MUL15(a[14], b[ 2])
 517                 + MUL15(a[15], b[ 1])
 518                 + MUL15(a[16], b[ 0]);
 519         t[17] = MUL15(a[ 0], b[17])
 520                 + MUL15(a[ 1], b[16])
 521                 + MUL15(a[ 2], b[15])
 522                 + MUL15(a[ 3], b[14])
 523                 + MUL15(a[ 4], b[13])
 524                 + MUL15(a[ 5], b[12])
 525                 + MUL15(a[ 6], b[11])
 526                 + MUL15(a[ 7], b[10])
 527                 + MUL15(a[ 8], b[ 9])
 528                 + MUL15(a[ 9], b[ 8])
 529                 + MUL15(a[10], b[ 7])
 530                 + MUL15(a[11], b[ 6])
 531                 + MUL15(a[12], b[ 5])
 532                 + MUL15(a[13], b[ 4])
 533                 + MUL15(a[14], b[ 3])
 534                 + MUL15(a[15], b[ 2])
 535                 + MUL15(a[16], b[ 1])
 536                 + MUL15(a[17], b[ 0]);
 537         t[18] = MUL15(a[ 0], b[18])
 538                 + MUL15(a[ 1], b[17])
 539                 + MUL15(a[ 2], b[16])
 540                 + MUL15(a[ 3], b[15])
 541                 + MUL15(a[ 4], b[14])
 542                 + MUL15(a[ 5], b[13])
 543                 + MUL15(a[ 6], b[12])
 544                 + MUL15(a[ 7], b[11])
 545                 + MUL15(a[ 8], b[10])
 546                 + MUL15(a[ 9], b[ 9])
 547                 + MUL15(a[10], b[ 8])
 548                 + MUL15(a[11], b[ 7])
 549                 + MUL15(a[12], b[ 6])
 550                 + MUL15(a[13], b[ 5])
 551                 + MUL15(a[14], b[ 4])
 552                 + MUL15(a[15], b[ 3])
 553                 + MUL15(a[16], b[ 2])
 554                 + MUL15(a[17], b[ 1])
 555                 + MUL15(a[18], b[ 0]);
 556         t[19] = MUL15(a[ 0], b[19])
 557                 + MUL15(a[ 1], b[18])
 558                 + MUL15(a[ 2], b[17])
 559                 + MUL15(a[ 3], b[16])
 560                 + MUL15(a[ 4], b[15])
 561                 + MUL15(a[ 5], b[14])
 562                 + MUL15(a[ 6], b[13])
 563                 + MUL15(a[ 7], b[12])
 564                 + MUL15(a[ 8], b[11])
 565                 + MUL15(a[ 9], b[10])
 566                 + MUL15(a[10], b[ 9])
 567                 + MUL15(a[11], b[ 8])
 568                 + MUL15(a[12], b[ 7])
 569                 + MUL15(a[13], b[ 6])
 570                 + MUL15(a[14], b[ 5])
 571                 + MUL15(a[15], b[ 4])
 572                 + MUL15(a[16], b[ 3])
 573                 + MUL15(a[17], b[ 2])
 574                 + MUL15(a[18], b[ 1])
 575                 + MUL15(a[19], b[ 0]);
 576         t[20] = MUL15(a[ 1], b[19])
 577                 + MUL15(a[ 2], b[18])
 578                 + MUL15(a[ 3], b[17])
 579                 + MUL15(a[ 4], b[16])
 580                 + MUL15(a[ 5], b[15])
 581                 + MUL15(a[ 6], b[14])
 582                 + MUL15(a[ 7], b[13])
 583                 + MUL15(a[ 8], b[12])
 584                 + MUL15(a[ 9], b[11])
 585                 + MUL15(a[10], b[10])
 586                 + MUL15(a[11], b[ 9])
 587                 + MUL15(a[12], b[ 8])
 588                 + MUL15(a[13], b[ 7])
 589                 + MUL15(a[14], b[ 6])
 590                 + MUL15(a[15], b[ 5])
 591                 + MUL15(a[16], b[ 4])
 592                 + MUL15(a[17], b[ 3])
 593                 + MUL15(a[18], b[ 2])
 594                 + MUL15(a[19], b[ 1]);
 595         t[21] = MUL15(a[ 2], b[19])
 596                 + MUL15(a[ 3], b[18])
 597                 + MUL15(a[ 4], b[17])
 598                 + MUL15(a[ 5], b[16])
 599                 + MUL15(a[ 6], b[15])
 600                 + MUL15(a[ 7], b[14])
 601                 + MUL15(a[ 8], b[13])
 602                 + MUL15(a[ 9], b[12])
 603                 + MUL15(a[10], b[11])
 604                 + MUL15(a[11], b[10])
 605                 + MUL15(a[12], b[ 9])
 606                 + MUL15(a[13], b[ 8])
 607                 + MUL15(a[14], b[ 7])
 608                 + MUL15(a[15], b[ 6])
 609                 + MUL15(a[16], b[ 5])
 610                 + MUL15(a[17], b[ 4])
 611                 + MUL15(a[18], b[ 3])
 612                 + MUL15(a[19], b[ 2]);
 613         t[22] = MUL15(a[ 3], b[19])
 614                 + MUL15(a[ 4], b[18])
 615                 + MUL15(a[ 5], b[17])
 616                 + MUL15(a[ 6], b[16])
 617                 + MUL15(a[ 7], b[15])
 618                 + MUL15(a[ 8], b[14])
 619                 + MUL15(a[ 9], b[13])
 620                 + MUL15(a[10], b[12])
 621                 + MUL15(a[11], b[11])
 622                 + MUL15(a[12], b[10])
 623                 + MUL15(a[13], b[ 9])
 624                 + MUL15(a[14], b[ 8])
 625                 + MUL15(a[15], b[ 7])
 626                 + MUL15(a[16], b[ 6])
 627                 + MUL15(a[17], b[ 5])
 628                 + MUL15(a[18], b[ 4])
 629                 + MUL15(a[19], b[ 3]);
 630         t[23] = MUL15(a[ 4], b[19])
 631                 + MUL15(a[ 5], b[18])
 632                 + MUL15(a[ 6], b[17])
 633                 + MUL15(a[ 7], b[16])
 634                 + MUL15(a[ 8], b[15])
 635                 + MUL15(a[ 9], b[14])
 636                 + MUL15(a[10], b[13])
 637                 + MUL15(a[11], b[12])
 638                 + MUL15(a[12], b[11])
 639                 + MUL15(a[13], b[10])
 640                 + MUL15(a[14], b[ 9])
 641                 + MUL15(a[15], b[ 8])
 642                 + MUL15(a[16], b[ 7])
 643                 + MUL15(a[17], b[ 6])
 644                 + MUL15(a[18], b[ 5])
 645                 + MUL15(a[19], b[ 4]);
 646         t[24] = MUL15(a[ 5], b[19])
 647                 + MUL15(a[ 6], b[18])
 648                 + MUL15(a[ 7], b[17])
 649                 + MUL15(a[ 8], b[16])
 650                 + MUL15(a[ 9], b[15])
 651                 + MUL15(a[10], b[14])
 652                 + MUL15(a[11], b[13])
 653                 + MUL15(a[12], b[12])
 654                 + MUL15(a[13], b[11])
 655                 + MUL15(a[14], b[10])
 656                 + MUL15(a[15], b[ 9])
 657                 + MUL15(a[16], b[ 8])
 658                 + MUL15(a[17], b[ 7])
 659                 + MUL15(a[18], b[ 6])
 660                 + MUL15(a[19], b[ 5]);
 661         t[25] = MUL15(a[ 6], b[19])
 662                 + MUL15(a[ 7], b[18])
 663                 + MUL15(a[ 8], b[17])
 664                 + MUL15(a[ 9], b[16])
 665                 + MUL15(a[10], b[15])
 666                 + MUL15(a[11], b[14])
 667                 + MUL15(a[12], b[13])
 668                 + MUL15(a[13], b[12])
 669                 + MUL15(a[14], b[11])
 670                 + MUL15(a[15], b[10])
 671                 + MUL15(a[16], b[ 9])
 672                 + MUL15(a[17], b[ 8])
 673                 + MUL15(a[18], b[ 7])
 674                 + MUL15(a[19], b[ 6]);
 675         t[26] = MUL15(a[ 7], b[19])
 676                 + MUL15(a[ 8], b[18])
 677                 + MUL15(a[ 9], b[17])
 678                 + MUL15(a[10], b[16])
 679                 + MUL15(a[11], b[15])
 680                 + MUL15(a[12], b[14])
 681                 + MUL15(a[13], b[13])
 682                 + MUL15(a[14], b[12])
 683                 + MUL15(a[15], b[11])
 684                 + MUL15(a[16], b[10])
 685                 + MUL15(a[17], b[ 9])
 686                 + MUL15(a[18], b[ 8])
 687                 + MUL15(a[19], b[ 7]);
 688         t[27] = MUL15(a[ 8], b[19])
 689                 + MUL15(a[ 9], b[18])
 690                 + MUL15(a[10], b[17])
 691                 + MUL15(a[11], b[16])
 692                 + MUL15(a[12], b[15])
 693                 + MUL15(a[13], b[14])
 694                 + MUL15(a[14], b[13])
 695                 + MUL15(a[15], b[12])
 696                 + MUL15(a[16], b[11])
 697                 + MUL15(a[17], b[10])
 698                 + MUL15(a[18], b[ 9])
 699                 + MUL15(a[19], b[ 8]);
 700         t[28] = MUL15(a[ 9], b[19])
 701                 + MUL15(a[10], b[18])
 702                 + MUL15(a[11], b[17])
 703                 + MUL15(a[12], b[16])
 704                 + MUL15(a[13], b[15])
 705                 + MUL15(a[14], b[14])
 706                 + MUL15(a[15], b[13])
 707                 + MUL15(a[16], b[12])
 708                 + MUL15(a[17], b[11])
 709                 + MUL15(a[18], b[10])
 710                 + MUL15(a[19], b[ 9]);
 711         t[29] = MUL15(a[10], b[19])
 712                 + MUL15(a[11], b[18])
 713                 + MUL15(a[12], b[17])
 714                 + MUL15(a[13], b[16])
 715                 + MUL15(a[14], b[15])
 716                 + MUL15(a[15], b[14])
 717                 + MUL15(a[16], b[13])
 718                 + MUL15(a[17], b[12])
 719                 + MUL15(a[18], b[11])
 720                 + MUL15(a[19], b[10]);
 721         t[30] = MUL15(a[11], b[19])
 722                 + MUL15(a[12], b[18])
 723                 + MUL15(a[13], b[17])
 724                 + MUL15(a[14], b[16])
 725                 + MUL15(a[15], b[15])
 726                 + MUL15(a[16], b[14])
 727                 + MUL15(a[17], b[13])
 728                 + MUL15(a[18], b[12])
 729                 + MUL15(a[19], b[11]);
 730         t[31] = MUL15(a[12], b[19])
 731                 + MUL15(a[13], b[18])
 732                 + MUL15(a[14], b[17])
 733                 + MUL15(a[15], b[16])
 734                 + MUL15(a[16], b[15])
 735                 + MUL15(a[17], b[14])
 736                 + MUL15(a[18], b[13])
 737                 + MUL15(a[19], b[12]);
 738         t[32] = MUL15(a[13], b[19])
 739                 + MUL15(a[14], b[18])
 740                 + MUL15(a[15], b[17])
 741                 + MUL15(a[16], b[16])
 742                 + MUL15(a[17], b[15])
 743                 + MUL15(a[18], b[14])
 744                 + MUL15(a[19], b[13]);
 745         t[33] = MUL15(a[14], b[19])
 746                 + MUL15(a[15], b[18])
 747                 + MUL15(a[16], b[17])
 748                 + MUL15(a[17], b[16])
 749                 + MUL15(a[18], b[15])
 750                 + MUL15(a[19], b[14]);
 751         t[34] = MUL15(a[15], b[19])
 752                 + MUL15(a[16], b[18])
 753                 + MUL15(a[17], b[17])
 754                 + MUL15(a[18], b[16])
 755                 + MUL15(a[19], b[15]);
 756         t[35] = MUL15(a[16], b[19])
 757                 + MUL15(a[17], b[18])
 758                 + MUL15(a[18], b[17])
 759                 + MUL15(a[19], b[16]);
 760         t[36] = MUL15(a[17], b[19])
 761                 + MUL15(a[18], b[18])
 762                 + MUL15(a[19], b[17]);
 763         t[37] = MUL15(a[18], b[19])
 764                 + MUL15(a[19], b[18]);
 765         t[38] = MUL15(a[19], b[19]);
 766         d[39] = norm13(d, t, 39);
 767 }
 768
 769 static void
 770 square20(uint32_t *d, const uint32_t *a)
 771 {
 772         uint32_t t[39];
 773
 774         t[ 0] = MUL15(a[ 0], a[ 0]);
 775         t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
 776         t[ 2] = MUL15(a[ 1], a[ 1])
 777                 + ((MUL15(a[ 0], a[ 2])) << 1);
 778         t[ 3] = ((MUL15(a[ 0], a[ 3])
 779                 + MUL15(a[ 1], a[ 2])) << 1);
 780         t[ 4] = MUL15(a[ 2], a[ 2])
 781                 + ((MUL15(a[ 0], a[ 4])
 782                 + MUL15(a[ 1], a[ 3])) << 1);
 783         t[ 5] = ((MUL15(a[ 0], a[ 5])
 784                 + MUL15(a[ 1], a[ 4])
 785                 + MUL15(a[ 2], a[ 3])) << 1);
 786         t[ 6] = MUL15(a[ 3], a[ 3])
 787                 + ((MUL15(a[ 0], a[ 6])
 788                 + MUL15(a[ 1], a[ 5])
 789                 + MUL15(a[ 2], a[ 4])) << 1);
 790         t[ 7] = ((MUL15(a[ 0], a[ 7])
 791                 + MUL15(a[ 1], a[ 6])
 792                 + MUL15(a[ 2], a[ 5])
 793                 + MUL15(a[ 3], a[ 4])) << 1);
 794         t[ 8] = MUL15(a[ 4], a[ 4])
 795                 + ((MUL15(a[ 0], a[ 8])
 796                 + MUL15(a[ 1], a[ 7])
 797                 + MUL15(a[ 2], a[ 6])
 798                 + MUL15(a[ 3], a[ 5])) << 1);
 799         t[ 9] = ((MUL15(a[ 0], a[ 9])
 800                 + MUL15(a[ 1], a[ 8])
 801                 + MUL15(a[ 2], a[ 7])
 802                 + MUL15(a[ 3], a[ 6])
 803                 + MUL15(a[ 4], a[ 5])) << 1);
 804         t[10] = MUL15(a[ 5], a[ 5])
 805                 + ((MUL15(a[ 0], a[10])
 806                 + MUL15(a[ 1], a[ 9])
 807                 + MUL15(a[ 2], a[ 8])
 808                 + MUL15(a[ 3], a[ 7])
 809                 + MUL15(a[ 4], a[ 6])) << 1);
 810         t[11] = ((MUL15(a[ 0], a[11])
 811                 + MUL15(a[ 1], a[10])
 812                 + MUL15(a[ 2], a[ 9])
 813                 + MUL15(a[ 3], a[ 8])
 814                 + MUL15(a[ 4], a[ 7])
 815                 + MUL15(a[ 5], a[ 6])) << 1);
 816         t[12] = MUL15(a[ 6], a[ 6])
 817                 + ((MUL15(a[ 0], a[12])
 818                 + MUL15(a[ 1], a[11])
 819                 + MUL15(a[ 2], a[10])
 820                 + MUL15(a[ 3], a[ 9])
 821                 + MUL15(a[ 4], a[ 8])
 822                 + MUL15(a[ 5], a[ 7])) << 1);
 823         t[13] = ((MUL15(a[ 0], a[13])
 824                 + MUL15(a[ 1], a[12])
 825                 + MUL15(a[ 2], a[11])
 826                 + MUL15(a[ 3], a[10])
 827                 + MUL15(a[ 4], a[ 9])
 828                 + MUL15(a[ 5], a[ 8])
 829                 + MUL15(a[ 6], a[ 7])) << 1);
 830         t[14] = MUL15(a[ 7], a[ 7])
 831                 + ((MUL15(a[ 0], a[14])
 832                 + MUL15(a[ 1], a[13])
 833                 + MUL15(a[ 2], a[12])
 834                 + MUL15(a[ 3], a[11])
 835                 + MUL15(a[ 4], a[10])
 836                 + MUL15(a[ 5], a[ 9])
 837                 + MUL15(a[ 6], a[ 8])) << 1);
 838         t[15] = ((MUL15(a[ 0], a[15])
 839                 + MUL15(a[ 1], a[14])
 840                 + MUL15(a[ 2], a[13])
 841                 + MUL15(a[ 3], a[12])
 842                 + MUL15(a[ 4], a[11])
 843                 + MUL15(a[ 5], a[10])
 844                 + MUL15(a[ 6], a[ 9])
 845                 + MUL15(a[ 7], a[ 8])) << 1);
 846         t[16] = MUL15(a[ 8], a[ 8])
 847                 + ((MUL15(a[ 0], a[16])
 848                 + MUL15(a[ 1], a[15])
 849                 + MUL15(a[ 2], a[14])
 850                 + MUL15(a[ 3], a[13])
 851                 + MUL15(a[ 4], a[12])
 852                 + MUL15(a[ 5], a[11])
 853                 + MUL15(a[ 6], a[10])
 854                 + MUL15(a[ 7], a[ 9])) << 1);
 855         t[17] = ((MUL15(a[ 0], a[17])
 856                 + MUL15(a[ 1], a[16])
 857                 + MUL15(a[ 2], a[15])
 858                 + MUL15(a[ 3], a[14])
 859                 + MUL15(a[ 4], a[13])
 860                 + MUL15(a[ 5], a[12])
 861                 + MUL15(a[ 6], a[11])
 862                 + MUL15(a[ 7], a[10])
 863                 + MUL15(a[ 8], a[ 9])) << 1);
 864         t[18] = MUL15(a[ 9], a[ 9])
 865                 + ((MUL15(a[ 0], a[18])
 866                 + MUL15(a[ 1], a[17])
 867                 + MUL15(a[ 2], a[16])
 868                 + MUL15(a[ 3], a[15])
 869                 + MUL15(a[ 4], a[14])
 870                 + MUL15(a[ 5], a[13])
 871                 + MUL15(a[ 6], a[12])
 872                 + MUL15(a[ 7], a[11])
 873                 + MUL15(a[ 8], a[10])) << 1);
 874         t[19] = ((MUL15(a[ 0], a[19])
 875                 + MUL15(a[ 1], a[18])
 876                 + MUL15(a[ 2], a[17])
 877                 + MUL15(a[ 3], a[16])
 878                 + MUL15(a[ 4], a[15])
 879                 + MUL15(a[ 5], a[14])
 880                 + MUL15(a[ 6], a[13])
 881                 + MUL15(a[ 7], a[12])
 882                 + MUL15(a[ 8], a[11])
 883                 + MUL15(a[ 9], a[10])) << 1);
 884         t[20] = MUL15(a[10], a[10])
 885                 + ((MUL15(a[ 1], a[19])
 886                 + MUL15(a[ 2], a[18])
 887                 + MUL15(a[ 3], a[17])
 888                 + MUL15(a[ 4], a[16])
 889                 + MUL15(a[ 5], a[15])
 890                 + MUL15(a[ 6], a[14])
 891                 + MUL15(a[ 7], a[13])
 892                 + MUL15(a[ 8], a[12])
 893                 + MUL15(a[ 9], a[11])) << 1);
 894         t[21] = ((MUL15(a[ 2], a[19])
 895                 + MUL15(a[ 3], a[18])
 896                 + MUL15(a[ 4], a[17])
 897                 + MUL15(a[ 5], a[16])
 898                 + MUL15(a[ 6], a[15])
 899                 + MUL15(a[ 7], a[14])
 900                 + MUL15(a[ 8], a[13])
 901                 + MUL15(a[ 9], a[12])
 902                 + MUL15(a[10], a[11])) << 1);
 903         t[22] = MUL15(a[11], a[11])
 904                 + ((MUL15(a[ 3], a[19])
 905                 + MUL15(a[ 4], a[18])
 906                 + MUL15(a[ 5], a[17])
 907                 + MUL15(a[ 6], a[16])
 908                 + MUL15(a[ 7], a[15])
 909                 + MUL15(a[ 8], a[14])
 910                 + MUL15(a[ 9], a[13])
 911                 + MUL15(a[10], a[12])) << 1);
 912         t[23] = ((MUL15(a[ 4], a[19])
 913                 + MUL15(a[ 5], a[18])
 914                 + MUL15(a[ 6], a[17])
 915                 + MUL15(a[ 7], a[16])
 916                 + MUL15(a[ 8], a[15])
 917                 + MUL15(a[ 9], a[14])
 918                 + MUL15(a[10], a[13])
 919                 + MUL15(a[11], a[12])) << 1);
 920         t[24] = MUL15(a[12], a[12])
 921                 + ((MUL15(a[ 5], a[19])
 922                 + MUL15(a[ 6], a[18])
 923                 + MUL15(a[ 7], a[17])
 924                 + MUL15(a[ 8], a[16])
 925                 + MUL15(a[ 9], a[15])
 926                 + MUL15(a[10], a[14])
 927                 + MUL15(a[11], a[13])) << 1);
 928         t[25] = ((MUL15(a[ 6], a[19])
 929                 + MUL15(a[ 7], a[18])
 930                 + MUL15(a[ 8], a[17])
 931                 + MUL15(a[ 9], a[16])
 932                 + MUL15(a[10], a[15])
 933                 + MUL15(a[11], a[14])
 934                 + MUL15(a[12], a[13])) << 1);
 935         t[26] = MUL15(a[13], a[13])
 936                 + ((MUL15(a[ 7], a[19])
 937                 + MUL15(a[ 8], a[18])
 938                 + MUL15(a[ 9], a[17])
 939                 + MUL15(a[10], a[16])
 940                 + MUL15(a[11], a[15])
 941                 + MUL15(a[12], a[14])) << 1);
 942         t[27] = ((MUL15(a[ 8], a[19])
 943                 + MUL15(a[ 9], a[18])
 944                 + MUL15(a[10], a[17])
 945                 + MUL15(a[11], a[16])
 946                 + MUL15(a[12], a[15])
 947                 + MUL15(a[13], a[14])) << 1);
 948         t[28] = MUL15(a[14], a[14])
 949                 + ((MUL15(a[ 9], a[19])
 950                 + MUL15(a[10], a[18])
 951                 + MUL15(a[11], a[17])
 952                 + MUL15(a[12], a[16])
 953                 + MUL15(a[13], a[15])) << 1);
 954         t[29] = ((MUL15(a[10], a[19])
 955                 + MUL15(a[11], a[18])
 956                 + MUL15(a[12], a[17])
 957                 + MUL15(a[13], a[16])
 958                 + MUL15(a[14], a[15])) << 1);
 959         t[30] = MUL15(a[15], a[15])
 960                 + ((MUL15(a[11], a[19])
 961                 + MUL15(a[12], a[18])
 962                 + MUL15(a[13], a[17])
 963                 + MUL15(a[14], a[16])) << 1);
 964         t[31] = ((MUL15(a[12], a[19])
 965                 + MUL15(a[13], a[18])
 966                 + MUL15(a[14], a[17])
 967                 + MUL15(a[15], a[16])) << 1);
 968         t[32] = MUL15(a[16], a[16])
 969                 + ((MUL15(a[13], a[19])
 970                 + MUL15(a[14], a[18])
 971                 + MUL15(a[15], a[17])) << 1);
 972         t[33] = ((MUL15(a[14], a[19])
 973                 + MUL15(a[15], a[18])
 974                 + MUL15(a[16], a[17])) << 1);
 975         t[34] = MUL15(a[17], a[17])
 976                 + ((MUL15(a[15], a[19])
 977                 + MUL15(a[16], a[18])) << 1);
 978         t[35] = ((MUL15(a[16], a[19])
 979                 + MUL15(a[17], a[18])) << 1);
 980         t[36] = MUL15(a[18], a[18])
 981                 + ((MUL15(a[17], a[19])) << 1);
 982         t[37] = ((MUL15(a[18], a[19])) << 1);
 983         t[38] = MUL15(a[19], a[19]);
 984         d[39] = norm13(d, t, 39);
 985 }
 986
 987 #endif
 988
 989 /*
 990  * Modulus for field F256 (field for point coordinates in curve P-256).
 991  */
 992 static const uint32_t F256[] = {
 993         0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
 994         0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
 995         0x0000, 0x1FF8, 0x1FFF, 0x01FF
 996 };
 997
 998 /*
 999  * The 'b' curve equation coefficient for P-256.
1000  */
1001 static const uint32_t P256_B[] = {
1002         0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
1003         0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
1004         0x0A3A, 0x0EC5, 0x118D, 0x00B5
1005 };
1006
1007 /*
1008  * Perform a "short reduction" in field F256 (field for curve P-256).
1009  * The source value should be less than 262 bits; on output, it will
1010  * be at most 257 bits, and less than twice the modulus.
1011  */
1012 static void
1013 reduce_f256(uint32_t *d)
1014 {
1015         uint32_t x;
1016
1017         x = d[19] >> 9;
1018         d[19] &= 0x01FF;
1019         d[17] += x << 3;
1020         d[14] -= x << 10;
1021         d[7] -= x << 5;
1022         d[0] += x;
1023         norm13(d, d, 20);
1024 }
1025
1026 /*
1027  * Perform a "final reduction" in field F256 (field for curve P-256).
1028  * The source value must be less than twice the modulus. If the value
1029  * is not lower than the modulus, then the modulus is subtracted and
1030  * this function returns 1; otherwise, it leaves it untouched and it
1031  * returns 0.
1032  */
1033 static uint32_t
1034 reduce_final_f256(uint32_t *d)
1035 {
1036         uint32_t t[20];
1037         uint32_t cc;
1038         int i;
1039
1040         memcpy(t, d, sizeof t);
1041         cc = 0;
1042         for (i = 0; i < 20; i ++) {
1043                 uint32_t w;
1044
1045                 w = t[i] - F256[i] - cc;
1046                 cc = w >> 31;
1047                 t[i] = w & 0x1FFF;
1048         }
1049         cc ^= 1;
1050         CCOPY(cc, d, t, sizeof t);
1051         return cc;
1052 }
1053
1054 /*
1055  * Perform a multiplication of two integers modulo
1056  * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
1057  * of 20 words, each containing 13 bits of data, in little-endian order.
1058  * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
1059  * on output, value fits on 257 bits and is lower than twice the modulus.
1060  */
1061 static void
1062 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
1063 {
1064         uint32_t t[40], cc;
1065         int i;
1066
1067         /*
1068          * Compute raw multiplication. All result words fit in 13 bits
1069          * each.
1070          */
1071         mul20(t, a, b);
1072
1073         /*
1074          * Modular reduction: each high word in added/subtracted where
1075          * necessary.
1076          *
1077          * The modulus is:
1078          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1079          * Therefore:
1080          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1081          *
1082          * For a word x at bit offset n (n >= 256), we have:
1083          *    x*2^n = x*2^(n-32) - x*2^(n-64)
1084          *            - x*2^(n - 160) + x*2^(n-256) mod p
1085          *
1086          * Thus, we can nullify the high word if we reinject it at some
1087          * proper emplacements.
1088          */
1089         for (i = 39; i >= 20; i --) {
1090                 uint32_t x;
1091
1092                 x = t[i];
1093                 t[i - 2] += ARSH(x, 6);
1094                 t[i - 3] += (x << 7) & 0x1FFF;
1095                 t[i - 4] -= ARSH(x, 12);
1096                 t[i - 5] -= (x << 1) & 0x1FFF;
1097                 t[i - 12] -= ARSH(x, 4);
1098                 t[i - 13] -= (x << 9) & 0x1FFF;
1099                 t[i - 19] += ARSH(x, 9);
1100                 t[i - 20] += (x << 4) & 0x1FFF;
1101         }
1102
1103         /*
1104          * Propagate carries. Since the operation above really is a
1105          * truncature, followed by the addition of nonnegative values,
1106          * the result will be positive. Moreover, the carry cannot
1107          * exceed 5 bits (we performed 20 additions with values smaller
1108          * than 256 bits).
1109          */
1110         cc = norm13(t, t, 20);
1111
1112         /*
1113          * Perform modular reduction again for the bits beyond 256 (the carry
1114          * and the bits 256..259). This time, we can simply inject full
1115          * word values.
1116          */
1117         cc = (cc << 4) | (t[19] >> 9);
1118         t[19] &= 0x01FF;
1119         t[17] += cc << 3;
1120         t[14] -= cc << 10;
1121         t[7] -= cc << 5;
1122         t[0] += cc;
1123         norm13(d, t, 20);
1124 }
1125
1126 /*
1127  * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
1128  * P-256). Operand is an array of 20 words, each containing 13 bits of
1129  * data, in little-endian order. On input, upper word may be up to 13
1130  * bits (hence value up to 2^260-1); on output, value fits on 257 bits
1131  * and is lower than twice the modulus.
1132  */
1133 static void
1134 square_f256(uint32_t *d, const uint32_t *a)
1135 {
1136         uint32_t t[40], cc;
1137         int i;
1138
1139         /*
1140          * Compute raw square. All result words fit in 13 bits each.
1141          */
1142         square20(t, a);
1143
1144         /*
1145          * Modular reduction: each high word in added/subtracted where
1146          * necessary.
1147          *
1148          * The modulus is:
1149          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1150          * Therefore:
1151          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1152          *
1153          * For a word x at bit offset n (n >= 256), we have:
1154          *    x*2^n = x*2^(n-32) - x*2^(n-64)
1155          *            - x*2^(n - 160) + x*2^(n-256) mod p
1156          *
1157          * Thus, we can nullify the high word if we reinject it at some
1158          * proper emplacements.
1159          */
1160         for (i = 39; i >= 20; i --) {
1161                 uint32_t x;
1162
1163                 x = t[i];
1164                 t[i - 2] += ARSH(x, 6);
1165                 t[i - 3] += (x << 7) & 0x1FFF;
1166                 t[i - 4] -= ARSH(x, 12);
1167                 t[i - 5] -= (x << 1) & 0x1FFF;
1168                 t[i - 12] -= ARSH(x, 4);
1169                 t[i - 13] -= (x << 9) & 0x1FFF;
1170                 t[i - 19] += ARSH(x, 9);
1171                 t[i - 20] += (x << 4) & 0x1FFF;
1172         }
1173
1174         /*
1175          * Propagate carries. Since the operation above really is a
1176          * truncature, followed by the addition of nonnegative values,
1177          * the result will be positive. Moreover, the carry cannot
1178          * exceed 5 bits (we performed 20 additions with values smaller
1179          * than 256 bits).
1180          */
1181         cc = norm13(t, t, 20);
1182
1183         /*
1184          * Perform modular reduction again for the bits beyond 256 (the carry
1185          * and the bits 256..259). This time, we can simply inject full
1186          * word values.
1187          */
1188         cc = (cc << 4) | (t[19] >> 9);
1189         t[19] &= 0x01FF;
1190         t[17] += cc << 3;
1191         t[14] -= cc << 10;
1192         t[7] -= cc << 5;
1193         t[0] += cc;
1194         norm13(d, t, 20);
1195 }
1196
1197 /*
1198  * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
1199  * are such that:
1200  *   X = x / z^2
1201  *   Y = y / z^3
1202  * For the point at infinity, z = 0.
1203  * Each point thus admits many possible representations.
1204  *
1205  * Coordinates are represented in arrays of 32-bit integers, each holding
1206  * 13 bits of data. Values may also be slightly greater than the modulus,
1207  * but they will always be lower than twice the modulus.
1208  */
1209 typedef struct {
1210         uint32_t x[20];
1211         uint32_t y[20];
1212         uint32_t z[20];
1213 } p256_jacobian;
1214
1215 /*
1216  * Convert a point to affine coordinates:
1217  *  - If the point is the point at infinity, then all three coordinates
1218  *    are set to 0.
1219  *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
1220  *    coordinates are the 'X' and 'Y' affine coordinates.
1221  * The coordinates are guaranteed to be lower than the modulus.
1222  */
1223 static void
1224 p256_to_affine(p256_jacobian *P)
1225 {
1226         uint32_t t1[20], t2[20];
1227         int i;
1228
1229         /*
1230          * Invert z with a modular exponentiation: the modulus is
1231          * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
1232          * p-2. Exponent bit pattern (from high to low) is:
1233          *  - 32 bits of value 1
1234          *  - 31 bits of value 0
1235          *  - 1 bit of value 1
1236          *  - 96 bits of value 0
1237          *  - 94 bits of value 1
1238          *  - 1 bit of value 0
1239          *  - 1 bit of value 1
1240          * Thus, we precompute z^(2^31-1) to speed things up.
1241          *
1242          * If z = 0 (point at infinity) then the modular exponentiation
1243          * will yield 0, which leads to the expected result (all three
1244          * coordinates set to 0).
1245          */
1246
1247         /*
1248          * A simple square-and-multiply for z^(2^31-1). We could save about
1249          * two dozen multiplications here with an addition chain, but
1250          * this would require a bit more code, and extra stack buffers.
1251          */
1252         memcpy(t1, P->z, sizeof P->z);
1253         for (i = 0; i < 30; i ++) {
1254                 square_f256(t1, t1);
1255                 mul_f256(t1, t1, P->z);
1256         }
1257
1258         /*
1259          * Square-and-multiply. Apart from the squarings, we have a few
1260          * multiplications to set bits to 1; we multiply by the original z
1261          * for setting 1 bit, and by t1 for setting 31 bits.
1262          */
1263         memcpy(t2, P->z, sizeof P->z);
1264         for (i = 1; i < 256; i ++) {
1265                 square_f256(t2, t2);
1266                 switch (i) {
1267                 case 31:
1268                 case 190:
1269                 case 221:
1270                 case 252:
1271                         mul_f256(t2, t2, t1);
1272                         break;
1273                 case 63:
1274                 case 253:
1275                 case 255:
1276                         mul_f256(t2, t2, P->z);
1277                         break;
1278                 }
1279         }
1280
1281         /*
1282          * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
1283          */
1284         mul_f256(t1, t2, t2);
1285         mul_f256(P->x, t1, P->x);
1286         mul_f256(t1, t1, t2);
1287         mul_f256(P->y, t1, P->y);
1288         reduce_final_f256(P->x);
1289         reduce_final_f256(P->y);
1290
1291         /*
1292          * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
1293          * this will set z to 1.
1294          */
1295         mul_f256(P->z, P->z, t2);
1296         reduce_final_f256(P->z);
1297 }
1298
1299 /*
1300  * Double a point in P-256. This function works for all valid points,
1301  * including the point at infinity.
1302  */
1303 static void
1304 p256_double(p256_jacobian *Q)
1305 {
1306         /*
1307          * Doubling formulas are:
1308          *
1309          *   s = 4*x*y^2
1310          *   m = 3*(x + z^2)*(x - z^2)
1311          *   x' = m^2 - 2*s
1312          *   y' = m*(s - x') - 8*y^4
1313          *   z' = 2*y*z
1314          *
1315          * These formulas work for all points, including points of order 2
1316          * and points at infinity:
1317          *   - If y = 0 then z' = 0. But there is no such point in P-256
1318          *     anyway.
1319          *   - If z = 0 then z' = 0.
1320          */
1321         uint32_t t1[20], t2[20], t3[20], t4[20];
1322         int i;
1323
1324         /*
1325          * Compute z^2 in t1.
1326          */
1327         square_f256(t1, Q->z);
1328
1329         /*
1330          * Compute x-z^2 in t2 and x+z^2 in t1.
1331          */
1332         for (i = 0; i < 20; i ++) {
1333                 t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
1334                 t1[i] += Q->x[i];
1335         }
1336         norm13(t1, t1, 20);
1337         norm13(t2, t2, 20);
1338
1339         /*
1340          * Compute 3*(x+z^2)*(x-z^2) in t1.
1341          */
1342         mul_f256(t3, t1, t2);
1343         for (i = 0; i < 20; i ++) {
1344                 t1[i] = MUL15(3, t3[i]);
1345         }
1346         norm13(t1, t1, 20);
1347
1348         /*
1349          * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
1350          */
1351         square_f256(t3, Q->y);
1352         for (i = 0; i < 20; i ++) {
1353                 t3[i] <<= 1;
1354         }
1355         norm13(t3, t3, 20);
1356         mul_f256(t2, Q->x, t3);
1357         for (i = 0; i < 20; i ++) {
1358                 t2[i] <<= 1;
1359         }
1360         norm13(t2, t2, 20);
1361         reduce_f256(t2);
1362
1363         /*
1364          * Compute x' = m^2 - 2*s.
1365          */
1366         square_f256(Q->x, t1);
1367         for (i = 0; i < 20; i ++) {
1368                 Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
1369         }
1370         norm13(Q->x, Q->x, 20);
1371         reduce_f256(Q->x);
1372
1373         /*
1374          * Compute z' = 2*y*z.
1375          */
1376         mul_f256(t4, Q->y, Q->z);
1377         for (i = 0; i < 20; i ++) {
1378                 Q->z[i] = t4[i] << 1;
1379         }
1380         norm13(Q->z, Q->z, 20);
1381         reduce_f256(Q->z);
1382
1383         /*
1384          * Compute y' = m*(s - x') - 8*y^4. Note that we already have
1385          * 2*y^2 in t3.
1386          */
1387         for (i = 0; i < 20; i ++) {
1388                 t2[i] += (F256[i] << 1) - Q->x[i];
1389         }
1390         norm13(t2, t2, 20);
1391         mul_f256(Q->y, t1, t2);
1392         square_f256(t4, t3);
1393         for (i = 0; i < 20; i ++) {
1394                 Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
1395         }
1396         norm13(Q->y, Q->y, 20);
1397         reduce_f256(Q->y);
1398 }
1399
1400 /*
1401  * Add point P2 to point P1.
1402  *
1403  * This function computes the wrong result in the following cases:
1404  *
1405  *   - If P1 == 0 but P2 != 0
1406  *   - If P1 != 0 but P2 == 0
1407  *   - If P1 == P2
1408  *
1409  * In all three cases, P1 is set to the point at infinity.
1410  *
1411  * Returned value is 0 if one of the following occurs:
1412  *
1413  *   - P1 and P2 have the same Y coordinate
1414  *   - P1 == 0 and P2 == 0
1415  *   - The Y coordinate of one of the points is 0 and the other point is
1416  *     the point at infinity.
1417  *
1418  * The third case cannot actually happen with valid points, since a point
1419  * with Y == 0 is a point of order 2, and there is no point of order 2 on
1420  * curve P-256.
1421  *
1422  * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
1423  * can apply the following:
1424  *
1425  *   - If the result is not the point at infinity, then it is correct.
1426  *   - Otherwise, if the returned value is 1, then this is a case of
1427  *     P1+P2 == 0, so the result is indeed the point at infinity.
1428  *   - Otherwise, P1 == P2, so a "double" operation should have been
1429  *     performed.
1430  */
1431 static uint32_t
1432 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
1433 {
1434         /*
1435          * Addtions formulas are:
1436          *
1437          *   u1 = x1 * z2^2
1438          *   u2 = x2 * z1^2
1439          *   s1 = y1 * z2^3
1440          *   s2 = y2 * z1^3
1441          *   h = u2 - u1
1442          *   r = s2 - s1
1443          *   x3 = r^2 - h^3 - 2 * u1 * h^2
1444          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1445          *   z3 = h * z1 * z2
1446          */
1447         uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1448         uint32_t ret;
1449         int i;
1450
1451         /*
1452          * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
1453          */
1454         square_f256(t3, P2->z);
1455         mul_f256(t1, P1->x, t3);
1456         mul_f256(t4, P2->z, t3);
1457         mul_f256(t3, P1->y, t4);
1458
1459         /*
1460          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1461          */
1462         square_f256(t4, P1->z);
1463         mul_f256(t2, P2->x, t4);
1464         mul_f256(t5, P1->z, t4);
1465         mul_f256(t4, P2->y, t5);
1466
1467         /*
1468          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1469          * We need to test whether r is zero, so we will do some extra
1470          * reduce.
1471          */
1472         for (i = 0; i < 20; i ++) {
1473                 t2[i] += (F256[i] << 1) - t1[i];
1474                 t4[i] += (F256[i] << 1) - t3[i];
1475         }
1476         norm13(t2, t2, 20);
1477         norm13(t4, t4, 20);
1478         reduce_f256(t4);
1479         reduce_final_f256(t4);
1480         ret = 0;
1481         for (i = 0; i < 20; i ++) {
1482                 ret |= t4[i];
1483         }
1484         ret = (ret | -ret) >> 31;
1485
1486         /*
1487          * Compute u1*h^2 (in t6) and h^3 (in t5);
1488          */
1489         square_f256(t7, t2);
1490         mul_f256(t6, t1, t7);
1491         mul_f256(t5, t7, t2);
1492
1493         /*
1494          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1495          */
1496         square_f256(P1->x, t4);
1497         for (i = 0; i < 20; i ++) {
1498                 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1499         }
1500         norm13(P1->x, P1->x, 20);
1501         reduce_f256(P1->x);
1502
1503         /*
1504          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1505          */
1506         for (i = 0; i < 20; i ++) {
1507                 t6[i] += (F256[i] << 1) - P1->x[i];
1508         }
1509         norm13(t6, t6, 20);
1510         mul_f256(P1->y, t4, t6);
1511         mul_f256(t1, t5, t3);
1512         for (i = 0; i < 20; i ++) {
1513                 P1->y[i] += (F256[i] << 1) - t1[i];
1514         }
1515         norm13(P1->y, P1->y, 20);
1516         reduce_f256(P1->y);
1517
1518         /*
1519          * Compute z3 = h*z1*z2.
1520          */
1521         mul_f256(t1, P1->z, P2->z);
1522         mul_f256(P1->z, t1, t2);
1523
1524         return ret;
1525 }
1526
1527 /*
1528  * Add point P2 to point P1. This is a specialised function for the
1529  * case when P2 is a non-zero point in affine coordinate.
1530  *
1531  * This function computes the wrong result in the following cases:
1532  *
1533  *   - If P1 == 0
1534  *   - If P1 == P2
1535  *
1536  * In both cases, P1 is set to the point at infinity.
1537  *
1538  * Returned value is 0 if one of the following occurs:
1539  *
1540  *   - P1 and P2 have the same Y coordinate
1541  *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
1542  *
1543  * The second case cannot actually happen with valid points, since a point
1544  * with Y == 0 is a point of order 2, and there is no point of order 2 on
1545  * curve P-256.
1546  *
1547  * Therefore, assuming that P1 != 0 on input, then the caller
1548  * can apply the following:
1549  *
1550  *   - If the result is not the point at infinity, then it is correct.
1551  *   - Otherwise, if the returned value is 1, then this is a case of
1552  *     P1+P2 == 0, so the result is indeed the point at infinity.
1553  *   - Otherwise, P1 == P2, so a "double" operation should have been
1554  *     performed.
1555  */
1556 static uint32_t
1557 p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
1558 {
1559         /*
1560          * Addtions formulas are:
1561          *
1562          *   u1 = x1
1563          *   u2 = x2 * z1^2
1564          *   s1 = y1
1565          *   s2 = y2 * z1^3
1566          *   h = u2 - u1
1567          *   r = s2 - s1
1568          *   x3 = r^2 - h^3 - 2 * u1 * h^2
1569          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1570          *   z3 = h * z1
1571          */
1572         uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1573         uint32_t ret;
1574         int i;
1575
1576         /*
1577          * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
1578          */
1579         memcpy(t1, P1->x, sizeof t1);
1580         memcpy(t3, P1->y, sizeof t3);
1581
1582         /*
1583          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1584          */
1585         square_f256(t4, P1->z);
1586         mul_f256(t2, P2->x, t4);
1587         mul_f256(t5, P1->z, t4);
1588         mul_f256(t4, P2->y, t5);
1589
1590         /*
1591          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1592          * We need to test whether r is zero, so we will do some extra
1593          * reduce.
1594          */
1595         for (i = 0; i < 20; i ++) {
1596                 t2[i] += (F256[i] << 1) - t1[i];
1597                 t4[i] += (F256[i] << 1) - t3[i];
1598         }
1599         norm13(t2, t2, 20);
1600         norm13(t4, t4, 20);
1601         reduce_f256(t4);
1602         reduce_final_f256(t4);
1603         ret = 0;
1604         for (i = 0; i < 20; i ++) {
1605                 ret |= t4[i];
1606         }
1607         ret = (ret | -ret) >> 31;
1608
1609         /*
1610          * Compute u1*h^2 (in t6) and h^3 (in t5);
1611          */
1612         square_f256(t7, t2);
1613         mul_f256(t6, t1, t7);
1614         mul_f256(t5, t7, t2);
1615
1616         /*
1617          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1618          */
1619         square_f256(P1->x, t4);
1620         for (i = 0; i < 20; i ++) {
1621                 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1622         }
1623         norm13(P1->x, P1->x, 20);
1624         reduce_f256(P1->x);
1625
1626         /*
1627          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1628          */
1629         for (i = 0; i < 20; i ++) {
1630                 t6[i] += (F256[i] << 1) - P1->x[i];
1631         }
1632         norm13(t6, t6, 20);
1633         mul_f256(P1->y, t4, t6);
1634         mul_f256(t1, t5, t3);
1635         for (i = 0; i < 20; i ++) {
1636                 P1->y[i] += (F256[i] << 1) - t1[i];
1637         }
1638         norm13(P1->y, P1->y, 20);
1639         reduce_f256(P1->y);
1640
1641         /*
1642          * Compute z3 = h*z1*z2.
1643          */
1644         mul_f256(P1->z, P1->z, t2);
1645
1646         return ret;
1647 }
1648
1649 /*
1650  * Decode a P-256 point. This function does not support the point at
1651  * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1652  */
1653 static uint32_t
1654 p256_decode(p256_jacobian *P, const void *src, size_t len)
1655 {
1656         const unsigned char *buf;
1657         uint32_t tx[20], ty[20], t1[20], t2[20];
1658         uint32_t bad;
1659         int i;
1660
1661         if (len != 65) {
1662                 return 0;
1663         }
1664         buf = src;
1665
1666         /*
1667          * First byte must be 0x04 (uncompressed format). We could support
1668          * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1669          * least significant bit of the Y coordinate), but it is explicitly
1670          * forbidden by RFC 5480 (section 2.2).
1671          */
1672         bad = NEQ(buf[0], 0x04);
1673
1674         /*
1675          * Decode the coordinates, and check that they are both lower
1676          * than the modulus.
1677          */
1678         tx[19] = be8_to_le13(tx, buf + 1, 32);
1679         ty[19] = be8_to_le13(ty, buf + 33, 32);
1680         bad |= reduce_final_f256(tx);
1681         bad |= reduce_final_f256(ty);
1682
1683         /*
1684          * Check curve equation.
1685          */
1686         square_f256(t1, tx);
1687         mul_f256(t1, tx, t1);
1688         square_f256(t2, ty);
1689         for (i = 0; i < 20; i ++) {
1690                 t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
1691         }
1692         norm13(t1, t1, 20);
1693         reduce_f256(t1);
1694         reduce_final_f256(t1);
1695         for (i = 0; i < 20; i ++) {
1696                 bad |= t1[i];
1697         }
1698
1699         /*
1700          * Copy coordinates to the point structure.
1701          */
1702         memcpy(P->x, tx, sizeof tx);
1703         memcpy(P->y, ty, sizeof ty);
1704         memset(P->z, 0, sizeof P->z);
1705         P->z[0] = 1;
1706         return NEQ(bad, 0) ^ 1;
1707 }
1708
1709 /*
1710  * Encode a point into a buffer. This function assumes that the point is
1711  * valid, in affine coordinates, and not the point at infinity.
1712  */
1713 static void
1714 p256_encode(void *dst, const p256_jacobian *P)
1715 {
1716         unsigned char *buf;
1717
1718         buf = dst;
1719         buf[0] = 0x04;
1720         le13_to_be8(buf + 1, 32, P->x);
1721         le13_to_be8(buf + 33, 32, P->y);
1722 }
1723
1724 /*
1725  * Multiply a curve point by an integer. The integer is assumed to be
1726  * lower than the curve order, and the base point must not be the point
1727  * at infinity.
1728  */
1729 static void
1730 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1731 {
1732         /*
1733          * qz is a flag that is initially 1, and remains equal to 1
1734          * as long as the point is the point at infinity.
1735          *
1736          * We use a 2-bit window to handle multiplier bits by pairs.
1737          * The precomputed window really is the points P2 and P3.
1738          */
1739         uint32_t qz;
1740         p256_jacobian P2, P3, Q, T, U;
1741
1742         /*
1743          * Compute window values.
1744          */
1745         P2 = *P;
1746         p256_double(&P2);
1747         P3 = *P;
1748         p256_add(&P3, &P2);
1749
1750         /*
1751          * We start with Q = 0. We process multiplier bits 2 by 2.
1752          */
1753         memset(&Q, 0, sizeof Q);
1754         qz = 1;
1755         while (xlen -- > 0) {
1756                 int k;
1757
1758                 for (k = 6; k >= 0; k -= 2) {
1759                         uint32_t bits;
1760                         uint32_t bnz;
1761
1762                         p256_double(&Q);
1763                         p256_double(&Q);
1764                         T = *P;
1765                         U = Q;
1766                         bits = (*x >> k) & (uint32_t)3;
1767                         bnz = NEQ(bits, 0);
1768                         CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1769                         CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1770                         p256_add(&U, &T);
1771                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
1772                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1773                         qz &= ~bnz;
1774                 }
1775                 x ++;
1776         }
1777         *P = Q;
1778 }
1779
1780 /*
1781  * Precomputed window: k*G points, where G is the curve generator, and k
1782  * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1783  * the point are encoded as 20 words of 13 bits each (little-endian
1784  * order); 13-bit words are then grouped 2-by-2 into 32-bit words
1785  * (little-endian order within each word).
1786  */
1787 static const uint32_t Gwin[15][20] = {
1788
1789         { 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC,
1790           0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2,
1791           0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D,
1792           0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785,
1793           0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 },
1794
1795         { 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8,
1796           0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29,
1797           0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748,
1798           0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36,
1799           0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 },
1800
1801         { 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6,
1802           0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323,
1803           0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8,
1804           0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB,
1805           0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 },
1806
1807         { 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156,
1808           0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80,
1809           0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6,
1810           0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D,
1811           0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 },
1812
1813         { 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F,
1814           0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E,
1815           0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F,
1816           0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49,
1817           0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F },
1818
1819         { 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8,
1820           0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B,
1821           0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3,
1822           0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE,
1823           0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 },
1824
1825         { 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F,
1826           0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896,
1827           0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0,
1828           0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400,
1829           0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 },
1830
1831         { 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A,
1832           0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01,
1833           0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83,
1834           0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8,
1835           0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 },
1836
1837         { 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7,
1838           0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E,
1839           0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293,
1840           0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832,
1841           0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 },
1842
1843         { 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120,
1844           0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964,
1845           0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91,
1846           0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304,
1847           0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 },
1848
1849         { 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541,
1850           0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418,
1851           0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A,
1852           0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2,
1853           0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 },
1854
1855         { 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0,
1856           0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918,
1857           0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3,
1858           0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55,
1859           0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D },
1860
1861         { 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB,
1862           0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986,
1863           0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB,
1864           0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C,
1865           0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 },
1866
1867         { 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A,
1868           0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9,
1869           0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8,
1870           0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0,
1871           0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C },
1872
1873         { 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8,
1874           0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7,
1875           0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783,
1876           0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9,
1877           0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F }
1878 };
1879
1880 /*
1881  * Lookup one of the Gwin[] values, by index. This is constant-time.
1882  */
1883 static void
1884 lookup_Gwin(p256_jacobian *T, uint32_t idx)
1885 {
1886         uint32_t xy[20];
1887         uint32_t k;
1888         size_t u;
1889
1890         memset(xy, 0, sizeof xy);
1891         for (k = 0; k < 15; k ++) {
1892                 uint32_t m;
1893
1894                 m = -EQ(idx, k + 1);
1895                 for (u = 0; u < 20; u ++) {
1896                         xy[u] |= m & Gwin[k][u];
1897                 }
1898         }
1899         for (u = 0; u < 10; u ++) {
1900                 T->x[(u << 1) + 0] = xy[u] & 0xFFFF;
1901                 T->x[(u << 1) + 1] = xy[u] >> 16;
1902                 T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF;
1903                 T->y[(u << 1) + 1] = xy[u + 10] >> 16;
1904         }
1905         memset(T->z, 0, sizeof T->z);
1906         T->z[0] = 1;
1907 }
1908
1909 /*
1910  * Multiply the generator by an integer. The integer is assumed non-zero
1911  * and lower than the curve order.
1912  */
1913 static void
1914 p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1915 {
1916         /*
1917          * qz is a flag that is initially 1, and remains equal to 1
1918          * as long as the point is the point at infinity.
1919          *
1920          * We use a 4-bit window to handle multiplier bits by groups
1921          * of 4. The precomputed window is constant static data, with
1922          * points in affine coordinates; we use a constant-time lookup.
1923          */
1924         p256_jacobian Q;
1925         uint32_t qz;
1926
1927         memset(&Q, 0, sizeof Q);
1928         qz = 1;
1929         while (xlen -- > 0) {
1930                 int k;
1931                 unsigned bx;
1932
1933                 bx = *x ++;
1934                 for (k = 0; k < 2; k ++) {
1935                         uint32_t bits;
1936                         uint32_t bnz;
1937                         p256_jacobian T, U;
1938
1939                         p256_double(&Q);
1940                         p256_double(&Q);
1941                         p256_double(&Q);
1942                         p256_double(&Q);
1943                         bits = (bx >> 4) & 0x0F;
1944                         bnz = NEQ(bits, 0);
1945                         lookup_Gwin(&T, bits);
1946                         U = Q;
1947                         p256_add_mixed(&U, &T);
1948                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
1949                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1950                         qz &= ~bnz;
1951                         bx <<= 4;
1952                 }
1953         }
1954         *P = Q;
1955 }
1956
1957 static const unsigned char P256_G[] = {
1958         0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1959         0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1960         0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1961         0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1962         0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1963         0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1964         0x68, 0x37, 0xBF, 0x51, 0xF5
1965 };
1966
1967 static const unsigned char P256_N[] = {
1968         0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1969         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1970         0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1971         0x25, 0x51
1972 };
1973
1974 static const unsigned char *
1975 api_generator(int curve, size_t *len)
1976 {
1977         (void)curve;
1978         *len = sizeof P256_G;
1979         return P256_G;
1980 }
1981
1982 static const unsigned char *
1983 api_order(int curve, size_t *len)
1984 {
1985         (void)curve;
1986         *len = sizeof P256_N;
1987         return P256_N;
1988 }
1989
1990 static uint32_t
1991 api_mul(unsigned char *G, size_t Glen,
1992         const unsigned char *x, size_t xlen, int curve)
1993 {
1994         uint32_t r;
1995         p256_jacobian P;
1996
1997         (void)curve;
1998         r = p256_decode(&P, G, Glen);
1999         p256_mul(&P, x, xlen);
2000         if (Glen >= 65) {
2001                 p256_to_affine(&P);
2002                 p256_encode(G, &P);
2003         }
2004         return r;
2005 }
2006
2007 static size_t
2008 api_mulgen(unsigned char *R,
2009         const unsigned char *x, size_t xlen, int curve)
2010 {
2011         p256_jacobian P;
2012
2013         (void)curve;
2014         p256_mulgen(&P, x, xlen);
2015         p256_to_affine(&P);
2016         p256_encode(R, &P);
2017         return 65;
2018
2019         /*
2020         const unsigned char *G;
2021         size_t Glen;
2022
2023         G = api_generator(curve, &Glen);
2024         memcpy(R, G, Glen);
2025         api_mul(R, Glen, x, xlen, curve);
2026         return Glen;
2027         */
2028 }
2029
2030 static uint32_t
2031 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
2032         const unsigned char *x, size_t xlen,
2033         const unsigned char *y, size_t ylen, int curve)
2034 {
2035         p256_jacobian P, Q;
2036         uint32_t r, t, z;
2037         int i;
2038
2039         (void)curve;
2040         r = p256_decode(&P, A, len);
2041         p256_mul(&P, x, xlen);
2042         if (B == NULL) {
2043                 p256_mulgen(&Q, y, ylen);
2044         } else {
2045                 r &= p256_decode(&Q, B, len);
2046                 p256_mul(&Q, y, ylen);
2047         }
2048
2049         /*
2050          * The final addition may fail in case both points are equal.
2051          */
2052         t = p256_add(&P, &Q);
2053         reduce_final_f256(P.z);
2054         z = 0;
2055         for (i = 0; i < 20; i ++) {
2056                 z |= P.z[i];
2057         }
2058         z = EQ(z, 0);
2059         p256_double(&Q);
2060
2061         /*
2062          * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
2063          * have the following:
2064          *
2065          *   z = 0, t = 0   return P (normal addition)
2066          *   z = 0, t = 1   return P (normal addition)
2067          *   z = 1, t = 0   return Q (a 'double' case)
2068          *   z = 1, t = 1   report an error (P+Q = 0)
2069          */
2070         CCOPY(z & ~t, &P, &Q, sizeof Q);
2071         p256_to_affine(&P);
2072         p256_encode(A, &P);
2073         r &= ~(z & t);
2074         return r;
2075 }
2076
2077 /* see bearssl_ec.h */
2078 const br_ec_impl br_ec_p256_m15 = {
2079         (uint32_t)0x00800000,
2080         &api_generator,
2081         &api_order,
2082         &api_mul,
2083         &api_mulgen,
2084         &api_muladd
2085 };