_ Git - BearSSL/blob - src/int/i15_core.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * This file contains the core "big integer" functions for the i15
  29  * implementation, that represents integers as sequences of 15-bit
  30  * words.
  31  */
  32
  33 /* see inner.h */
  34 uint32_t
  35 br_i15_iszero(const uint16_t *x)
  36 {
  37         uint32_t z;
  38         size_t u;
  39
  40         z = 0;
  41         for (u = (x[0] + 15) >> 4; u > 0; u --) {
  42                 z |= x[u];
  43         }
  44         return ~(z | -z) >> 31;
  45 }
  46
  47 /* see inner.h */
  48 uint16_t
  49 br_i15_ninv15(uint16_t x)
  50 {
  51         uint32_t y;
  52
  53         y = 2 - x;
  54         y = MUL15(y, 2 - MUL15(x, y));
  55         y = MUL15(y, 2 - MUL15(x, y));
  56         y = MUL15(y, 2 - MUL15(x, y));
  57         return MUX(x & 1, -y, 0) & 0x7FFF;
  58 }
  59
  60 /* see inner.h */
  61 uint32_t
  62 br_i15_add(uint16_t *a, const uint16_t *b, uint32_t ctl)
  63 {
  64         uint32_t cc;
  65         size_t u, m;
  66
  67         cc = 0;
  68         m = (a[0] + 31) >> 4;
  69         for (u = 1; u < m; u ++) {
  70                 uint32_t aw, bw, naw;
  71
  72                 aw = a[u];
  73                 bw = b[u];
  74                 naw = aw + bw + cc;
  75                 cc = naw >> 15;
  76                 a[u] = MUX(ctl, naw & 0x7FFF, aw);
  77         }
  78         return cc;
  79 }
  80
  81 /* see inner.h */
  82 uint32_t
  83 br_i15_sub(uint16_t *a, const uint16_t *b, uint32_t ctl)
  84 {
  85         uint32_t cc;
  86         size_t u, m;
  87
  88         cc = 0;
  89         m = (a[0] + 31) >> 4;
  90         for (u = 1; u < m; u ++) {
  91                 uint32_t aw, bw, naw;
  92
  93                 aw = a[u];
  94                 bw = b[u];
  95                 naw = aw - bw - cc;
  96                 cc = naw >> 31;
  97                 a[u] = MUX(ctl, naw & 0x7FFF, aw);
  98         }
  99         return cc;
 100 }
 101
 102 /*
 103  * Constant-time division. The divisor must not be larger than 16 bits,
 104  * and the quotient must fit on 17 bits.
 105  */
 106 static uint32_t
 107 divrem16(uint32_t x, uint32_t d, uint32_t *r)
 108 {
 109         int i;
 110         uint32_t q;
 111
 112         q = 0;
 113         d <<= 16;
 114         for (i = 16; i >= 0; i --) {
 115                 uint32_t ctl;
 116
 117                 ctl = LE(d, x);
 118                 q |= ctl << i;
 119                 x -= (-ctl) & d;
 120                 d >>= 1;
 121         }
 122         if (r != NULL) {
 123                 *r = x;
 124         }
 125         return q;
 126 }
 127
 128 /* see inner.h */
 129 void
 130 br_i15_muladd_small(uint16_t *x, uint16_t z, const uint16_t *m)
 131 {
 132         /*
 133          * Constant-time: we accept to leak the exact bit length of the
 134          * modulus m.
 135          */
 136         unsigned m_bitlen, mblr;
 137         size_t u, mlen;
 138         uint32_t hi, a0, a, b, q;
 139         uint32_t cc, tb, over, under;
 140
 141         /*
 142          * Simple case: the modulus fits on one word.
 143          */
 144         m_bitlen = m[0];
 145         if (m_bitlen == 0) {
 146                 return;
 147         }
 148         if (m_bitlen <= 15) {
 149                 uint32_t rem;
 150
 151                 divrem16(((uint32_t)x[1] << 15) | z, m[1], &rem);
 152                 x[1] = rem;
 153                 return;
 154         }
 155         mlen = (m_bitlen + 15) >> 4;
 156         mblr = m_bitlen & 15;
 157
 158         /*
 159          * Principle: we estimate the quotient (x*2^15+z)/m by
 160          * doing a 30/15 division with the high words.
 161          *
 162          * Let:
 163          *   w = 2^15
 164          *   a = (w*a0 + a1) * w^N + a2
 165          *   b = b0 * w^N + b2
 166          * such that:
 167          *   0 <= a0 < w
 168          *   0 <= a1 < w
 169          *   0 <= a2 < w^N
 170          *   w/2 <= b0 < w
 171          *   0 <= b2 < w^N
 172          *   a < w*b
 173          * I.e. the two top words of a are a0:a1, the top word of b is
 174          * b0, we ensured that b0 is "full" (high bit set), and a is
 175          * such that the quotient q = a/b fits on one word (0 <= q < w).
 176          *
 177          * If a = b*q + r (with 0 <= r < q), then we can estimate q by
 178          * using a division on the top words:
 179          *   a0*w + a1 = b0*u + v (with 0 <= v < b0)
 180          * Then the following holds:
 181          *   0 <= u <= w
 182          *   u-2 <= q <= u
 183          */
 184         hi = x[mlen];
 185         if (mblr == 0) {
 186                 a0 = x[mlen];
 187                 memmove(x + 2, x + 1, (mlen - 1) * sizeof *x);
 188                 x[1] = z;
 189                 a = (a0 << 15) + x[mlen];
 190                 b = m[mlen];
 191         } else {
 192                 a0 = (x[mlen] << (15 - mblr)) | (x[mlen - 1] >> mblr);
 193                 memmove(x + 2, x + 1, (mlen - 1) * sizeof *x);
 194                 x[1] = z;
 195                 a = (a0 << 15) | (((x[mlen] << (15 - mblr))
 196                         | (x[mlen - 1] >> mblr)) & 0x7FFF);
 197                 b = (m[mlen] << (15 - mblr)) | (m[mlen - 1] >> mblr);
 198         }
 199         q = divrem16(a, b, NULL);
 200
 201         /*
 202          * We computed an estimate for q, but the real one may be q,
 203          * q-1 or q-2; moreover, the division may have returned a value
 204          * 8000 or even 8001 if the two high words were identical, and
 205          * we want to avoid values beyond 7FFF. We thus adjust q so
 206          * that the "true" multiplier will be q+1, q or q-1, and q is
 207          * in the 0000..7FFF range.
 208          */
 209         q = MUX(EQ(b, a0), 0x7FFF, q - 1 + ((q - 1) >> 31));
 210
 211         /*
 212          * We subtract q*m from x (x has an extra high word of value 'hi').
 213          * Since q may be off by 1 (in either direction), we may have to
 214          * add or subtract m afterwards.
 215          *
 216          * The 'tb' flag will be true (1) at the end of the loop if the
 217          * result is greater than or equal to the modulus (not counting
 218          * 'hi' or the carry).
 219          */
 220         cc = 0;
 221         tb = 1;
 222         for (u = 1; u <= mlen; u ++) {
 223                 uint32_t mw, zl, xw, nxw;
 224
 225                 mw = m[u];
 226                 zl = MUL15(mw, q) + cc;
 227                 cc = zl >> 15;
 228                 zl &= 0x7FFF;
 229                 xw = x[u];
 230                 nxw = xw - zl;
 231                 cc += nxw >> 31;
 232                 nxw &= 0x7FFF;
 233                 x[u] = nxw;
 234                 tb = MUX(EQ(nxw, mw), tb, GT(nxw, mw));
 235         }
 236
 237         /*
 238          * If we underestimated q, then either cc < hi (one extra bit
 239          * beyond the top array word), or cc == hi and tb is true (no
 240          * extra bit, but the result is not lower than the modulus).
 241          *
 242          * If we overestimated q, then cc > hi.
 243          */
 244         over = GT(cc, hi);
 245         under = ~over & (tb | LT(cc, hi));
 246         br_i15_add(x, m, over);
 247         br_i15_sub(x, m, under);
 248 }
 249
 250 /* see inner.h */
 251 void
 252 br_i15_montymul(uint16_t *d, const uint16_t *x, const uint16_t *y,
 253         const uint16_t *m, uint16_t m0i)
 254 {
 255         size_t len, len4, u, v;
 256         uint32_t dh;
 257
 258         len = (m[0] + 15) >> 4;
 259         len4 = len & ~(size_t)3;
 260         br_i15_zero(d, m[0]);
 261         dh = 0;
 262         for (u = 0; u < len; u ++) {
 263                 uint32_t f, xu, r, zh;
 264
 265                 xu = x[u + 1];
 266                 f = MUL15(d[1] + MUL15(x[u + 1], y[1]), m0i) & 0x7FFF;
 267
 268                 r = 0;
 269                 for (v = 0; v < len4; v += 4) {
 270                         uint32_t z;
 271
 272                         z = d[v + 1] + MUL15(xu, y[v + 1])
 273                                 + MUL15(f, m[v + 1]) + r;
 274                         r = z >> 15;
 275                         d[v + 0] = z & 0x7FFF;
 276                         z = d[v + 2] + MUL15(xu, y[v + 2])
 277                                 + MUL15(f, m[v + 2]) + r;
 278                         r = z >> 15;
 279                         d[v + 1] = z & 0x7FFF;
 280                         z = d[v + 3] + MUL15(xu, y[v + 3])
 281                                 + MUL15(f, m[v + 3]) + r;
 282                         r = z >> 15;
 283                         d[v + 2] = z & 0x7FFF;
 284                         z = d[v + 4] + MUL15(xu, y[v + 4])
 285                                 + MUL15(f, m[v + 4]) + r;
 286                         r = z >> 15;
 287                         d[v + 3] = z & 0x7FFF;
 288                 }
 289                 for (; v < len; v ++) {
 290                         uint32_t z;
 291
 292                         z = d[v + 1] + MUL15(xu, y[v + 1])
 293                                 + MUL15(f, m[v + 1]) + r;
 294                         r = z >> 15;
 295                         d[v + 0] = z & 0x7FFF;
 296                 }
 297
 298                 zh = dh + r;
 299                 d[len] = zh & 0x7FFF;
 300                 dh = zh >> 31;
 301         }
 302
 303         /*
 304          * Restore the bit length (it was overwritten in the loop above).
 305          */
 306         d[0] = m[0];
 307
 308         /*
 309          * d[] may be greater than m[], but it is still lower than twice
 310          * the modulus.
 311          */
 312         br_i15_sub(d, m, NEQ(dh, 0) | NOT(br_i15_sub(d, m, 0)));
 313 }
 314
 315 /* see inner.h */
 316 void
 317 br_i15_to_monty(uint16_t *x, const uint16_t *m)
 318 {
 319         unsigned k;
 320
 321         for (k = (m[0] + 15) >> 4; k > 0; k --) {
 322                 br_i15_muladd_small(x, 0, m);
 323         }
 324 }
 325
 326 /* see inner.h */
 327 void
 328 br_i15_modpow(uint16_t *x,
 329         const unsigned char *e, size_t elen,
 330         const uint16_t *m, uint16_t m0i, uint16_t *t1, uint16_t *t2)
 331 {
 332         size_t mlen;
 333         unsigned k;
 334
 335         mlen = ((m[0] + 31) >> 4) * sizeof m[0];
 336         memcpy(t1, x, mlen);
 337         br_i15_to_monty(t1, m);
 338         br_i15_zero(x, m[0]);
 339         x[1] = 1;
 340         for (k = 0; k < ((unsigned)elen << 3); k ++) {
 341                 uint32_t ctl;
 342
 343                 ctl = (e[elen - 1 - (k >> 3)] >> (k & 7)) & 1;
 344                 br_i15_montymul(t2, x, t1, m, m0i);
 345                 CCOPY(ctl, x, t2, mlen);
 346                 br_i15_montymul(t2, t1, t1, m, m0i);
 347                 memcpy(t1, t2, mlen);
 348         }
 349 }
 350
 351 /* see inner.h */
 352 void
 353 br_i15_encode(void *dst, size_t len, const uint16_t *x)
 354 {
 355         unsigned char *buf;
 356         size_t u, xlen;
 357         uint32_t acc;
 358         int acc_len;
 359
 360         xlen = (x[0] + 15) >> 4;
 361         if (xlen == 0) {
 362                 memset(dst, 0, len);
 363                 return;
 364         }
 365         u = 1;
 366         acc = 0;
 367         acc_len = 0;
 368         buf = dst;
 369         while (len -- > 0) {
 370                 if (acc_len < 8) {
 371                         if (u <= xlen) {
 372                                 acc += (uint32_t)x[u ++] << acc_len;
 373                         }
 374                         acc_len += 15;
 375                 }
 376                 buf[len] = (unsigned char)acc;
 377                 acc >>= 8;
 378                 acc_len -= 8;
 379         }
 380 }
 381
 382 /* see inner.h */
 383 uint32_t
 384 br_i15_decode_mod(uint16_t *x, const void *src, size_t len, const uint16_t *m)
 385 {
 386         /*
 387          * Two-pass algorithm: in the first pass, we determine whether the
 388          * value fits; in the second pass, we do the actual write.
 389          *
 390          * During the first pass, 'r' contains the comparison result so
 391          * far:
 392          *  0x00000000   value is equal to the modulus
 393          *  0x00000001   value is greater than the modulus
 394          *  0xFFFFFFFF   value is lower than the modulus
 395          *
 396          * Since we iterate starting with the least significant bytes (at
 397          * the end of src[]), each new comparison overrides the previous
 398          * except when the comparison yields 0 (equal).
 399          *
 400          * During the second pass, 'r' is either 0xFFFFFFFF (value fits)
 401          * or 0x00000000 (value does not fit).
 402          *
 403          * We must iterate over all bytes of the source, _and_ possibly
 404          * some extra virutal bytes (with value 0) so as to cover the
 405          * complete modulus as well. We also add 4 such extra bytes beyond
 406          * the modulus length because it then guarantees that no accumulated
 407          * partial word remains to be processed.
 408          */
 409         const unsigned char *buf;
 410         size_t mlen, tlen;
 411         int pass;
 412         uint32_t r;
 413
 414         buf = src;
 415         mlen = (m[0] + 15) >> 4;
 416         tlen = (mlen << 1);
 417         if (tlen < len) {
 418                 tlen = len;
 419         }
 420         tlen += 4;
 421         r = 0;
 422         for (pass = 0; pass < 2; pass ++) {
 423                 size_t u, v;
 424                 uint32_t acc;
 425                 int acc_len;
 426
 427                 v = 1;
 428                 acc = 0;
 429                 acc_len = 0;
 430                 for (u = 0; u < tlen; u ++) {
 431                         uint32_t b;
 432
 433                         if (u < len) {
 434                                 b = buf[len - 1 - u];
 435                         } else {
 436                                 b = 0;
 437                         }
 438                         acc |= (b << acc_len);
 439                         acc_len += 8;
 440                         if (acc_len >= 15) {
 441                                 uint32_t xw;
 442
 443                                 xw = acc & (uint32_t)0x7FFF;
 444                                 acc_len -= 15;
 445                                 acc = b >> (8 - acc_len);
 446                                 if (v <= mlen) {
 447                                         if (pass) {
 448                                                 x[v] = r & xw;
 449                                         } else {
 450                                                 uint32_t cc;
 451
 452                                                 cc = (uint32_t)CMP(xw, m[v]);
 453                                                 r = MUX(EQ(cc, 0), r, cc);
 454                                         }
 455                                 } else {
 456                                         if (!pass) {
 457                                                 r = MUX(EQ(xw, 0), r, 1);
 458                                         }
 459                                 }
 460                                 v ++;
 461                         }
 462                 }
 463
 464                 /*
 465                  * When we reach this point at the end of the first pass:
 466                  * r is either 0, 1 or -1; we want to set r to 0 if it
 467                  * is equal to 0 or 1, and leave it to -1 otherwise.
 468                  *
 469                  * When we reach this point at the end of the second pass:
 470                  * r is either 0 or -1; we want to leave that value
 471                  * untouched. This is a subcase of the previous.
 472                  */
 473                 r >>= 1;
 474                 r |= (r << 1);
 475         }
 476
 477         x[0] = m[0];
 478         return r & (uint32_t)1;
 479 }