2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 #if BR_INT128 || BR_UMUL128
32 * Compute x*y+v1+v2. Operands are 64-bit, and result is 128-bit, with
33 * high word in "hi" and low word in "lo".
35 #define FMA1(hi, lo, x, y, v1, v2) do { \
36 unsigned __int128 fmaz; \
37 fmaz = (unsigned __int128)(x) * (unsigned __int128)(y) \
38 + (unsigned __int128)(v1) + (unsigned __int128)(v2); \
39 (hi) = (uint64_t)(fmaz >> 64); \
40 (lo) = (uint64_t)fmaz; \
/*
 * Compute x1*y1+x2*y2+v1+v2. Operands are 64-bit, and result is 128-bit,
 * with high word in "hi" and low word in "lo".
 *
 * Callers should ensure that the two inner products, and the v1 and v2
 * operands, are multiple of 4 (this is not used by this specific definition
 * but may help other implementations).
 */
#define FMA2(hi, lo, x1, y1, x2, y2, v1, v2)   do { \
		unsigned __int128 fmaz; \
		fmaz = (unsigned __int128)(x1) * (unsigned __int128)(y1) \
			+ (unsigned __int128)(x2) * (unsigned __int128)(y2) \
			+ (unsigned __int128)(v1) + (unsigned __int128)(v2); \
		(hi) = (uint64_t)(fmaz >> 64); \
		(lo) = (uint64_t)fmaz; \
	} while (0)
/*
 * 64x64->128 multiply-and-add built from the MSVC intrinsics _umul128()
 * and _addcarry_u64(): the full product goes into fmahi:fmalo, then v1
 * and v2 are each added with carry propagation into the high word.
 *
 * NOTE(review): in the complete file this definition is selected by an
 * "#elif BR_UMUL128" (with <intrin.h> included); those directives were
 * lost in this extract — restore them when merging.
 */
#define FMA1(hi, lo, x, y, v1, v2)   do { \
		uint64_t fmahi, fmalo; \
		unsigned char fmacc; \
		fmalo = _umul128((x), (y), &fmahi); \
		fmacc = _addcarry_u64(0, fmalo, (v1), &fmalo); \
		_addcarry_u64(fmacc, fmahi, 0, &fmahi); \
		fmacc = _addcarry_u64(0, fmalo, (v2), &(lo)); \
		_addcarry_u64(fmacc, fmahi, 0, &(hi)); \
	} while (0)
/*
 * Normally we should use _addcarry_u64() for FMA2 too, but it makes
 * Visual Studio crash. Instead we use this version, which leverages
 * the fact that the vx operands, and the products, are multiple of 4.
 * This is unfortunately slower.
 *
 * Since all four addends have their low two bits clear, we can add the
 * values shifted right by 2 without losing information; the sum fits in
 * 64 bits, its low 62 bits (shifted back) are the low output word, and
 * its top 2 bits are the carry into the high word.
 *
 * NOTE(review): the declaration of fmatt and the "(lo) = fmatt << 2;"
 * statement were missing from this extract (dropped lines 83 and 88);
 * they are restored here — fmatt was used undeclared and the 'lo'
 * output was otherwise never written.
 */
#define FMA2(hi, lo, x1, y1, x2, y2, v1, v2)   do { \
		uint64_t fma1hi, fma1lo; \
		uint64_t fma2hi, fma2lo; \
		uint64_t fmatt; \
		fma1lo = _umul128((x1), (y1), &fma1hi); \
		fma2lo = _umul128((x2), (y2), &fma2hi); \
		fmatt = (fma1lo >> 2) + (fma2lo >> 2) \
			+ ((v1) >> 2) + ((v2) >> 2); \
		(lo) = fmatt << 2; \
		(hi) = fma1hi + fma2hi + (fmatt >> 62); \
	} while (0)
/*
 * The FMA2 macro definition we would prefer to use, but it triggers
 * an internal compiler error in Visual Studio 2015.
 *
 * NOTE(review): the comment delimiters around this reference definition
 * were lost in this extract, which turned it into an active duplicate
 * definition of FMA2; it is kept here inside a comment, as documentation
 * only.
 *
#define FMA2(hi, lo, x1, y1, x2, y2, v1, v2)   do { \
		uint64_t fma1hi, fma1lo; \
		uint64_t fma2hi, fma2lo; \
		unsigned char fmacc; \
		fma1lo = _umul128((x1), (y1), &fma1hi); \
		fma2lo = _umul128((x2), (y2), &fma2hi); \
		fmacc = _addcarry_u64(0, fma1lo, (v1), &fma1lo); \
		_addcarry_u64(fmacc, fma1hi, 0, &fma1hi); \
		fmacc = _addcarry_u64(0, fma2lo, (v2), &fma2lo); \
		_addcarry_u64(fmacc, fma2hi, 0, &fma2hi); \
		fmacc = _addcarry_u64(0, fma1lo, fma2lo, &(lo)); \
		_addcarry_u64(fmacc, fma1hi, fma2hi, &(hi)); \
	} while (0)
 */
/*
 * Values are held as arrays of 62-bit words. MASK62 keeps the low 62
 * bits of a 64-bit word; MUL62_lo(x, y) returns the low 62 bits of the
 * product x*y (the multiplication itself is done modulo 2^64, which is
 * well-defined for unsigned operands).
 */
#define MASK62           ((uint64_t)0x3FFFFFFFFFFFFFFF)
#define MUL62_lo(x, y)   (((uint64_t)(x) * (uint64_t)(y)) & MASK62)
117 * Subtract b from a, and return the final carry. If 'ctl32' is 0, then
118 * a[] is kept unmodified, but the final carry is still computed and
122 i62_sub(uint64_t *a
, const uint64_t *b
, size_t num
, uint32_t ctl32
)
129 mask
= (uint64_t)ctl32
| ((uint64_t)ctl32
<< 32);
130 for (u
= 0; u
< num
; u
++) {
138 a
[u
] = aw
^ (mask
& (dw
^ aw
));
/*
 * NOTE(review): this extract is damaged: each line carries a fused
 * original-line-number prefix, statements are split across physical
 * lines, and several original lines are missing entirely (function
 * return type and braces, local declarations of u, v, f, xu, r, hi,
 * lo, zh, dh, and the carry-recombination statements between the FMA2
 * steps). The code is left byte-for-byte as found; restore it from the
 * upstream file before compiling.
 *
 * From what remains: Montgomery multiplication over arrays of 62-bit
 * words; the inner loop over the modulus words is unrolled four ways
 * (the four FMA2 calls on v+0..v+3), the top word is reduced through
 * MASK62, and the result is brought below m by a conditional
 * subtraction via i62_sub() (subtract when either the top carry dh is
 * set or d >= m, i.e. the dry-run i62_sub(..., 0) reports no borrow).
 */
144 * Montgomery multiplication, over arrays of 62-bit values. The
145 * destination array (d) must be distinct from the other operands
146 * (x, y and m). All arrays are in little-endian format (least
147 * significant word comes first) over 'num' words.
150 montymul(uint64_t *d
, const uint64_t *x
, const uint64_t *y
,
151 const uint64_t *m
, size_t num
, uint64_t m0i
)
156 num4
= 1 + ((num
- 1) & ~(size_t)3);
157 memset(d
, 0, num
* sizeof *d
);
159 for (u
= 0; u
< num
; u
++) {
166 f
= MUL62_lo(d
[0] + MUL62_lo(x
[u
], y
[0]), m0i
) << 2;
168 FMA2(hi
, lo
, xu
, y
[0], f
, m
[0], d
[0] << 2, 0);
171 for (v
= 1; v
< num4
; v
+= 4) {
172 FMA2(hi
, lo
, xu
, y
[v
+ 0],
173 f
, m
[v
+ 0], d
[v
+ 0] << 2, r
<< 2);
176 FMA2(hi
, lo
, xu
, y
[v
+ 1],
177 f
, m
[v
+ 1], d
[v
+ 1] << 2, r
<< 2);
180 FMA2(hi
, lo
, xu
, y
[v
+ 2],
181 f
, m
[v
+ 2], d
[v
+ 2] << 2, r
<< 2);
184 FMA2(hi
, lo
, xu
, y
[v
+ 3],
185 f
, m
[v
+ 3], d
[v
+ 3] << 2, r
<< 2);
189 for (; v
< num
; v
++) {
190 FMA2(hi
, lo
, xu
, y
[v
], f
, m
[v
], d
[v
] << 2, r
<< 2);
196 d
[num
- 1] = zh
& MASK62
;
199 i62_sub(d
, m
, num
, (uint32_t)dh
| NOT(i62_sub(d
, m
, num
, 0)));
/*
 * NOTE(review): damaged extract — fused line-number prefixes, split
 * statements, and missing lines (function return type and braces, the
 * declarations of u, v, f, cc, hi, lo, the cc update and the
 * "x[v - 1] = ..." store in the inner loop). Left byte-for-byte as
 * found; restore from the upstream file before compiling.
 *
 * From what remains: one Montgomery reduction round per word (num
 * rounds of adding f*m and shifting), followed by a conditional
 * subtraction of m via i62_sub() so the result is fully reduced.
 */
203 * Conversion back from Montgomery representation.
206 frommonty(uint64_t *x
, const uint64_t *m
, size_t num
, uint64_t m0i
)
210 for (u
= 0; u
< num
; u
++) {
213 f
= MUL62_lo(x
[0], m0i
) << 2;
215 for (v
= 0; v
< num
; v
++) {
218 FMA1(hi
, lo
, f
, m
[v
], x
[v
] << 2, cc
);
224 x
[num
- 1] = cc
>> 2;
226 i62_sub(x
, m
, num
, NOT(i62_sub(x
, m
, num
, 0)));
/*
 * br_i62_modpow_opt(): modular exponentiation (x^e mod m) over 62-bit
 * limbs, using a fixed-window square-and-multiply loop (window of up to
 * 5 bits, per the comments below) with a constant-time window-table
 * lookup, and falling back to br_i31_modpow() when the temporary area
 * is too small or the modulus is shorter than four 31-bit words.
 *
 * NOTE(review): this extract is badly damaged — fused line-number
 * prefixes, statements split across lines, and many original lines
 * dropped (the return type, braces, "else" keywords, and declarations
 * and/or initialisations of v, acc, acc_len, k, bits, i, base, mask,
 * mask2, zw, txlen, t1, t2, x, m, dh among others; also the buffer
 * layout assignments and the trailing "#endif"). The code is left
 * byte-for-byte as found; restore it from the upstream file before
 * compiling.
 */
231 br_i62_modpow_opt(uint32_t *x31
, const unsigned char *e
, size_t elen
,
232 const uint32_t *m31
, uint32_t m0i31
, uint64_t *tmp
, size_t twlen
)
234 size_t u
, mw31num
, mw62num
;
235 uint64_t *x
, *m
, *t1
, *t2
;
238 int win_len
, acc_len
;
241 * Get modulus size, in words.
243 mw31num
= (m31
[0] + 31) >> 5;
244 mw62num
= (mw31num
+ 1) >> 1;
247 * In order to apply this function, we must have enough room to
248 * copy the operand and modulus into the temporary array, along
249 * with at least two temporaries. If there is not enough room,
250 * switch to br_i31_modpow(). We also use br_i31_modpow() if the
251 * modulus length is not at least four words (94 bits or more).
253 if (mw31num
< 4 || (mw62num
<< 2) > twlen
) {
255 * We assume here that we can split an aligned uint64_t
256 * into two properly aligned uint32_t. Since both types
257 * are supposed to have an exact width with no padding,
258 * then this property must hold.
266 br_i31_modpow(x31
, e
, elen
, m31
, m0i31
,
267 (uint32_t *)tmp
, (uint32_t *)tmp
+ txlen
);
272 * Convert x to Montgomery representation: this means that
273 * we replace x with x*2^z mod m, where z is the smallest multiple
274 * of the word size such that 2^z >= m. We want to reuse the 31-bit
275 * functions here (for constant-time operation), but we need z
276 * for a 62-bit word size.
278 for (u
= 0; u
< mw62num
; u
++) {
279 br_i31_muladd_small(x31
, 0, m31
);
280 br_i31_muladd_small(x31
, 0, m31
);
284 * Assemble operands into arrays of 62-bit words. Note that
285 * all the arrays of 62-bit words that we will handle here
286 * are without any leading size word.
288 * We also adjust tmp and twlen to account for the words used
289 * for these extra arrays.
293 tmp
+= (mw62num
<< 1);
294 twlen
-= (mw62num
<< 1);
295 for (u
= 0; u
< mw31num
; u
+= 2) {
299 if ((u
+ 1) == mw31num
) {
300 m
[v
] = (uint64_t)m31
[u
+ 1];
301 x
[v
] = (uint64_t)x31
[u
+ 1];
303 m
[v
] = (uint64_t)m31
[u
+ 1]
304 + ((uint64_t)m31
[u
+ 2] << 31);
305 x
[v
] = (uint64_t)x31
[u
+ 1]
306 + ((uint64_t)x31
[u
+ 2] << 31);
311 * Compute window size. We support windows up to 5 bits; for a
312 * window of size k bits, we need 2^k+1 temporaries (for k = 1,
313 * we use special code that uses only 2 temporaries).
315 for (win_len
= 5; win_len
> 1; win_len
--) {
316 if ((((uint32_t)1 << win_len
) + 1) * mw62num
<= twlen
) {
325 * Compute m0i, which is equal to -(1/m0) mod 2^62. We were
326 * provided with m0i31, which already fulfills this property
327 * modulo 2^31; the single expression below is then sufficient.
329 m0i
= (uint64_t)m0i31
;
330 m0i
= MUL62_lo(m0i
, (uint64_t)2 + MUL62_lo(m0i
, m
[0]));
333 * Compute window contents. If the window has size one bit only,
334 * then t2 is set to x; otherwise, t2[0] is left untouched, and
335 * t2[k] is set to x^k (for k >= 1).
338 memcpy(t2
, x
, mw62num
* sizeof *x
);
342 memcpy(t2
+ mw62num
, x
, mw62num
* sizeof *x
);
344 for (u
= 2; u
< ((unsigned)1 << win_len
); u
++) {
345 montymul(base
+ mw62num
, base
, x
, m
, mw62num
, m0i
);
351 * Set x to 1, in Montgomery representation. We again use the
354 br_i31_zero(x31
, m31
[0]);
355 x31
[(m31
[0] + 31) >> 5] = 1;
356 br_i31_muladd_small(x31
, 0, m31
);
358 br_i31_muladd_small(x31
, 0, m31
);
360 for (u
= 0; u
< mw31num
; u
+= 2) {
364 if ((u
+ 1) == mw31num
) {
365 x
[v
] = (uint64_t)x31
[u
+ 1];
367 x
[v
] = (uint64_t)x31
[u
+ 1]
368 + ((uint64_t)x31
[u
+ 2] << 31);
373 * We process bits from most to least significant. At each
374 * loop iteration, we have acc_len bits in acc.
/*
 * NOTE(review): main left-to-right fixed-window loop. The declarations
 * and initialisations of acc, acc_len, k, bits and i fall in lines
 * dropped by the extraction.
 */
378 while (acc_len
> 0 || elen
> 0) {
381 uint64_t mask1
, mask2
;
387 if (acc_len
< win_len
) {
389 acc
= (acc
<< 8) | *e
++;
396 bits
= (acc
>> (acc_len
- k
)) & (((uint32_t)1 << k
) - 1);
400 * We could get exactly k bits. Compute k squarings.
402 for (i
= 0; i
< k
; i
++) {
403 montymul(t1
, x
, x
, m
, mw62num
, m0i
);
404 memcpy(x
, t1
, mw62num
* sizeof *x
);
408 * Window lookup: we want to set t2 to the window
409 * lookup value, assuming the bits are non-zero. If
410 * the window length is 1 bit only, then t2 is
411 * already set; otherwise, we do a constant-time lookup.
416 memset(t2
, 0, mw62num
* sizeof *t2
);
418 for (u
= 1; u
< ((uint32_t)1 << k
); u
++) {
422 mask
= -(uint64_t)EQ(u
, bits
);
423 for (v
= 0; v
< mw62num
; v
++) {
424 t2
[v
] |= mask
& base
[v
];
431 * Multiply with the looked-up value. We keep the product
432 * only if the exponent bits are not all-zero.
434 montymul(t1
, x
, t2
, m
, mw62num
, m0i
);
435 mask1
= -(uint64_t)EQ(bits
, 0);
437 for (u
= 0; u
< mw62num
; u
++) {
438 x
[u
] = (mask1
& x
[u
]) | (mask2
& t1
[u
]);
443 * Convert back from Montgomery representation.
445 frommonty(x
, m
, mw62num
, m0i
);
448 * Convert result into 31-bit words.
450 for (u
= 0; u
< mw31num
; u
+= 2) {
454 x31
[u
+ 1] = (uint32_t)zw
& 0x7FFFFFFF;
455 if ((u
+ 1) < mw31num
) {
456 x31
[u
+ 2] = (uint32_t)(zw
>> 31);
/*
 * NOTE(review): fallback definition of br_i62_modpow_opt() — in the
 * complete file this sits under the "#else" branch of the
 * "#if BR_INT128 || BR_UMUL128" conditional (the directive was dropped
 * by the extraction). It delegates to br_i31_modpow_opt(), reusing the
 * 64-bit temporary area as an array of 32-bit words (hence twlen << 1).
 * The extract also dropped the return type, braces, the declaration of
 * mwlen and the statement(s) that use it (original lines 472-474);
 * left byte-for-byte as found — restore from the upstream file.
 */
466 br_i62_modpow_opt(uint32_t *x31
, const unsigned char *e
, size_t elen
,
467 const uint32_t *m31
, uint32_t m0i31
, uint64_t *tmp
, size_t twlen
)
471 mwlen
= (m31
[0] + 63) >> 5;
475 return br_i31_modpow_opt(x31
, e
, elen
, m31
, m0i31
,
476 (uint32_t *)tmp
, twlen
<< 1);